From 3fcf228b6494e707aef960bad33461a513f16721 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Jul 2025 13:34:04 +0800 Subject: [PATCH 01/43] f2fs: dump more information when checkpoint was blocked for long time generic/299 w/ mode=lfs will cause long time latency of checkpoint, let's dump more information once we hit case. CP merge: - Queued : 0 - Issued : 1 - Total : 1 - Cur time : 9765(ms) - Peak time : 9765(ms) F2FS-fs (vdc): blocked on checkpoint for 9765 ms CPU: 11 UID: 0 PID: 237 Comm: kworker/u128:29 Tainted: G O 6.16.0-rc3+ #409 PREEMPT(voluntary) Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Workqueue: writeback wb_workfn (flush-253:32) Call Trace: dump_stack_lvl+0x6e/0xa0 f2fs_issue_checkpoint+0x268/0x280 f2fs_write_node_pages+0x6a/0x2c0 do_writepages+0xd0/0x170 __writeback_single_inode+0x56/0x4c0 writeback_sb_inodes+0x22a/0x550 __writeback_inodes_wb+0x4c/0xf0 wb_writeback+0x300/0x400 wb_workfn+0x3de/0x500 process_one_work+0x230/0x5c0 worker_thread+0x1da/0x3d0 kthread+0x10d/0x250 ret_from_fork+0x164/0x190 ret_from_fork_asm+0x1a/0x30 Cc: Jan Prusakowski Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +++++++ fs/f2fs/f2fs.h | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index db3831f7f2f5..02806e2edce4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1778,6 +1778,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) llist_for_each_entry_safe(req, next, dispatch_list, llnode) { diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); req->ret = ret; + req->delta_time = diff; complete(&req->wait); sum_diff += diff; @@ -1873,6 +1874,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) else flush_remained_ckpt_reqs(sbi, &req); + if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) { + f2fs_warn_ratelimited(sbi, + "blocked on checkpoint for %u ms", cprc->peak_time); + dump_stack(); + } + return req.ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548c..7b567ab953e4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -334,7 +334,10 @@ struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ - ktime_t queue_time; /* request queued time */ + union { + ktime_t queue_time; /* request queued time */ + ktime_t delta_time; /* time in queue */ + }; }; struct ckpt_req_control { @@ -350,6 +353,9 @@ struct ckpt_req_control { unsigned int peak_time; /* peak wait time in msec until now */ }; +/* a time threshold that checkpoint was blocked for, unit: ms */ +#define CP_LONG_LATENCY_THRESHOLD 5000 + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ From 57e74035ad5e427ed301c2dc28e2b07a1d523175 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Jul 2025 13:34:05 +0800 Subject: [PATCH 02/43] f2fs: add time stats of checkpoint for debug checkpoint was blocked for 18643 ms Step 0: 0 ms Step 1: 38 ms Step 2: 63 ms Step 3: 4 ms Step 4: 0 ms Step 5: 0 ms Step 6: 9 ms Step 7: 0 ms Step 8: 18277 ms Step 9: 249 ms Cc: Jan Prusakowski Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 22 +++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 02806e2edce4..bbe07e3a6c75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) return get_sectors_written(sbi->sb->s_bdev); } +static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type) +{ + cpc->stats.times[type] = ktime_get(); +} + +static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long long sb_diff, cur_diff; + enum cp_time ct; + + sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END], + sbi->cp_stats.times[CP_TIME_START]); + cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END], + cpc->stats.times[CP_TIME_START]); + + if (cur_diff > sb_diff) { + sbi->cp_stats = cpc->stats; + if (cur_diff < CP_LONG_LATENCY_THRESHOLD) + return; + + f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff); + for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++) + f2fs_warn(sbi, "Step#%d: %llu ms", ct, + (u64)ktime_ms_delta(cpc->stats.times[ct + 1], + cpc->stats.times[ct])); + } +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_META); + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); @@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_CP_META); + /* Wait for all dirty meta pages to be submitted for IO */ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) return err; + stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE); /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP); /* * invalidate intermediate page cache borrowed from meta inode which are @@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + stat_cp_time(cpc, CP_TIME_START); + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return -EROFS; @@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason != CP_RESIZE) f2fs_down_write(&sbi->cp_global_sem); + stat_cp_time(cpc, CP_TIME_LOCK); + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) @@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (err) goto out; + stat_cp_time(cpc, CP_TIME_OP_LOCK); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_flush_merged_writes(sbi); @@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + stat_cp_time(cpc, CP_TIME_FLUSH_META); + /* save inmem log status */ f2fs_save_inmem_curseg(sbi); @@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) stat_inc_cp_count(sbi); stop: unblock_operations(sbi); + stat_cp_time(cpc, CP_TIME_END); + check_cp_time(sbi, cpc); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7b567ab953e4..64a4fda315c9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -269,11 +269,32 @@ enum { #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ +enum cp_time { + CP_TIME_START, /* begin */ + CP_TIME_LOCK, /* after cp_global_sem */ + CP_TIME_OP_LOCK, /* after block_operation */ + CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_SYNC_META, /* after sync_meta_pages */ + CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ + CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ + CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ + CP_TIME_FLUSH_DEVICE, /* after flush device cache */ + CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ + CP_TIME_END, /* after unblock_operation */ + CP_TIME_MAX, +}; + +/* time cost stats of checkpoint */ +struct cp_stats { + ktime_t times[CP_TIME_MAX]; +}; + struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; + struct cp_stats stats; }; /* @@ -1643,6 +1664,7 @@ struct f2fs_sb_info { unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ + struct cp_stats cp_stats; /* for time stat of checkpoint */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ From e75ce117905d2830976a289e718470f3230fa30a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Jul 2025 14:03:38 +0800 Subject: [PATCH 03/43] f2fs: fix condition in __allow_reserved_blocks() If reserve_root mount option is not assigned, __allow_reserved_blocks() will return false, it's not correct, fix it. Fixes: 7e65be49ed94 ("f2fs: add reserved blocks for root user") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 64a4fda315c9..6b66db642e09 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2389,8 +2389,6 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, { if (!inode) return true; - if (!test_opt(sbi, RESERVE_ROOT)) - return false; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) @@ -2411,7 +2409,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_blocks(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { From 76bb6a72bc1daa3d973395349f366231768f8877 Mon Sep 17 00:00:00 2001 From: "mason.zhang" Date: Thu, 31 Jul 2025 23:19:17 +0800 Subject: [PATCH 04/43] f2fs: add error checking in do_write_page() Otherwise, the filesystem may unaware of potential file corruption. Signed-off-by: mason.zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc82d42ef14c..04b0a3c1804d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3936,12 +3936,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); + int err; if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio)) { + err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (unlikely(err)) { + f2fs_err_ratelimited(fio->sbi, + "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", + __func__, fio->ino, folio->index, type, + fio->old_blkaddr, fio->new_blkaddr, err); if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); From 632f0b6c3e32758e5c93d4e3c2860a3708b9853e Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 4 Aug 2025 23:52:26 -0700 Subject: [PATCH 05/43] f2fs: add lookup_mode mount option For casefolded directories, f2fs may fall back to a linear search if a hash-based lookup fails. This can cause severe performance regressions. While this behavior can be controlled by userspace tools (e.g. mkfs, fsck) by setting an on-disk flag, a kernel-level solution is needed to guarantee the lookup behavior regardless of the on-disk state. This commit introduces the 'lookup_mode' mount option to provide this kernel-side control. The option accepts three values: - perf: (Default) Enforces a hash-only lookup. The linear fallback is always disabled. - compat: Enables the linear search fallback for compatibility with directory entries from older kernels. - auto: Determines the mode based on the on-disk flag, preserving the userspace-based behavior. Signed-off-by: Daniel Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 19 +++++++++++++++++++ fs/f2fs/dir.c | 17 ++++++++++++++++- fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/super.c | 25 +++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index e5bb89452aff..5cad369ceb92 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -370,6 +370,25 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes: ====================== =============== =============== ======== nat_bits Enable nat_bits feature to enhance full/empty nat blocks access, by default it's disabled. +lookup_mode=%s Control the directory lookup behavior for casefolded + directories. This option has no effect on directories + that do not have the casefold feature enabled. + + ================== ======================================== + Value Description + ================== ======================================== + perf (Default) Enforces a hash-only lookup. + The linear search fallback is always + disabled, ignoring the on-disk flag. + compat Enables the linear search fallback for + compatibility with directory entries + created by older kernel that used a + different case-folding algorithm. + This mode ignores the on-disk flag. + auto F2FS determines the mode based on the + on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL` + flag. + ================== ======================================== ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fffd7749d6d1..48f4f98afb01 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,21 @@ #include "xattr.h" #include +static inline bool f2fs_should_fallback_to_linear(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return false; + case LOOKUP_COMPAT: + return true; + case LOOKUP_AUTO: + return !sb_no_casefold_compat_fallback(sbi->sb); + } + return false; +} + #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -366,7 +381,7 @@ start_find_entry: out: #if IS_ENABLED(CONFIG_UNICODE) - if (!sb_no_casefold_compat_fallback(dir->i_sb) && + if (f2fs_should_fallback_to_linear(dir) && IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b66db642e09..36d6a6803916 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -212,6 +212,7 @@ struct f2fs_mount_info { int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 @@ -1475,6 +1476,12 @@ enum { TOTAL_CALL = FOREGROUND, }; +enum f2fs_lookup_mode { + LOOKUP_PERF, + LOOKUP_COMPAT, + LOOKUP_AUTO, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..0638ae9816ac 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -181,6 +181,7 @@ enum { Opt_nat_bits, Opt_jqfmt, Opt_checkpoint, + Opt_lookup_mode, Opt_err, }; @@ -244,6 +245,13 @@ static const struct constant_table f2fs_param_errors[] = { {} }; +static const struct constant_table f2fs_param_lookup_mode[] = { + {"perf", LOOKUP_PERF}, + {"compat", LOOKUP_COMPAT}, + {"auto", LOOKUP_AUTO}, + {} +}; + static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), @@ -300,6 +308,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), fsparam_flag("age_extent_cache", Opt_age_extent_cache), fsparam_enum("errors", Opt_errors, f2fs_param_errors), + fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode), {} }; @@ -336,6 +345,7 @@ static match_table_t f2fs_checkpoint_tokens = { #define F2FS_SPEC_discard_unit (1 << 21) #define F2FS_SPEC_memory_mode (1 << 22) #define F2FS_SPEC_errors (1 << 23) +#define F2FS_SPEC_lookup_mode (1 << 24) struct f2fs_fs_context { struct f2fs_mount_info info; @@ -1143,6 +1153,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_nat_bits: ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); break; + case Opt_lookup_mode: + F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_lookup_mode; + break; } return 0; } @@ -1652,6 +1666,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; if (ctx->spec_mask & F2FS_SPEC_errors) F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + if (ctx->spec_mask & F2FS_SPEC_lookup_mode) + F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode; f2fs_apply_compression(fc, sb); f2fs_apply_test_dummy_encryption(fc, sb); @@ -2416,6 +2432,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, NAT_BITS)) seq_puts(seq, ",nat_bits"); + if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF) + seq_show_option(seq, "lookup_mode", "perf"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT) + seq_show_option(seq, "lookup_mode", "compat"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO) + seq_show_option(seq, "lookup_mode", "auto"); + return 0; } @@ -2480,6 +2503,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) #endif f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + + F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF; } #ifdef CONFIG_QUOTA From 1bd119da0b93d6d61891edf2b9d036800aba687f Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 4 Aug 2025 23:52:27 -0700 Subject: [PATCH 06/43] f2fs: add sysfs entry for effective lookup mode This commit introduces a new read-only sysfs entry at /sys/fs/f2fs//effective_lookup_mode. This entry displays the actual directory lookup mode F2FS is currently using. This is needed for debugging and verification, as the behavior is determined by both on-disk flags and mount options. Signed-off-by: Daniel Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 15 +++++++++++++++ fs/f2fs/sysfs.c | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index bc0e7fefc39d..ce189acd1908 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -883,3 +883,18 @@ Date: June 2025 Contact: "Daeho Jeong" Description: Control GC algorithm for boost GC. 0: cost benefit, 1: greedy Default: 1 + +What: /sys/fs/f2fs//effective_lookup_mode +Date: August 2025 +Contact: "Daniel Lee" +Description: + This is a read-only entry to show the effective directory lookup mode + F2FS is currently using for casefolded directories. + This considers both the "lookup_mode" mount option and the on-disk + encoding flag, SB_ENC_NO_COMPAT_FALLBACK_FL. + + Possible values are: + - "perf": Hash-only lookup. + - "compat": Hash-based lookup with a linear search fallback enabled + - "auto:perf": lookup_mode is auto and fallback is disabled on-disk + - "auto:compat": lookup_mode is auto and fallback is enabled on-disk diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f736052dea50..82489c78aeda 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a, le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags)); } +static ssize_t effective_lookup_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return sysfs_emit(buf, "perf\n"); + case LOOKUP_COMPAT: + return sysfs_emit(buf, "compat\n"); + case LOOKUP_AUTO: + if (sb_no_casefold_compat_fallback(sbi->sb)) + return sysfs_emit(buf, "auto:perf\n"); + return sysfs_emit(buf, "auto:compat\n"); + } + return 0; +} + static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1211,6 +1227,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(encoding_flags); +F2FS_GENERAL_RO_ATTR(effective_lookup_mode); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); @@ -1329,6 +1346,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), ATTR_LIST(encoding_flags), + ATTR_LIST(effective_lookup_mode), ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), From 0fe1c6bec54ea68ed8c987b3890f2296364e77bb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Aug 2025 14:29:10 +0800 Subject: [PATCH 07/43] f2fs: fix to avoid overflow while left shift operation Should cast type of folio->index from pgoff_t to loff_t to avoid overflow while left shift operation. Fixes: 3265d3db1f16 ("f2fs: support partial truncation on compressed inode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5c1f47e45dab..6cd8902849cf 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1245,7 +1245,7 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) for (i = cluster_size - 1; i >= 0; i--) { struct folio *folio = page_folio(rpages[i]); - loff_t start = folio->index << PAGE_SHIFT; + loff_t start = (loff_t)folio->index << PAGE_SHIFT; if (from <= start) { folio_zero_segment(folio, 0, folio_size(folio)); From 0b2cd5092139f499544c77b5107a74e5fdb3a386 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Aug 2025 14:11:06 +0800 Subject: [PATCH 08/43] f2fs: fix to zero data after EOF for compressed file correctly generic/091 may fail, then it bisects to the bad commit ba8dac350faf ("f2fs: fix to zero post-eof page"). What will cause generic/091 to fail is something like below Testcase #1: 1. write 16k as compressed blocks 2. truncate to 12k 3. truncate to 20k 4. verify data in range of [12k, 16k], however data is not zero as expected Script of Testcase #1 mkfs.f2fs -f -O extra_attr,compression /dev/vdb mount -t f2fs -o compress_extension=* /dev/vdb /mnt/f2fs dd if=/dev/zero of=/mnt/f2fs/file bs=12k count=1 dd if=/dev/random of=/mnt/f2fs/file bs=4k count=1 seek=3 conv=notrunc sync truncate -s $((12*1024)) /mnt/f2fs/file truncate -s $((20*1024)) /mnt/f2fs/file dd if=/mnt/f2fs/file of=/mnt/f2fs/data bs=4k count=1 skip=3 od /mnt/f2fs/data umount /mnt/f2fs Analisys: in step 2), we will redirty all data pages from #0 to #3 in compressed cluster, and zero page #3, in step 3), f2fs_setattr() will call f2fs_zero_post_eof_page() to drop all page cache post eof, includeing dirtied page #3, in step 4) when we read data from page #3, it will decompressed cluster and extra random data to page #3, finally, we hit the non-zeroed data post eof. However, the commit ba8dac350faf ("f2fs: fix to zero post-eof page") just let the issue be reproduced easily, w/o the commit, it can reproduce this bug w/ below Testcase #2: 1. write 16k as compressed blocks 2. truncate to 8k 3. truncate to 12k 4. truncate to 20k 5. verify data in range of [12k, 16k], however data is not zero as expected Script of Testcase #2 mkfs.f2fs -f -O extra_attr,compression /dev/vdb mount -t f2fs -o compress_extension=* /dev/vdb /mnt/f2fs dd if=/dev/zero of=/mnt/f2fs/file bs=12k count=1 dd if=/dev/random of=/mnt/f2fs/file bs=4k count=1 seek=3 conv=notrunc sync truncate -s $((8*1024)) /mnt/f2fs/file truncate -s $((12*1024)) /mnt/f2fs/file truncate -s $((20*1024)) /mnt/f2fs/file echo 3 > /proc/sys/vm/drop_caches dd if=/mnt/f2fs/file of=/mnt/f2fs/data bs=4k count=1 skip=3 od /mnt/f2fs/data umount /mnt/f2fs Anlysis: in step 2), we will redirty all data pages from #0 to #3 in compressed cluster, and zero page #2 and #3, in step 3), we will truncate page #3 in page cache, in step 4), expand file size, in step 5), hit random data post eof w/ the same reason in Testcase #1. Root Cause: In f2fs_truncate_partial_cluster(), after we truncate partial data block on compressed cluster, all pages in cluster including the one post eof will be dirtied, after another tuncation, dirty page post eof will be dropped, however on-disk compressed cluster is still valid, it may include non-zero data post eof, result in exposing previous non-zero data post eof while reading. Fix: In f2fs_truncate_partial_cluster(), let change as below to fix: - call filemap_write_and_wait_range() to flush dirty page - call truncate_pagecache() to drop pages or zero partial page post eof - call f2fs_do_truncate_blocks() to truncate non-compress cluster to last valid block Fixes: 3265d3db1f16 ("f2fs: support partial truncation on compressed inode") Reported-by: Jan Prusakowski Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6cd8902849cf..72bc05b913af 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1246,19 +1246,28 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) for (i = cluster_size - 1; i >= 0; i--) { struct folio *folio = page_folio(rpages[i]); loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - if (from <= start) { - folio_zero_segment(folio, 0, folio_size(folio)); - } else { - folio_zero_segment(folio, from - start, - folio_size(folio)); + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) break; - } } f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + err = f2fs_do_truncate_blocks(inode, + round_up(from, PAGE_SIZE), lock); } - return 0; + return err; } static int f2fs_write_compressed_pages(struct compress_ctx *cc, From cbba5038ee29f16f583978d6debe486503693c70 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Aug 2025 14:29:12 +0800 Subject: [PATCH 09/43] f2fs: clean up f2fs_truncate_partial_cluster() Clean up codes as below: - avoid unnecessary "err > 0" check condition - use "1 << log_cluster_size" instead of F2FS_I(inode)->i_cluster_size No logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 48 +++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 72bc05b913af..6ad8d3bc6df7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; + struct page **rpages; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; + int i; int err; err = f2fs_is_compressed_cluster(inode, start_idx); @@ -1238,36 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) if (err <= 0) return err; - if (err > 0) { - struct page **rpages = fsdata; - int cluster_size = F2FS_I(inode)->i_cluster_size; - int i; + rpages = fsdata; - for (i = cluster_size - 1; i >= 0; i--) { - struct folio *folio = page_folio(rpages[i]); - loff_t start = (loff_t)folio->index << PAGE_SHIFT; - loff_t offset = from > start ? from - start : 0; + for (i = (1 << log_cluster_size) - 1; i >= 0; i--) { + struct folio *folio = page_folio(rpages[i]); + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - folio_zero_segment(folio, offset, folio_size(folio)); + folio_zero_segment(folio, offset, folio_size(folio)); - if (from >= start) - break; - } - - f2fs_compress_write_end(inode, fsdata, start_idx, true); - - err = filemap_write_and_wait_range(inode->i_mapping, - round_down(from, cluster_size << PAGE_SHIFT), - LLONG_MAX); - if (err) - return err; - - truncate_pagecache(inode, from); - - err = f2fs_do_truncate_blocks(inode, - round_up(from, PAGE_SIZE), lock); + if (from >= start) + break; } - return err; + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, 1 << log_cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock); } static int f2fs_write_compressed_pages(struct compress_ctx *cc, From 2e8f4c2b2bb12fc3d40762f1bb778e95c6ddbc93 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Aug 2025 09:48:35 +0800 Subject: [PATCH 10/43] f2fs: fix to clear unusable_cap for checkpoint=enable mount -t f2fs -o checkpoint=disable:10% /dev/vdb /mnt/f2fs/ mount -t f2fs -o remount,checkpoint=enable /dev/vdb /mnt/f2fs/ kernel log: F2FS-fs (vdb): Adjust unusable cap for checkpoint=disable = 204440 / 10% If we has assigned checkpoint=enable mount option, unusable_cap{,_perc} parameters of checkpoint=disable should be reset, then calculation and log print could be avoid in adjust_unusable_cap_perc(). Fixes: 1ae18f71cb52 ("f2fs: fix checkpoint=disable:%u%%") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0638ae9816ac..195ec6e598d2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -998,6 +998,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; default: From 8fc6056dcf79937c46c97fa4996cda65956437a9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Aug 2025 10:44:31 +0800 Subject: [PATCH 11/43] f2fs: fix to detect potential corrupted nid in free_nid_list As reported, on-disk footer.ino and footer.nid is the same and out-of-range, let's add sanity check on f2fs_alloc_nid() to detect any potential corruption in free_nid_list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 ++++++++++++++++- include/linux/f2fs_fs.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..286e8a53f8e7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; static struct kmem_cache *fsync_node_entry_slab; +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} + /* * Check whether the given nid is within node id range. */ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -2634,6 +2639,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f8b8bfc0e73..6afb4a13b81d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -79,6 +79,7 @@ enum stop_cp_reason { STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_CORRUPTED_FREE_BITMAP, + STOP_CP_REASON_CORRUPTED_NID, STOP_CP_REASON_MAX, }; From 4bc347779698b5e67e1514bab105c2c083e55502 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Aug 2025 12:00:25 +0800 Subject: [PATCH 12/43] f2fs: add timeout in f2fs_enable_checkpoint() During f2fs_enable_checkpoint() in remount(), if we flush a large amount of dirty pages into slow device, it may take long time which will block write IO, let's add a timeout machanism during dirty pages flush to avoid long time block in f2fs_enable_checkpoint(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 21 +++++++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 36d6a6803916..0655c5c20eee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -267,6 +267,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ @@ -1397,6 +1398,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 195ec6e598d2..7f5ee1daa5a4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2594,16 +2594,24 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - int retry = DEFAULT_RETRY_IO_COUNT; + unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + + f2fs_update_time(sbi, ENABLE_TIME); /* we should flush all the data to keep data consistency */ - do { - sync_inodes_sb(sbi->sb); + while (get_pages(sbi, F2FS_DIRTY_DATA)) { + writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); - if (unlikely(retry < 0)) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + if (f2fs_time_over(sbi, ENABLE_TIME)) + break; + } + + sync_inodes_sb(sbi->sb); + + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -4200,6 +4208,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); From 80b6d1d2535a343e43d658777a46f1ebce8f3413 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Aug 2025 12:00:26 +0800 Subject: [PATCH 13/43] f2fs: dump more information for f2fs_{enable,disable}_checkpoint() Changes as below: - print more logs for f2fs_{enable,disable}_checkpoint() - account and dump time stats for f2fs_enable_checkpoint() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7f5ee1daa5a4..87f6490d9247 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2589,15 +2589,24 @@ out_unlock: restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err); return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + long long start, writeback, end; + + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_update_time(sbi, ENABLE_TIME); + start = ktime_get(); + /* we should flush all the data to keep data consistency */ while (get_pages(sbi, F2FS_DIRTY_DATA)) { writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); @@ -2606,6 +2615,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) if (f2fs_time_over(sbi, ENABLE_TIME)) break; } + writeback = ktime_get(); sync_inodes_sb(sbi->sb); @@ -2624,6 +2634,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); + + end = ktime_get(); + + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); } static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) From 00798cd24f014fe55cbcdfeac4bab19ad6cd2bcb Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Fri, 8 Aug 2025 17:48:01 +0800 Subject: [PATCH 14/43] f2fs: Add bggc_io_aware to adjust the priority of BG_GC when issuing IO Currently, we have encountered some issues while testing ZUFS. In situations near the storage limit (e.g., 50GB remaining), and after simulating fragmentation by repeatedly writing and deleting data, we found that application installation and startup tests conducted after idling for a few minutes take significantly longer several times that of traditional UFS. Tracing the operations revealed that the majority of I/Os were issued by background GC, which blocks normal I/O operations. Under normal circumstances, ZUFS indeed requires more background GC and employs a more aggressive GC strategy. However, I aim to find a way to minimize the impact on regular I/O operations under these near-limit conditions. To address this, I have introduced a bggc_io_aware feature, which controls the prioritization of background GC in the presence of I/Os. This switch can be adjusted at the framework level to implement different strategies. If set to AWARE_ALL_IO, all background GC operations will be skipped during active I/O issuance. The default option remains consistent with the current strategy, ensuring no change in behavior. Signed-off-by: Liao Yuanhong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 13 +++++++++++++ fs/f2fs/f2fs.h | 18 +++++++++++------- fs/f2fs/super.c | 2 ++ fs/f2fs/sysfs.c | 9 +++++++++ 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ce189acd1908..66ee15681983 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -898,3 +898,16 @@ Description: - "compat": Hash-based lookup with a linear search fallback enabled - "auto:perf": lookup_mode is auto and fallback is disabled on-disk - "auto:compat": lookup_mode is auto and fallback is enabled on-disk + +What: /sys/fs/f2fs//bggc_io_aware +Date: August 2025 +Contact: "Liao Yuanhong" +Description: Used to adjust the BG_GC priority when pending IO, with a default value + of 0. Specifically, for ZUFS, the default value is 1. + + ================== ====================================================== + value description + bggc_io_aware = 0 skip background GC if there is any kind of pending IO + bggc_io_aware = 1 skip background GC if there is pending read IO + bggc_io_aware = 2 don't aware IO for background GC + ================== ====================================================== diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0655c5c20eee..dba391b72119 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -155,6 +155,12 @@ enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; +enum bggc_io_aware_policy { + AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ + AWARE_READ_IO, /* skip background GC if there is pending read IO */ + AWARE_NONE, /* don't aware IO for background GC */ +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -1841,6 +1847,7 @@ struct f2fs_sb_info { spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; /* first segno in sequential zone */ + unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ /* For write statistics */ u64 sectors_written_start; @@ -3033,13 +3040,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (sbi->gc_mode == GC_URGENT_HIGH) return true; - if (zoned_gc) { - if (is_inflight_read_io(sbi)) - return false; - } else { - if (is_inflight_io(sbi, type)) - return false; - } + if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) + return false; + if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) + return false; if (sbi->gc_mode == GC_URGENT_MID) return true; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87f6490d9247..2000880b7dca 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4683,9 +4683,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); sbi->aligned_blksize = true; + sbi->bggc_io_aware = AWARE_ALL_IO; #ifdef CONFIG_BLK_DEV_ZONED sbi->max_open_zones = UINT_MAX; sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ; + sbi->bggc_io_aware = AWARE_READ_IO; #endif for (i = 0; i < max_devices; i++) { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 82489c78aeda..1ffaf9e74ce9 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -882,6 +882,13 @@ out: return count; } + if (!strcmp(a->attr.name, "bggc_io_aware")) { + if (t < AWARE_ALL_IO || t > AWARE_NONE) + return -EINVAL; + sbi->bggc_io_aware = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1191,6 +1198,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); +F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1320,6 +1328,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), + ATTR_LIST(bggc_io_aware), #ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), From 2141879369681ff899a6a0857eef8bded785a5b5 Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Thu, 7 Aug 2025 21:35:01 +0800 Subject: [PATCH 15/43] f2fs: add reserved nodes for privileged users This patch allows privileged users to reserve nodes via the 'reserve_node' mount option, which is similar to the existing 'reserve_root' option. "-o reserve_node=" means nodes are reserved for privileged users only. Signed-off-by: Chunhai Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 9 ++++--- fs/f2fs/f2fs.h | 17 ++++++++---- fs/f2fs/super.c | 43 +++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5cad369ceb92..e06cbb823bb7 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -173,9 +173,12 @@ data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. reserve_root=%d Support configuring reserved space which is used for allocation from a privileged user with specified uid or - gid, unit: 4KB, the default limit is 0.2% of user blocks. -resuid=%d The user ID which may use the reserved blocks. -resgid=%d The group ID which may use the reserved blocks. + gid, unit: 4KB, the default limit is 12.5% of user blocks. +reserve_node=%d Support configuring reserved nodes which are used for + allocation from a privileged user with specified uid or + gid, the default limit is 12.5% of all nodes. +resuid=%d The user ID which may use the reserved blocks and nodes. +resgid=%d The group ID which may use the reserved blocks and nodes. fault_injection=%d Enable fault injection in all supported types with specified injection rate. fault_type=%d Support configuring fault injection type, should be diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dba391b72119..d6a49de1b7e9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; * string rather than using the MS_LAZYTIME flag, so this must remain. */ #define F2FS_MOUNT_LAZYTIME 0x40000000 +#define F2FS_MOUNT_RESERVE_NODE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -178,6 +179,7 @@ struct f2fs_rwsem { struct f2fs_mount_info { unsigned int opt; block_t root_reserved_blocks; /* root reserved blocks */ + block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ @@ -2400,7 +2402,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, +static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) @@ -2425,7 +2427,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { @@ -2783,7 +2785,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count; + unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; @@ -2805,15 +2807,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - avail_user_block_count = get_available_block_count(sbi, inode, false); + avail_user_block_count = get_available_block_count(sbi, inode, + test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } + avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (test_opt(sbi, RESERVE_NODE) && + !__allow_reserved_root(sbi, inode, true)) + avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; - if (unlikely(valid_node_count > sbi->total_node_count)) { + if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2000880b7dca..5aa9d650512d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -143,6 +143,7 @@ enum { Opt_extent_cache, Opt_data_flush, Opt_reserve_root, + Opt_reserve_node, Opt_resgid, Opt_resuid, Opt_mode, @@ -273,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_flag_no("extent_cache", Opt_extent_cache), fsparam_flag("data_flush", Opt_data_flush), fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_u32("reserve_node", Opt_reserve_node), fsparam_gid("resgid", Opt_resgid), fsparam_uid("resuid", Opt_resuid), fsparam_enum("mode", Opt_mode, f2fs_param_mode), @@ -346,6 +348,7 @@ static match_table_t f2fs_checkpoint_tokens = { #define F2FS_SPEC_memory_mode (1 << 22) #define F2FS_SPEC_errors (1 << 23) #define F2FS_SPEC_lookup_mode (1 << 24) +#define F2FS_SPEC_reserve_node (1 << 25) struct f2fs_fs_context { struct f2fs_mount_info info; @@ -447,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count >> 3), + block_t block_limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); + block_t node_limit = sbi->total_node_count >> 3; /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && - F2FS_OPTION(sbi).root_reserved_blocks > limit) { - F2FS_OPTION(sbi).root_reserved_blocks = limit; + F2FS_OPTION(sbi).root_reserved_blocks > block_limit) { + F2FS_OPTION(sbi).root_reserved_blocks = block_limit; f2fs_info(sbi, "Reduce reserved blocks for root = %u", F2FS_OPTION(sbi).root_reserved_blocks); } - if (!test_opt(sbi, RESERVE_ROOT) && + if (test_opt(sbi, RESERVE_NODE) && + F2FS_OPTION(sbi).root_reserved_nodes > node_limit) { + F2FS_OPTION(sbi).root_reserved_nodes = node_limit; + f2fs_info(sbi, "Reduce reserved nodes for root = %u", + F2FS_OPTION(sbi).root_reserved_nodes); + } + if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) && (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) - f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root" + " and reserve_node", from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -851,6 +862,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; ctx->spec_mask |= F2FS_SPEC_reserve_root; break; + case Opt_reserve_node: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_node; + break; case Opt_resuid: F2FS_CTX_INFO(ctx).s_resuid = result.uid; ctx->spec_mask |= F2FS_SPEC_resuid; @@ -1442,6 +1458,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc, ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; } + if (test_opt(sbi, RESERVE_NODE) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { + f2fs_info(sbi, "Preserve previous reserve_node=%u", + F2FS_OPTION(sbi).root_reserved_nodes); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE; + } err = f2fs_check_test_dummy_encryption(fc, sb); if (err) @@ -1641,6 +1665,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) if (ctx->spec_mask & F2FS_SPEC_reserve_root) F2FS_OPTION(sbi).root_reserved_blocks = F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_reserve_node) + F2FS_OPTION(sbi).root_reserved_nodes = + F2FS_CTX_INFO(ctx).root_reserved_nodes; if (ctx->spec_mask & F2FS_SPEC_resgid) F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; if (ctx->spec_mask & F2FS_SPEC_resuid) @@ -2363,9 +2390,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); - if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE)) + seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u," + "resgid=%u", F2FS_OPTION(sbi).root_reserved_blocks, + F2FS_OPTION(sbi).root_reserved_nodes, from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, From 248a99832499036b330d471336b8765ef0e14cb4 Mon Sep 17 00:00:00 2001 From: Soham Metha Date: Wed, 13 Aug 2025 01:45:15 +0530 Subject: [PATCH 16/43] docs: f2fs: fixed spelling mistakes in documentation found/fixed the following typos - deivces -> devices - substracting -> subtracting in `Documentation/ABI/testing/sysfs-fs-f2fs` Signed-off-by: Soham Metha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 66ee15681983..ee3acc8c2cb8 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -822,8 +822,8 @@ What: /sys/fs/f2fs//gc_valid_thresh_ratio Date: September 2024 Contact: "Daeho Jeong" Description: It controls the valid block ratio threshold not to trigger excessive GC - for zoned deivces. The initial value of it is 95(%). F2FS will stop the - background GC thread from intiating GC for sections having valid blocks + for zoned devices. The initial value of it is 95(%). F2FS will stop the + background GC thread from initiating GC for sections having valid blocks exceeding the ratio. What: /sys/fs/f2fs//max_read_extent_count @@ -847,7 +847,7 @@ Description: For several zoned storage devices, vendors will provide extra space filesystem level GC. To do that, we can reserve the space using reserved_blocks. However, it is not enough, since this extra space should not be shown to users. So, with this new sysfs node, we can hide the space - by substracting reserved_blocks from total bytes. + by subtracting reserved_blocks from total bytes. What: /sys/fs/f2fs//encoding_flags Date: April 2025 From 4978f0a5ee239e66b2633725ecc81d3b43c2a7a5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Aug 2025 19:41:23 +0800 Subject: [PATCH 17/43] f2fs: clean up w/ get_left_section_blocks() Introduce get_left_section_blocks() for cleanup, no logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e2ee5c686b1..1ce2c8abaf48 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } +static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, + enum log_type type, unsigned int segno) +{ + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, type)->next_blkoff; + return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); +} + static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) @@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, i)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, i, segno); blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; if (blocks > left_blocks) @@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); if (dent_blocks > left_blocks) return false; From 930a9a6ee8e7ffa20af4bffbfc2bbd21d83bf81c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 18 Aug 2025 10:09:38 +0800 Subject: [PATCH 18/43] f2fs: fix to avoid NULL pointer dereference in f2fs_check_quota_consistency() syzbot reported a f2fs bug as below: Oops: gen[ 107.736417][ T5848] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 5848 Comm: syz-executor263 Tainted: G W 6.17.0-rc1-syzkaller-00014-g0e39a731820a #0 PREEMPT_{RT,(full)} RIP: 0010:strcmp+0x3c/0xc0 lib/string.c:284 Call Trace: f2fs_check_quota_consistency fs/f2fs/super.c:1188 [inline] f2fs_check_opt_consistency+0x1378/0x2c10 fs/f2fs/super.c:1436 __f2fs_remount fs/f2fs/super.c:2653 [inline] f2fs_reconfigure+0x482/0x1770 fs/f2fs/super.c:5297 reconfigure_super+0x224/0x890 fs/super.c:1077 do_remount fs/namespace.c:3314 [inline] path_mount+0xd18/0xfe0 fs/namespace.c:4112 do_mount fs/namespace.c:4133 [inline] __do_sys_mount fs/namespace.c:4344 [inline] __se_sys_mount+0x317/0x410 fs/namespace.c:4321 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The direct reason is f2fs_check_quota_consistency() may suffer null-ptr-deref issue in strcmp(). The bug can be reproduced w/ below scripts: mkfs.f2fs -f /dev/vdb mount -t f2fs -o usrquota /dev/vdb /mnt/f2fs quotacheck -uc /mnt/f2fs/ umount /mnt/f2fs mount -t f2fs -o usrjquota=aquota.user,jqfmt=vfsold /dev/vdb /mnt/f2fs mount -t f2fs -o remount,usrjquota=,jqfmt=vfsold /dev/vdb /mnt/f2fs umount /mnt/f2fs So, before old_qname and new_qname comparison, we need to check whether they are all valid pointers, fix it. Reported-by: syzbot+d371efea57d5aeab877b@syzkaller.appspotmail.com Fixes: d18535132523 ("f2fs: separate the options parsing and options checking") Closes: https://lore.kernel.org/linux-f2fs-devel/689ff889.050a0220.e29e5.0037.GAE@google.com Cc: Hongbo Li Signed-off-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5aa9d650512d..465604fdc5dd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1219,7 +1219,8 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (strcmp(old_qname, new_qname) == 0) { + if (new_qname && + strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } From ff11d8701b77e303593fd86cf9ef74ef3ac4048e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 18 Aug 2025 10:09:39 +0800 Subject: [PATCH 19/43] f2fs: fix to allow removing qf_name The mount behavior changed after commit d18535132523 ("f2fs: separate the options parsing and options checking"), let's fix it. [Scripts] mkfs.f2fs -f /dev/vdb mount -t f2fs -o usrquota /dev/vdb /mnt/f2fs quotacheck -uc /mnt/f2fs umount /mnt/f2fs mount -t f2fs -o usrjquota=aquota.user,jqfmt=vfsold /dev/vdb /mnt/f2fs mount|grep f2fs mount -t f2fs -o remount,usrjquota=,jqfmt=vfsold /dev/vdb /mnt/f2fs mount|grep f2fs dmesg [Before commit] mount#1: ...,quota,jqfmt=vfsold,usrjquota=aquota.user,... mount#2: ...,quota,jqfmt=vfsold,... kmsg: no output [After commit] mount#1: ...,quota,jqfmt=vfsold,usrjquota=aquota.user,... mount#2: ...,quota,jqfmt=vfsold,usrjquota=aquota.user,... kmsg: "user quota file already specified" [After patch] mount#1: ...,quota,jqfmt=vfsold,usrjquota=aquota.user,... mount#2: ...,quota,jqfmt=vfsold,... kmsg: "remove qf_name aquota.user" Fixes: d18535132523 ("f2fs: separate the options parsing and options checking") Cc: Hongbo Li Signed-off-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 465604fdc5dd..07f6c8cac07a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1219,8 +1219,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (new_qname && - strcmp(old_qname, new_qname) == 0) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } From f1a49c1b112b899643b01a5b86cf642a09a01cb6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Aug 2025 23:50:25 +0000 Subject: [PATCH 20/43] f2fs: show the list of donation files This patch introduces a proc entry to show the currently enrolled donation files. - "File path" indicates a file. - "Status" a. "Donated" means the file is registed in the donation list by fadvise(offset, length, POSIX_FADV_NOREUSE) b. "Evicted" means the donated pages were reclaimed. - "Offset (kb)" and "Length (kb) show the registered donation range. - "Cached pages (kb)" shows the amount of cached pages in the inode page cache. For example, # of files : 2 File path Status Donation offset (kb) Donation size (kb) File cached size (kb) --- /local/test2 Donated 0 1048576 2097152 /local/test Evicted 0 1048576 1048576 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1ffaf9e74ce9..86ebe97c256d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1769,6 +1769,68 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused donation_list_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + struct f2fs_inode_info *fi; + struct dentry *dentry; + char *buf, *path; + int i; + + buf = f2fs_getname(sbi); + if (!buf) + return 0; + + seq_printf(seq, "Donation List\n"); + seq_printf(seq, " # of files : %u\n", sbi->donate_files); + seq_printf(seq, " %-50s %10s %20s %20s %22s\n", + "File path", "Status", "Donation offset (kb)", + "Donation size (kb)", "File cached size (kb)"); + seq_printf(seq, "---\n"); + + for (i = 0; i < sbi->donate_files; i++) { + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + inode_lock_shared(inode); + + dentry = d_find_alias(inode); + if (!dentry) { + path = NULL; + } else { + path = dentry_path_raw(dentry, buf, PATH_MAX); + if (IS_ERR(path)) + goto next; + } + seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n", + path ? path : "", + is_inode_flag_set(inode, FI_DONATE_FINISHED) ? + "Evicted" : "Donated", + (loff_t)fi->donate_start << (PAGE_SHIFT - 10), + (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), + (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); +next: + inode_unlock_shared(inode); + iput(inode); + } + f2fs_putname(buf); + return 0; +} + #ifdef CONFIG_F2FS_FAULT_INJECTION static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, void *offset) @@ -1878,6 +1940,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); + proc_create_single_data("donation_list", 0444, sbi->s_proc, + donation_list_seq_show, sb); #ifdef CONFIG_F2FS_FAULT_INJECTION proc_create_single_data("inject_stats", 0444, sbi->s_proc, inject_stats_seq_show, sb); From c18ecd99e0c707ef8f83cace861cbc3162f4fdf1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Aug 2025 13:45:34 +0800 Subject: [PATCH 21/43] f2fs: fix to do sanity check on node footer for non inode dnode As syzbot reported below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/file.c:1243! Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5354 Comm: syz.0.0 Not tainted 6.17.0-rc1-syzkaller-00211-g90d970cade8e #0 PREEMPT(full) RIP: 0010:f2fs_truncate_hole+0x69e/0x6c0 fs/f2fs/file.c:1243 Call Trace: f2fs_punch_hole+0x2db/0x330 fs/f2fs/file.c:1306 f2fs_fallocate+0x546/0x990 fs/f2fs/file.c:2018 vfs_fallocate+0x666/0x7e0 fs/open.c:342 ksys_fallocate fs/open.c:366 [inline] __do_sys_fallocate fs/open.c:371 [inline] __se_sys_fallocate fs/open.c:369 [inline] __x64_sys_fallocate+0xc0/0x110 fs/open.c:369 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1e65f8ebe9 w/ a fuzzed image, f2fs may encounter panic due to it detects inconsistent truncation range in direct node in f2fs_truncate_hole(). The root cause is: a non-inode dnode may has the same footer.ino and footer.nid, so the dnode will be parsed as an inode, then ADDRS_PER_PAGE() may return wrong blkaddr count which may be 923 typically, by chance, dn.ofs_in_node is equal to 923, then count can be calculated to 0 in below statement, later it will trigger panic w/ f2fs_bug_on(, count == 0 || ...). count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); This patch introduces a new node_type NODE_TYPE_NON_INODE, then allowing passing the new_type to sanity_check_node_footer in f2fs_get_node_folio() to detect corruption that a non-inode dnode has the same footer.ino and footer.nid. Scripts to reproduce: mkfs.f2fs -f /dev/vdb mount /dev/vdb /mnt/f2fs touch /mnt/f2fs/foo touch /mnt/f2fs/bar dd if=/dev/zero of=/mnt/f2fs/foo bs=1M count=8 umount /mnt/f2fs inject.f2fs --node --mb i_nid --nid 4 --idx 0 --val 5 /dev/vdb mount /dev/vdb /mnt/f2fs xfs_io /mnt/f2fs/foo -c "fpunch 6984k 4k" Cc: stable@kernel.org Reported-by: syzbot+b9c7ffd609c3f09416ab@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/68a68e27.050a0220.1a3988.0002.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/gc.c | 4 ++-- fs/f2fs/node.c | 58 +++++++++++++++++++++++++++++++--------------- fs/f2fs/node.h | 1 + fs/f2fs/recovery.c | 2 +- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6a49de1b7e9..69ff43f81c88 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3810,6 +3810,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); * node.c */ struct node_info; +enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); @@ -3832,7 +3833,8 @@ int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 098e9f71421e..c0f209f74688 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1071,7 +1071,7 @@ next_step: } /* phase == 2 */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) continue; @@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return false; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 286e8a53f8e7..4254db453b2d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -876,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - nfolio[i] = f2fs_get_node_folio(sbi, nids[i]); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i], + NODE_TYPE_NON_INODE); if (IS_ERR(nfolio[i])) { err = PTR_ERR(nfolio[i]); f2fs_folio_put(nfolio[0], false); @@ -994,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - folio = f2fs_get_node_folio(sbi, dn->nid); + folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE); if (PTR_ERR(folio) == -ENOENT) return 1; else if (IS_ERR(folio)) @@ -1038,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid, + NODE_TYPE_NON_INODE); if (IS_ERR(folio)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); return PTR_ERR(folio); @@ -1116,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]); + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i], + NODE_TYPE_NON_INODE); if (IS_ERR(folios[i])) { err = PTR_ERR(folios[i]); idx = i - 1; @@ -1501,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - if (unlikely(nid != nid_of_node(folio) || - (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || - (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(folio))) || - time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - return -EFSCORRUPTED; + if (unlikely(nid != nid_of_node(folio))) + goto out_err; + + switch (ntype) { + case NODE_TYPE_INODE: + if (!IS_INODE(folio)) + goto out_err; + break; + case NODE_TYPE_XATTR: + if (!f2fs_has_xattr_block(ofs_of_node(folio))) + goto out_err; + break; + case NODE_TYPE_NON_INODE: + if (IS_INODE(folio)) + goto out_err; + break; + default: + break; } + if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER)) + goto out_err; return 0; +out_err: + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; } static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, @@ -1572,9 +1591,10 @@ out_put_err: return ERR_PTR(err); } -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type) { - return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR); + return __get_node_folio(sbi, nid, NULL, 0, node_type); } struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 030390543b54..9cb8dcf8d417 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -57,6 +57,7 @@ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, }; /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4cb3a91801b4..215e442db72c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -548,7 +548,7 @@ got_it: } /* Get the node page */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return PTR_ERR(node_folio); From 4bc6bf7b8830cc403a3cea7f4ef8794831126f54 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:27 +0700 Subject: [PATCH 22/43] Documentation: f2fs: Separate errors mode subtable errors=%s subtable is shown in htmldocs output as long-running paragraph instead due to missing separator from its previous paragraph. Add it. Fixes: b62e71be2110 ("f2fs: support errors=remount-ro|continue|panic mountoption") Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index e06cbb823bb7..acae1ce0f069 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -360,6 +360,7 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes: panic immediately, continue without doing anything, and remount the partition in read-only mode. By default it uses "continue" mode. + ====================== =============== =============== ======== mode continue remount-ro panic ====================== =============== =============== ======== From f23044152af2614c6fddfa9d975ac1f293b58729 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:28 +0700 Subject: [PATCH 23/43] Documentation: f2fs: Format compression level subtable Format compression_algorithm subtable as reST table as it does the semantic job rather than normal paragraph. Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index acae1ce0f069..41fa95b779e9 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -294,9 +294,13 @@ compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo" "lz4", "zstd" and "lzo-rle" algorithm. compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only "lz4" and "zstd" support compress level config. + + ========= =========== algorithm level range + ========= =========== lz4 3 - 16 zstd 1 - 22 + ========= =========== compress_log_size=%u Support configuring compress cluster size. The size will be 4KB * (1 << %u). The default and minimum sizes are 16KB. compress_extension=%s Support adding specified extension, so that f2fs can enable From f9c97e496293e50f9131196c26f94d29b2d6451d Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:29 +0700 Subject: [PATCH 24/43] Documentation: f2fs: Span write hint table section rows Write hint policy table has two rows which act as section rows: buffered io and direct io, yet these rows are written as normal rows instead. Column-span them. Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 41fa95b779e9..2e1ee10e8c60 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -822,11 +822,13 @@ ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " -- buffered io +------------------------------------------------------------------ N/A COLD_DATA WRITE_LIFE_EXTREME N/A HOT_DATA WRITE_LIFE_SHORT N/A WARM_DATA WRITE_LIFE_NOT_SET -- direct io +------------------------------------------------------------------ WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET From f4f36fffd8722d801d2bdacefe98d62d33f85537 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:30 +0700 Subject: [PATCH 25/43] Documentation: f2fs: Wrap snippets in literal code blocks Compression mode code and device aliasing shell snippets are shown in htmldocs output as long-running paragraph instead. Wrap them. Fixes: 602a16d58e9a ("f2fs: add compress_mode mount option") Fixes: 128d333f0dff ("f2fs: introduce device aliasing file") Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 58 +++++++++++++++--------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 2e1ee10e8c60..673ca35b6639 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -953,15 +953,15 @@ target file and the timing. The user can do manual compression/decompression on compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below. -To decompress a file, +To decompress a file:: -fd = open(filename, O_WRONLY, 0); -ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE); + fd = open(filename, O_WRONLY, 0); + ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE); -To compress a file, +To compress a file:: -fd = open(filename, O_WRONLY, 0); -ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE); + fd = open(filename, O_WRONLY, 0); + ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE); NVMe Zoned Namespace devices ---------------------------- @@ -991,32 +991,32 @@ reserved and used by another filesystem or for different purposes. Once that external usage is complete, the device aliasing file can be deleted, releasing the reserved space back to F2FS for its own use. - +.. code-block:: -# ls /dev/vd* -/dev/vdb (32GB) /dev/vdc (32GB) -# mkfs.ext4 /dev/vdc -# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb -# mount /dev/vdb /mnt/f2fs -# ls -l /mnt/f2fs -vdc.file -# df -h -/dev/vdb 64G 33G 32G 52% /mnt/f2fs + # ls /dev/vd* + /dev/vdb (32GB) /dev/vdc (32GB) + # mkfs.ext4 /dev/vdc + # mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb + # mount /dev/vdb /mnt/f2fs + # ls -l /mnt/f2fs + vdc.file + # df -h + /dev/vdb 64G 33G 32G 52% /mnt/f2fs -# mount -o loop /dev/vdc /mnt/ext4 -# df -h -/dev/vdb 64G 33G 32G 52% /mnt/f2fs -/dev/loop7 32G 24K 30G 1% /mnt/ext4 -# umount /mnt/ext4 + # mount -o loop /dev/vdc /mnt/ext4 + # df -h + /dev/vdb 64G 33G 32G 52% /mnt/f2fs + /dev/loop7 32G 24K 30G 1% /mnt/ext4 + # umount /mnt/ext4 -# f2fs_io getflags /mnt/f2fs/vdc.file -get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable -# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file -get a flag on noimmutable ret=0, flags=800010 -set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable -# rm /mnt/f2fs/vdc.file -# df -h -/dev/vdb 64G 753M 64G 2% /mnt/f2fs + # f2fs_io getflags /mnt/f2fs/vdc.file + get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable + # f2fs_io setflags noimmutable /mnt/f2fs/vdc.file + get a flag on noimmutable ret=0, flags=800010 + set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable + # rm /mnt/f2fs/vdc.file + # df -h + /dev/vdb 64G 753M 64G 2% /mnt/f2fs So, the key idea is, user can do any file operations on /dev/vdc, and reclaim the space after the use, while the space is counted as /data. From e7822326887897922fda6ba2f8540290cd373ee3 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:31 +0700 Subject: [PATCH 26/43] Documentation: f2fs: Indent compression_mode option list Indent description text so that compression_mode numbered list gets rendered as such in htmldocs output. Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 673ca35b6639..83479d2147c3 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -944,14 +944,16 @@ compression enabled files (refer to "Compression implementation" section for how enable compression on a regular inode). 1) compress_mode=fs -This is the default option. f2fs does automatic compression in the writeback of the -compression enabled files. + + This is the default option. f2fs does automatic compression in the writeback of the + compression enabled files. 2) compress_mode=user -This disables the automatic compression and gives the user discretion of choosing the -target file and the timing. The user can do manual compression/decompression on the -compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE -ioctls like the below. + + This disables the automatic compression and gives the user discretion of choosing the + target file and the timing. The user can do manual compression/decompression on the + compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE + ioctls like the below. To decompress a file:: From 62242ac510614761c1af644d056c2aedf639f479 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 20 Aug 2025 11:34:32 +0700 Subject: [PATCH 27/43] Documentation: f2fs: Reword title "What is F2FS" is rather a mistitle for the whole f2fs docs, as it implies the overview section (before "Background and design issues" section) and the docs covers beyond that: from mount options to filesystem implementation details. Retitle and add explicit overview section. Signed-off-by: Bagas Sanjaya Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 83479d2147c3..a8d02fe5be83 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -1,8 +1,11 @@ .. SPDX-License-Identifier: GPL-2.0 -========================================== -WHAT IS Flash-Friendly File System (F2FS)? -========================================== +================================= +Flash-Friendly File System (F2FS) +================================= + +Overview +======== NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have been equipped on a variety systems ranging from mobile to server systems. Since From b639c20e748cbe2962fd0c0cef22c0d354842fd5 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Thu, 28 Aug 2025 16:11:30 +0800 Subject: [PATCH 28/43] f2fs: Use allocate_section_policy to control write priority in multi-devices setups Introduces two new sys nodes: allocate_section_hint and allocate_section_policy. The allocate_section_hint identifies the boundary between devices, measured in sections; it defaults to the end of the device for single storage setups, and the end of the first device for multiple storage setups. The allocate_section_policy determines the write strategy, with a default value of 0 for normal sequential write strategy. A value of 1 prioritizes writes before the allocate_section_hint, while a value of 2 prioritizes writes after it. This strategy addresses the issue where, despite F2FS supporting multiple devices, SOC vendors lack multi-devices support (currently only supporting zoned devices). As a workaround, multiple storage devices are mapped to a single dm device. Both this workaround and the F2FS multi-devices solution may require prioritizing writing to certain devices, such as a device with better performance or when switching is needed due to performance degradation near a device's end. For scenarios with more than two devices, sort them at mount time to utilize this feature. When using this feature with a single storage device, it has almost no impact. However, for configurations where multiple storage devices are mapped to the same dm device using F2FS, utilizing this feature can provide some optimization benefits. Therefore, I believe it should not be limited to just multi-devices usage. Signed-off-by: Liao Yuanhong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 22 ++++++++++++++++++++++ fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/gc.c | 5 +++++ fs/f2fs/segment.c | 17 +++++++++++++++++ fs/f2fs/super.c | 4 ++++ fs/f2fs/sysfs.c | 18 ++++++++++++++++++ 6 files changed, 74 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ee3acc8c2cb8..b590809869ca 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -911,3 +911,25 @@ Description: Used to adjust the BG_GC priority when pending IO, with a default v bggc_io_aware = 1 skip background GC if there is pending read IO bggc_io_aware = 2 don't aware IO for background GC ================== ====================================================== + +What: /sys/fs/f2fs//allocate_section_hint +Date: August 2025 +Contact: "Liao Yuanhong" +Description: Indicates the hint section between the first device and others in multi-devices + setup. It defaults to the end of the first device in sections. For a single storage + device, it defaults to the total number of sections. It can be manually set to match + scenarios where multi-devices are mapped to the same dm device. + +What: /sys/fs/f2fs//allocate_section_policy +Date: August 2025 +Contact: "Liao Yuanhong" +Description: Controls write priority in multi-devices setups. A value of 0 means normal writing. + A value of 1 prioritizes writing to devices before the allocate_section_hint. A value of 2 + prioritizes writing to devices after the allocate_section_hint. The default is 0. + + =========================== ========================================================== + value description + allocate_section_policy = 0 Normal writing + allocate_section_policy = 1 Prioritize writing to section before allocate_section_hint + allocate_section_policy = 2 Prioritize writing to section after allocate_section_hint + =========================== ========================================================== diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 69ff43f81c88..9d3bc9633c1d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -162,6 +162,12 @@ enum bggc_io_aware_policy { AWARE_NONE, /* don't aware IO for background GC */ }; +enum device_allocation_policy { + ALLOCATE_FORWARD_NOHINT, + ALLOCATE_FORWARD_WITHIN_HINT, + ALLOCATE_FORWARD_FROM_HINT, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -1850,6 +1856,8 @@ struct f2fs_sb_info { bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; /* first segno in sequential zone */ unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ + unsigned int allocate_section_hint; /* the boundary position between devices */ + unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c0f209f74688..ed3acbfc83ca 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2182,6 +2182,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; MAIN_SECS(sbi) += secs; + if (sbi->allocate_section_hint > MAIN_SECS(sbi)) + sbi->allocate_section_hint = MAIN_SECS(sbi); FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -2189,6 +2191,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; + sbi->allocate_section_hint = FDEV(0).total_segments / + SEGS_PER_SEC(sbi); + FDEV(last_dev).total_segments = (int)FDEV(last_dev).total_segments + segs; FDEV(last_dev).end_blk = diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 04b0a3c1804d..e4d71755a60f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int alloc_policy = sbi->allocate_section_policy; + unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; int i; int ret = 0; @@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi, } #endif + /* + * Prevent allocate_section_hint from exceeding MAIN_SECS() + * due to desynchronization. + */ + if (alloc_policy != ALLOCATE_FORWARD_NOHINT && + alloc_hint > MAIN_SECS(sbi)) + alloc_hint = MAIN_SECS(sbi); + + if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && + hint < alloc_hint) + hint = alloc_hint; + else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && + hint >= alloc_hint) + hint = 0; + find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f6c8cac07a..1e0678e37a30 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4234,6 +4234,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = SEGS_TO_BLKS(sbi, ((le32_to_cpu(raw_super->segment_count_nat) / 2) * NAT_ENTRY_PER_BLOCK)); + sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count); + sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT; F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); @@ -4748,6 +4750,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) SEGS_TO_BLKS(sbi, FDEV(i).total_segments) - 1 + le32_to_cpu(raw_super->segment0_blkaddr); + sbi->allocate_section_hint = FDEV(i).total_segments / + SEGS_PER_SEC(sbi); } else { FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; FDEV(i).end_blk = FDEV(i).start_blk + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 86ebe97c256d..081b5df0e664 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -889,6 +889,20 @@ out: return count; } + if (!strcmp(a->attr.name, "allocate_section_hint")) { + if (t < 0 || t > MAIN_SECS(sbi)) + return -EINVAL; + sbi->allocate_section_hint = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_policy")) { + if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT) + return -EINVAL; + sbi->allocate_section_policy = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1161,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity); F2FS_SBI_GENERAL_RW_ATTR(dir_level); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy); #ifdef CONFIG_F2FS_IOSTAT F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); @@ -1398,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), ATTR_LIST(reserved_pin_section), + ATTR_LIST(allocate_section_hint), + ATTR_LIST(allocate_section_policy), NULL, }; ATTRIBUTE_GROUPS(f2fs); From c872b6279cd26762339ff02513e2a3f16149a6f1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 27 Aug 2025 21:50:29 +0000 Subject: [PATCH 29/43] f2fs: allocate HOT_DATA for IPU writes Let's split IPU writes in hot data area to improve the GC efficiency. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e4d71755a60f..b45eace879d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3689,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_cow_file(inode)) + f2fs_is_cow_file(inode) || + is_inode_flag_set(inode, FI_NEED_IPU)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); From 44749759d5e61479f747f2f5471b161861172aaf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 4 Sep 2025 18:08:08 +0000 Subject: [PATCH 30/43] f2fs: merge FUA command with the existing writes FUA writes can be merged to the existing write IOs. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7961e0ddfca3..30cb2f230690 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { + blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA); + if (io->fio.op != fio->op) return false; - return io->fio.op_flags == fio->op_flags; + return (io->fio.op_flags & mask) == (fio->op_flags & mask); } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, From 2f84e99d61946eb2d17bf97c41362ac9a7473008 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Sep 2025 20:27:19 +0800 Subject: [PATCH 31/43] f2fs: avoid unnecessary folio_clear_uptodate() for cleanup In error path of __get_node_folio(), if the folio is not uptodate, let's avoid unnecessary folio_clear_uptodate() for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4254db453b2d..482a362f2625 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1570,7 +1570,7 @@ repeat: if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto out_err; + goto out_put_err; } if (!f2fs_inode_chksum_verify(sbi, folio)) { From d0236266cbfe12284e05c51f2cd7ec150a23d9f0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Sep 2025 20:27:20 +0800 Subject: [PATCH 32/43] f2fs: clean up error handing of f2fs_submit_page_read() Below two functions should never fail, clean up error handling in their callers: 1) f2fs_grab_read_bio() in f2fs_submit_page_read() 2) bio_add_folio() in f2fs_submit_page_read() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 30cb2f230690..86395b546311 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1085,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, } /* This can handle encryption stuffs */ -static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, +static void f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { @@ -1094,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); - if (IS_ERR(bio)) - return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); - if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { - iostat_update_and_unbind_ctx(bio); - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); - return -EFAULT; - } + if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) + f2fs_bug_on(sbi, 1); + inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); - return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) @@ -1267,10 +1260,8 @@ got_it: return folio; } - err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, + f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); - if (err) - goto put_err; return folio; put_err: @@ -2147,16 +2138,10 @@ submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } - if (bio == NULL) { + if (bio == NULL) bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - bio = NULL; - goto out; - } - } /* * If the page is under writeback, we need to wait for @@ -2305,18 +2290,10 @@ submit_and_realloc: bio = NULL; } - if (!bio) { + if (!bio) bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret, true); - f2fs_put_dnode(&dn); - *bio_ret = NULL; - return ret; - } - } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; @@ -3641,11 +3618,9 @@ repeat: err = -EFSCORRUPTED; goto put_folio; } - err = f2fs_submit_page_read(use_cow ? + f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); - if (err) - goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { From a33be64b98d0723748d2fab0832b926613e1fce0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 9 Sep 2025 00:26:06 +0000 Subject: [PATCH 33/43] f2fs: fix wrong layout information on 16KB page This patch fixes to support different block size. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 081b5df0e664..7992386fb9e6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1768,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, seq_printf(seq, " Main : 0x%010x (%10d)\n", SM_I(sbi)->main_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); - seq_printf(seq, " # of Sections : %12d\n", - le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); + seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10); + seq_printf(seq, " Segment size : %12d MB\n", + (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); seq_printf(seq, " Segs/Sections : %12d\n", SEGS_PER_SEC(sbi)); seq_printf(seq, " Section size : %12d MB\n", - SEGS_PER_SEC(sbi) << 1); + (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); + seq_printf(seq, " # of Sections : %12d\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); if (!f2fs_is_multi_device(sbi)) return 0; From 869833f54e8306326b85ca3ed08979b7ad412a4a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Sep 2025 16:12:50 +0800 Subject: [PATCH 34/43] f2fs: fix to update map->m_next_extent correctly in f2fs_map_blocks() Script to reproduce: mkfs.f2fs -O extra_attr,compression /dev/vdb -f mount /dev/vdb /mnt/f2fs -o mode=lfs,noextent_cache cd /mnt/f2fs f2fs_io write 1 0 1024 rand dsync testfile xfs_io testfile -c "fsync" f2fs_io write 1 0 512 rand dsync testfile xfs_io testfile -c "fsync" cd / umount /mnt/f2fs mount /dev/vdb /mnt/f2fs f2fs_io precache_extents /mnt/f2fs/testfile umount /mnt/f2fs Tracepoint output: f2fs_update_read_extent_tree_range: dev = (253,16), ino = 4, pgofs = 0, len = 512, blkaddr = 1055744, c_len = 0 f2fs_update_read_extent_tree_range: dev = (253,16), ino = 4, pgofs = 513, len = 351, blkaddr = 17921, c_len = 0 f2fs_update_read_extent_tree_range: dev = (253,16), ino = 4, pgofs = 864, len = 160, blkaddr = 18272, c_len = 0 During precache_extents, there is off-by-one issue, we should update map->m_next_extent to pgofs rather than pgofs + 1, if last blkaddr is valid and not contiguous to previous extent. Fixes: c4020b2da4c9 ("f2fs: support F2FS_IOC_PRECACHE_EXTENTS") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 86395b546311..2bec936174e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1776,7 +1776,7 @@ sync_out: map->m_len - ofs); } if (map->m_next_extent) - *map->m_next_extent = pgofs + 1; + *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; } f2fs_put_dnode(&dn); unlock_out: From 9251a9e6e871cb03c4714a18efa8f5d4a8818450 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 10 Sep 2025 16:40:24 +0800 Subject: [PATCH 35/43] f2fs: fix to truncate first page in error path of f2fs_truncate() syzbot reports a bug as below: loop0: detected capacity change from 0 to 40427 F2FS-fs (loop0): Wrong SSA boundary, start(3584) end(4096) blocks(3072) F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop0): invalid crc value F2FS-fs (loop0): f2fs_convert_inline_folio: corrupted inline inode ino=3, i_addr[0]:0x1601, run fsck to fix. ------------[ cut here ]------------ kernel BUG at fs/inode.c:753! RIP: 0010:clear_inode+0x169/0x190 fs/inode.c:753 Call Trace: evict+0x504/0x9c0 fs/inode.c:810 f2fs_fill_super+0x5612/0x6fa0 fs/f2fs/super.c:5047 get_tree_bdev_flags+0x40e/0x4d0 fs/super.c:1692 vfs_get_tree+0x8f/0x2b0 fs/super.c:1815 do_new_mount+0x2a2/0x9e0 fs/namespace.c:3808 do_mount fs/namespace.c:4136 [inline] __do_sys_mount fs/namespace.c:4347 [inline] __se_sys_mount+0x317/0x410 fs/namespace.c:4324 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f During f2fs_evict_inode(), clear_inode() detects that we missed to truncate all page cache before destorying inode, that is because in below path, we will create page #0 in cache, but missed to drop it in error path, let's fix it. - evict - f2fs_evict_inode - f2fs_truncate - f2fs_convert_inline_inode - f2fs_grab_cache_folio : create page #0 in cache - f2fs_convert_inline_folio : sanity check failed, return -EFSCORRUPTED - clear_inode detects that inode->i_data.nrpages is not zero Fixes: 92dffd01790a ("f2fs: convert inline_data when i_size becomes large") Reported-by: syzbot+90266696fe5daacebd35@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/68c09802.050a0220.3c6139.000e.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 42faaed6a02d..1aae4361d0a8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -904,8 +904,16 @@ int f2fs_truncate(struct inode *inode) /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); - if (err) + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. + */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); return err; + } } err = f2fs_truncate_blocks(inode, i_size_read(inode), true); From d625a2b08c089397d3a03bff13fa8645e4ec7a01 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 1 Sep 2025 10:04:15 +0800 Subject: [PATCH 36/43] f2fs: fix to avoid migrating empty section It reports a bug from device w/ zufs: F2FS-fs (dm-64): Inconsistent segment (173822) type [1, 0] in SSA and SIT F2FS-fs (dm-64): Stopped filesystem due to reason: 4 Thread A Thread B - f2fs_expand_inode_data - f2fs_allocate_pinning_section - f2fs_gc_range - do_garbage_collect w/ segno #x - writepage - f2fs_allocate_data_block - new_curseg - allocate segno #x The root cause is: fallocate on pinning file may race w/ block allocation as above, result in do_garbage_collect() from fallocate() may migrate segment which is just allocated by a log, the log will update segment type in its in-memory structure, however GC will get segment type from on-disk SSA block, once segment type changes by log, we can detect such inconsistency, then shutdown filesystem. In this case, on-disk SSA shows type of segno #173822 is 1 (SUM_TYPE_NODE), however segno #173822 was just allocated as data type segment, so in-memory SIT shows type of segno #173822 is 0 (SUM_TYPE_DATA). Change as below to fix this issue: - check whether current section is empty before gc - add sanity checks on do_garbage_collect() to avoid any race case, result in migrating segment used by log. - btw, it fixes misc issue in printed logs: "SSA and SIT" -> "SIT and SSA". Fixes: 9703d69d9d15 ("f2fs: support file pinning for zoned devices") Cc: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ed3acbfc83ca..a7708cf80c04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto skip; + } + if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && @@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); @@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. + */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; From c2f7c32b254006ad48f8e4efb2e7e7bf71739f17 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 16 Sep 2025 10:47:09 +0800 Subject: [PATCH 37/43] f2fs: fix to mitigate overhead of f2fs_zero_post_eof_page() f2fs_zero_post_eof_page() may cuase more overhead due to invalidate_lock and page lookup, change as below to mitigate its overhead: - check new_size before grabbing invalidate_lock - lookup and invalidate pages only in range of [old_size, new_size] Fixes: ba8dac350faf ("f2fs: fix to zero post-eof page") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1aae4361d0a8..ffa045b39c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,15 +35,23 @@ #include #include -static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) { loff_t old_size = i_size_read(inode); if (old_size >= new_size) return; + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); /* zero or drop pages only in range of [old_size, new_size] */ - truncate_pagecache(inode, old_size); + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + filemap_invalidate_unlock(inode->i_mapping); } static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); @@ -1149,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, filemap_invalidate_lock(inode->i_mapping); if (attr->ia_size > old_size) - f2fs_zero_post_eof_page(inode, attr->ia_size); + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1268,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1555,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1678,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1814,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1872,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); f2fs_balance_fs(sbi, true); @@ -4922,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); return count; } From 8175c864391753b210f3dcfae1aeed686a226ebb Mon Sep 17 00:00:00 2001 From: wangzijie Date: Wed, 17 Sep 2025 10:36:21 +0800 Subject: [PATCH 38/43] f2fs: fix zero-sized extent for precache extents Script to reproduce: f2fs_io write 1 0 1881 rand dsync testfile f2fs_io fallocate 0 7708672 4096 testfile f2fs_io write 1 1881 1 rand buffered testfile fsync testfile umount mount f2fs_io precache_extents testfile When the data layout is something like this: dnode1: dnode2: [0] A [0] NEW_ADDR [1] A+1 [1] 0x0 ... [1016] A+1016 [1017] B (B!=A+1017) [1017] 0x0 During precache_extents, we map the last block(valid blkaddr) in dnode1: map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr(valid blkaddr); map->m_len = 1; then we goto next_dnode, meet the first block in dnode2(hole), goto sync_out: map->m_flags & F2FS_MAP_MAPPED == true, and we make zero-sized extent: map->m_len = 1 ofs = start_pgofs - map->m_lblk = 1882 - 1881 = 1 ei.fofs = start_pgofs = 1882 ei.len = map->m_len - ofs = 1 - 1 = 0 Rebased on patch[1], this patch can cover these cases to avoid zero-sized extent: A,B,C is valid blkaddr case1: dnode1: dnode2: [0] A [0] NEW_ADDR [1] A+1 [1] 0x0 ... .... [1016] A+1016 [1017] B (B!=A+1017) [1017] 0x0 case2: dnode1: dnode2: [0] A [0] C (C!=B+1) [1] A+1 [1] C+1 ... .... [1016] A+1016 [1017] B (B!=A+1017) [1017] 0x0 case3: dnode1: dnode2: [0] A [0] C (C!=B+2) [1] A+1 [1] C+1 ... .... [1015] A+1015 [1016] B (B!=A+1016) [1017] B+1 [1017] 0x0 [1] https://lore.kernel.org/linux-f2fs-devel/20250912081250.44383-1-chao@kernel.org/ Fixes: c4020b2da4c9 ("f2fs: support F2FS_IOC_PRECACHE_EXTENTS") Signed-off-by: wangzijie Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2bec936174e2..3a974e2637aa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1771,9 +1771,10 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_read_extent_cache_range(&dn, - start_pgofs, map->m_pblk + ofs, - map->m_len - ofs); + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); } if (map->m_next_extent) *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; From 23361bd54966b437e1ed3eb1a704572f4b279e58 Mon Sep 17 00:00:00 2001 From: wangzijie Date: Wed, 17 Sep 2025 10:36:22 +0800 Subject: [PATCH 39/43] f2fs: fix infinite loop in __insert_extent_tree() When we get wrong extent info data, and look up extent_node in rb tree, it will cause infinite loop (CONFIG_F2FS_CHECK_FS=n). Avoiding this by return NULL and print some kernel messages in that case. Signed-off-by: wangzijie Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 199c1e7a83ef..ba0a07bfd346 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { + f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, " + "extent node in rb tree [%u, %u, %u], age [%llu, %llu], " + "extent node to insert [%u, %u, %u], age [%llu, %llu]", + __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age, + en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks); f2fs_bug_on(sbi, 1); + return NULL; } } From 45b70947a425fa121a8b9bcbb77472d9e35def6a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 16 Sep 2025 16:52:43 +0800 Subject: [PATCH 40/43] f2fs: add sanity check on ei.len in __update_extent_tree_range() Add a sanity check in __update_extent_tree_range() to detect any zero-sized extent update. Signed-off-by: wangzijie Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ba0a07bfd346..33e09c453c70 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -670,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode, if (!et) return; + if (unlikely(len == 0)) { + f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, " + "extent [%u, %u, %u], age [%llu, %llu]", + __func__, type, tei->fofs, tei->blk, tei->len, + tei->age, tei->last_blocks); + f2fs_bug_on(sbi, 1); + return; + } + if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); From 72bdca6231a35aaf3fc1f3ac174d5203621d9e7f Mon Sep 17 00:00:00 2001 From: Yunji Kang Date: Wed, 24 Sep 2025 16:43:58 +0900 Subject: [PATCH 41/43] f2fs: readahead node blocks in F2FS_GET_BLOCK_PRECACHE mode In f2fs_precache_extents(), For large files, It requires reading many node blocks. Instead of reading each node block with synchronous I/O, this patch applies readahead so that node blocks can be fetched in advance. It reduces the overhead of repeated sync reads and improves efficiency when precaching extents of large files. I created a file with the same largest extent and executed the test. For this experiment, I set the file's largest extent with an offset of 0 and a size of 1GB. I configured the remaining area with 100MB extents. 5GB test file: dd if=/dev/urandom of=test1 bs=1m count=5120 cp test1 test2 fsync test1 dd if=test1 of=test2 bs=1m skip=1024 seek=1024 count=100 conv=notrunc dd if=test1 of=test2 bs=1m skip=1224 seek=1224 count=100 conv=notrunc ... dd if=test1 of=test2 bs=1m skip=5024 seek=5024 count=100 conv=notrunc reboot I also created 10GB and 20GB files with large extents using the same method. ioctl(F2FS_IOC_PRECACHE_EXTENTS) test results are as follows: +-----------+---------+---------+-----------+ | File size | Before | After | Reduction | +-----------+---------+---------+-----------+ | 5GB | 101.8ms | 37.0ms | 72.1% | | 10GB | 222.9ms | 56.0ms | 74.9% | | 20GB | 446.2ms | 116.4ms | 73.9% | +-----------+---------+---------+-----------+ Tested on a 256GB mobile device with an SM8750 chipset. Reviewed-by: Sungjong Seo Reviewed-by: Sunmin Jeong Signed-off-by: Yunji Kang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3a974e2637aa..ed90b2c18d4c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1565,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; + if (flag == F2FS_GET_BLOCK_PRECACHE) + mode = LOOKUP_NODE_RA; + next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) From edf7e9040fc52c922db947f9c6c36f07377c52ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 28 Sep 2025 18:24:22 +0800 Subject: [PATCH 42/43] f2fs: fix UAF issue in f2fs_merge_page_bio() As JY reported in bugzilla [1], Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : [0xffffffe51d249484] f2fs_is_cp_guaranteed+0x70/0x98 lr : [0xffffffe51d24adbc] f2fs_merge_page_bio+0x520/0x6d4 CPU: 3 UID: 0 PID: 6790 Comm: kworker/u16:3 Tainted: P B W OE 6.12.30-android16-5-maybe-dirty-4k #1 5f7701c9cbf727d1eebe77c89bbbeb3371e895e5 Tainted: [P]=PROPRIETARY_MODULE, [B]=BAD_PAGE, [W]=WARN, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Workqueue: writeback wb_workfn (flush-254:49) Call trace: f2fs_is_cp_guaranteed+0x70/0x98 f2fs_inplace_write_data+0x174/0x2f4 f2fs_do_write_data_page+0x214/0x81c f2fs_write_single_data_page+0x28c/0x764 f2fs_write_data_pages+0x78c/0xce4 do_writepages+0xe8/0x2fc __writeback_single_inode+0x4c/0x4b4 writeback_sb_inodes+0x314/0x540 __writeback_inodes_wb+0xa4/0xf4 wb_writeback+0x160/0x448 wb_workfn+0x2f0/0x5dc process_scheduled_works+0x1c8/0x458 worker_thread+0x334/0x3f0 kthread+0x118/0x1ac ret_from_fork+0x10/0x20 [1] https://bugzilla.kernel.org/show_bug.cgi?id=220575 The panic was caused by UAF issue w/ below race condition: kworker - writepages - f2fs_write_cache_pages - f2fs_write_single_data_page - f2fs_do_write_data_page - f2fs_inplace_write_data - f2fs_merge_page_bio - add_inu_page : cache page #1 into bio & cache bio in io->bio_list - f2fs_write_single_data_page - f2fs_do_write_data_page - f2fs_inplace_write_data - f2fs_merge_page_bio - add_inu_page : cache page #2 into bio which is linked in io->bio_list write - f2fs_write_begin : write page #1 - f2fs_folio_wait_writeback - f2fs_submit_merged_ipu_write - f2fs_submit_write_bio : submit bio which inclues page #1 and #2 software IRQ - f2fs_write_end_io - fscrypt_free_bounce_page : freed bounced page which belongs to page #2 - inc_page_count( , WB_DATA_TYPE(data_folio), false) : data_folio points to fio->encrypted_page the bounced page can be freed before accessing it in f2fs_is_cp_guarantee() It can reproduce w/ below testcase: Run below script in shell #1: for ((i=1;i>0;i++)) do xfs_io -f /mnt/f2fs/enc/file \ -c "pwrite 0 32k" -c "fdatasync" Run below script in shell #2: for ((i=1;i>0;i++)) do xfs_io -f /mnt/f2fs/enc/file \ -c "pwrite 0 32k" -c "fdatasync" So, in f2fs_merge_page_bio(), let's avoid using fio->encrypted_page after commit page into internal ipu cache. Fixes: 0b20fcec8651 ("f2fs: cache global IPU bio") Reported-by: JY Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed90b2c18d4c..ef38e62cda8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -913,7 +913,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; From 4e715744bf7b4e5521cc3b77f310060f862cb719 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Oct 2025 01:55:32 +0000 Subject: [PATCH 43/43] f2fs: add missing dput() when printing the donation list We missed to call dput() on the grabbed dentry. Fixes: f1a49c1b112b ("f2fs: show the list of donation files") Reviewed-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 7992386fb9e6..6d2a4fba68a2 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1845,6 +1845,7 @@ static int __maybe_unused donation_list_seq_show(struct seq_file *seq, (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); next: + dput(dentry); inode_unlock_shared(inode); iput(inode); }