From 573c14c8218b408b3fe9f6be058f85d0f586cf26 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 25 Sep 2025 17:25:56 -0700
Subject: [PATCH 01/14] iomap: move bio read logic into helper function

Move the iomap_readpage_iter() bio read logic into a separate helper
function, iomap_bio_read_folio_range(). This is needed to make iomap
read/readahead more generically usable, especially for filesystems that
do not require CONFIG_BLOCK.

Additionally rename buffered write's iomap_read_folio_range() function
to iomap_bio_read_folio_range_sync() to better describe its synchronous
behavior.

Signed-off-by: Joanne Koong
Reviewed-by: Darrick J. Wong
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 70 ++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9535733ed07a..7e65075b6345 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -367,36 +367,15 @@ struct iomap_readpage_ctx {
 	struct readahead_control *rac;
 };
 
-static int iomap_readpage_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
+static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
+		struct iomap_readpage_ctx *ctx, loff_t pos, size_t plen)
 {
-	const struct iomap *iomap = &iter->iomap;
-	loff_t pos = iter->pos;
-	loff_t length = iomap_length(iter);
 	struct folio *folio = ctx->cur_folio;
-	struct iomap_folio_state *ifs;
-	size_t poff, plen;
+	const struct iomap *iomap = &iter->iomap;
+	struct iomap_folio_state *ifs = folio->private;
+	size_t poff = offset_in_folio(folio, pos);
+	loff_t length = iomap_length(iter);
 	sector_t sector;
-	int ret;
-
-	if (iomap->type == IOMAP_INLINE) {
-		ret = iomap_read_inline_data(iter, folio);
-		if (ret)
-			return ret;
-		return iomap_iter_advance(iter, length);
-	}
-
-	/* zero post-eof blocks as the page may be mapped */
-	ifs = ifs_alloc(iter->inode, folio, iter->flags);
-	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
-	if (plen == 0)
-		goto done;
-
-	if (iomap_block_needs_zeroing(iter, pos)) {
-		folio_zero_range(folio, poff, plen);
-		iomap_set_range_uptodate(folio, poff, plen);
-		goto done;
-	}
 
 	ctx->cur_folio_in_bio = true;
 	if (ifs) {
@@ -435,6 +414,37 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
 		ctx->bio->bi_end_io = iomap_read_end_io;
 		bio_add_folio_nofail(ctx->bio, folio, plen, poff);
 	}
+}
+
+static int iomap_readpage_iter(struct iomap_iter *iter,
+		struct iomap_readpage_ctx *ctx)
+{
+	const struct iomap *iomap = &iter->iomap;
+	loff_t pos = iter->pos;
+	loff_t length = iomap_length(iter);
+	struct folio *folio = ctx->cur_folio;
+	size_t poff, plen;
+	int ret;
+
+	if (iomap->type == IOMAP_INLINE) {
+		ret = iomap_read_inline_data(iter, folio);
+		if (ret)
+			return ret;
+		return iomap_iter_advance(iter, length);
+	}
+
+	/* zero post-eof blocks as the page may be mapped */
+	ifs_alloc(iter->inode, folio, iter->flags);
+	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
+
+	if (iomap_block_needs_zeroing(iter, pos)) {
+		folio_zero_range(folio, poff, plen);
+		iomap_set_range_uptodate(folio, poff, plen);
+	} else {
+		iomap_bio_read_folio_range(iter, ctx, pos, plen);
+	}
 
 done:
 	/*
@@ -559,7 +569,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 }
 EXPORT_SYMBOL_GPL(iomap_readahead);
 
-static int iomap_read_folio_range(const struct iomap_iter *iter,
+static int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
 		struct
folio *folio, loff_t pos, size_t len) { const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -572,7 +582,7 @@ static int iomap_read_folio_range(const struct iomap_iter *iter, return submit_bio_wait(&bio); } #else -static int iomap_read_folio_range(const struct iomap_iter *iter, +static int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t len) { WARN_ON_ONCE(1); @@ -749,7 +759,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, status = write_ops->read_folio_range(iter, folio, block_start, plen); else - status = iomap_read_folio_range(iter, + status = iomap_bio_read_folio_range_sync(iter, folio, block_start, plen); if (status) return status; From 7588469b5ea4b67ece0d725f6c600f0d8791e5cb Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:25:57 -0700 Subject: [PATCH 02/14] iomap: move read/readahead bio submission logic into helper function Move the read/readahead bio submission logic into a separate helper. This is needed to make iomap read/readahead more generically usable, especially for filesystems that do not require CONFIG_BLOCK. Signed-off-by: Joanne Koong Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7e65075b6345..f8b985bb5a6b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -367,6 +367,14 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; +static void iomap_bio_submit_read(struct iomap_readpage_ctx *ctx) +{ + struct bio *bio = ctx->bio; + + if (bio) + submit_bio(bio); +} + static void iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx, loff_t pos, size_t plen) { @@ -392,8 +400,7 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - if (ctx->bio) - submit_bio(ctx->bio); + iomap_bio_submit_read(ctx); if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -488,13 +495,10 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = iomap_read_folio_iter(&iter, &ctx); - if (ctx.bio) { - submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_folio_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_folio_in_bio); + iomap_bio_submit_read(&ctx); + + if (!ctx.cur_folio_in_bio) folio_unlock(folio); - } /* * Just like mpage_readahead and block_read_full_folio, we always @@ -560,12 +564,10 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) while (iomap_iter(&iter, ops) > 0) iter.status = iomap_readahead_iter(&iter, &ctx); - if (ctx.bio) - submit_bio(ctx.bio); - if (ctx.cur_folio) { - if (!ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); - } + iomap_bio_submit_read(&ctx); + + if (ctx.cur_folio && !ctx.cur_folio_in_bio) + folio_unlock(ctx.cur_folio); } EXPORT_SYMBOL_GPL(iomap_readahead); From d1f9893fcd846dd912243046e66470c06b505cfa Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:25:58 -0700 Subject: [PATCH 03/14] iomap: store read/readahead bio generically Store the iomap_readpage_ctx bio generically as a "void *read_ctx". 
This makes the read/readahead interface more generic, which allows it to be used by filesystems that may not be block-based and may not have CONFIG_BLOCK set. Signed-off-by: Joanne Koong Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f8b985bb5a6b..b06b532033ad 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -363,13 +363,13 @@ static void iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct folio *cur_folio; bool cur_folio_in_bio; - struct bio *bio; + void *read_ctx; struct readahead_control *rac; }; static void iomap_bio_submit_read(struct iomap_readpage_ctx *ctx) { - struct bio *bio = ctx->bio; + struct bio *bio = ctx->read_ctx; if (bio) submit_bio(bio); @@ -384,6 +384,7 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, size_t poff = offset_in_folio(folio, pos); loff_t length = iomap_length(iter); sector_t sector; + struct bio *bio = ctx->read_ctx; ctx->cur_folio_in_bio = true; if (ifs) { @@ -393,9 +394,8 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, } sector = iomap_sector(iomap, pos); - if (!ctx->bio || - bio_end_sector(ctx->bio) != sector || - !bio_add_folio(ctx->bio, folio, plen, poff)) { + if (!bio || bio_end_sector(bio) != sector || + !bio_add_folio(bio, folio, plen, poff)) { gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -404,22 +404,21 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), - REQ_OP_READ, gfp); + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates * what do_mpage_read_folio does. */ - if (!ctx->bio) { - ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, - orig_gfp); - } + if (!bio) + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); if (ctx->rac) - ctx->bio->bi_opf |= REQ_RAHEAD; - ctx->bio->bi_iter.bi_sector = sector; - ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio_nofail(ctx->bio, folio, plen, poff); + bio->bi_opf |= REQ_RAHEAD; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = iomap_read_end_io; + bio_add_folio_nofail(bio, folio, plen, poff); + ctx->read_ctx = bio; } } From e0e15340e41e99334670caa3807133f23f43064f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:25:59 -0700 Subject: [PATCH 04/14] iomap: iterate over folio mapping in iomap_readpage_iter() Iterate over all non-uptodate ranges of a folio mapping in a single call to iomap_readpage_iter() instead of leaving the partial iteration to the caller. Signed-off-by: Joanne Koong Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 53 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index b06b532033ad..dbe5783ee68c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -430,6 +430,7 @@ static int iomap_readpage_iter(struct iomap_iter *iter, loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; size_t poff, plen; + loff_t count; int ret; if (iomap->type == IOMAP_INLINE) { @@ -439,41 +440,35 @@ static int iomap_readpage_iter(struct iomap_iter *iter, return iomap_iter_advance(iter, length); } - /* zero post-eof blocks as the page may be mapped */ ifs_alloc(iter->inode, folio, iter->flags); - iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); - if (plen == 0) - goto done; - if (iomap_block_needs_zeroing(iter, pos)) { - folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, poff, plen); - } else { - iomap_bio_read_folio_range(iter, ctx, pos, plen); - } + length = min_t(loff_t, length, + folio_size(folio) - offset_in_folio(folio, pos)); + while (length) { + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, + &plen); -done: - /* - * Move the caller beyond our range so that it keeps making progress. - * For that, we have to include any leading non-uptodate ranges, but - * we can skip trailing ones as they will be handled in the next - * iteration. - */ - length = pos - iter->pos + plen; - return iomap_iter_advance(iter, length); -} + count = pos - iter->pos + plen; + if (WARN_ON_ONCE(count > length)) + return -EIO; -static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) -{ - int ret; + if (plen == 0) + return iomap_iter_advance(iter, count); - while (iomap_length(iter)) { - ret = iomap_readpage_iter(iter, ctx); + /* zero post-eof blocks as the page may be mapped */ + if (iomap_block_needs_zeroing(iter, pos)) { + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); + } else { + iomap_bio_read_folio_range(iter, ctx, pos, plen); + } + + ret = iomap_iter_advance(iter, count); if (ret) return ret; + length -= count; + pos = iter->pos; } - return 0; } @@ -492,7 +487,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_readpage_iter(&iter, &ctx); iomap_bio_submit_read(&ctx); @@ -522,6 +517,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); + if (WARN_ON_ONCE(!ctx->cur_folio)) + return -EINVAL; ctx->cur_folio_in_bio = false; } ret = iomap_readpage_iter(iter, ctx); From 8805a9c64be79148dca14562c3ed56b12239bdca Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:00 -0700 Subject: [PATCH 05/14] iomap: rename iomap_readpage_iter() to iomap_read_folio_iter() ->readpage was deprecated and reads are now on folios. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. 
Wong
Reviewed-by: Christoph Hellwig
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index dbe5783ee68c..23601373573e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -422,7 +422,7 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
 	}
 }
 
-static int iomap_readpage_iter(struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
 		struct iomap_readpage_ctx *ctx)
 {
 	const struct iomap *iomap = &iter->iomap;
@@ -487,7 +487,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 	trace_iomap_readpage(iter.inode, 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_readpage_iter(&iter, &ctx);
+		iter.status = iomap_read_folio_iter(&iter, &ctx);
 
 	iomap_bio_submit_read(&ctx);
 
@@ -521,7 +521,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
 			return -EINVAL;
 		ctx->cur_folio_in_bio = false;
 	}
-	ret = iomap_readpage_iter(iter, ctx);
+	ret = iomap_read_folio_iter(iter, ctx);
 	if (ret)
 		return ret;
 }

From 87a13819dde34e8af247744e979583a31d353244 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 25 Sep 2025 17:26:00 -0700
Subject: [PATCH 06/14] iomap: rename iomap_readpage_ctx struct to
 iomap_read_folio_ctx

->readpage was deprecated and reads are now on folios.

Signed-off-by: Joanne Koong
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 23601373573e..09e65771a947 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -360,14 +360,14 @@ static void iomap_read_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
-struct iomap_readpage_ctx {
+struct iomap_read_folio_ctx {
 	struct folio *cur_folio;
 	bool cur_folio_in_bio;
 	void *read_ctx;
 	struct readahead_control *rac;
 };
 
-static void iomap_bio_submit_read(struct iomap_readpage_ctx *ctx)
+static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
 {
 	struct bio *bio = ctx->read_ctx;
 
@@ -376,7 +376,7 @@ static void iomap_bio_submit_read(struct iomap_readpage_ctx *ctx)
 }
 
 static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx, loff_t pos, size_t plen)
+		struct iomap_read_folio_ctx *ctx, loff_t pos, size_t plen)
 {
 	struct folio *folio = ctx->cur_folio;
 	const struct iomap *iomap = &iter->iomap;
@@ -423,7 +423,7 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter,
 }
 
 static int iomap_read_folio_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
+		struct iomap_read_folio_ctx *ctx)
 {
 	const struct iomap *iomap = &iter->iomap;
 	loff_t pos = iter->pos;
@@ -479,7 +479,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 		.pos = folio_pos(folio),
 		.len = folio_size(folio),
 	};
-	struct iomap_readpage_ctx ctx = {
+	struct iomap_read_folio_ctx ctx = {
 		.cur_folio = folio,
 	};
 	int ret;
@@ -504,7 +504,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 EXPORT_SYMBOL_GPL(iomap_read_folio);
 
 static int iomap_readahead_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
+		struct iomap_read_folio_ctx *ctx)
 {
 	int ret;
 
@@ -551,7 +551,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 		.pos = readahead_pos(rac),
 		.len =
readahead_length(rac), }; - struct iomap_readpage_ctx ctx = { + struct iomap_read_folio_ctx ctx = { .rac = rac, }; From d43558ae67299266ea54715773610fdec4291c90 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:02 -0700 Subject: [PATCH 07/14] iomap: track pending read bytes more optimally Instead of incrementing read_bytes_pending for every folio range read in (which requires acquiring the spinlock to do so), set read_bytes_pending to the folio size when the first range is asynchronously read in, keep track of how many bytes total are asynchronously read in, and adjust read_bytes_pending accordingly after issuing requests to read in all the necessary ranges. iomap_read_folio_ctx->cur_folio_in_bio can be removed since a non-zero value for pending bytes necessarily indicates the folio is in the bio. Signed-off-by: Joanne Koong Suggested-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 123 ++++++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 21 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 09e65771a947..f8b17ce549eb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -362,7 +362,6 @@ static void iomap_read_end_io(struct bio *bio) struct iomap_read_folio_ctx { struct folio *cur_folio; - bool cur_folio_in_bio; void *read_ctx; struct readahead_control *rac; }; @@ -380,19 +379,11 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, { struct folio *folio = ctx->cur_folio; const struct iomap *iomap = &iter->iomap; - struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); loff_t length = iomap_length(iter); sector_t sector; struct bio *bio = ctx->read_ctx; - ctx->cur_folio_in_bio = true; - if (ifs) { - spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += plen; - spin_unlock_irq(&ifs->state_lock); - } - sector = iomap_sector(iomap, pos); if (!bio || bio_end_sector(bio) != sector || !bio_add_folio(bio, folio, plen, poff)) { @@ -422,8 +413,93 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, } } +static void iomap_read_init(struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) { + size_t len = folio_size(folio); + + /* + * ifs->read_bytes_pending is used to track how many bytes are + * read in asynchronously by the IO helper. We need to track + * this so that we can know when the IO helper has finished + * reading in all the necessary ranges of the folio and can end + * the read. + * + * Increase ->read_bytes_pending by the folio size to start, and + * add a +1 bias. We'll subtract the bias and any uptodate / + * zeroed ranges that did not require IO in iomap_read_end() + * after we're done processing the folio. + * + * We do this because otherwise, we would have to increment + * ifs->read_bytes_pending every time a range in the folio needs + * to be read in, which can get expensive since the spinlock + * needs to be held whenever modifying ifs->read_bytes_pending. + * + * We add the bias to ensure the read has not been ended on the + * folio when iomap_read_end() is called, even if the IO helper + * has already finished reading in the entire folio. + */ + spin_lock_irq(&ifs->state_lock); + ifs->read_bytes_pending += len + 1; + spin_unlock_irq(&ifs->state_lock); + } +} + +/* + * This ends IO if no bytes were submitted to an IO helper. 
+ * + * Otherwise, this calibrates ifs->read_bytes_pending to represent only the + * submitted bytes (see comment in iomap_read_init()). If all bytes submitted + * have already been completed by the IO helper, then this will end the read. + * Else the IO helper will end the read after all submitted ranges have been + * read. + */ +static void iomap_read_end(struct folio *folio, size_t bytes_pending) +{ + struct iomap_folio_state *ifs; + + /* + * If there are no bytes pending, this means we are responsible for + * unlocking the folio here, since no IO helper has taken ownership of + * it. + */ + if (!bytes_pending) { + folio_unlock(folio); + return; + } + + ifs = folio->private; + if (ifs) { + bool end_read, uptodate; + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. + * The +1 accounts for the bias we added in iomap_read_init(). + */ + size_t bytes_accounted = folio_size(folio) + 1 - + bytes_pending; + + spin_lock_irq(&ifs->state_lock); + ifs->read_bytes_pending -= bytes_accounted; + /* + * If !ifs->read_bytes_pending, this means all pending reads + * by the IO helper have already completed, which means we need + * to end the folio read here. If ifs->read_bytes_pending != 0, + * the IO helper will end the folio read. + */ + end_read = !ifs->read_bytes_pending; + if (end_read) + uptodate = ifs_is_fully_uptodate(folio, ifs); + spin_unlock_irq(&ifs->state_lock); + if (end_read) + folio_end_read(folio, uptodate); + } +} + static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx) + struct iomap_read_folio_ctx *ctx, size_t *bytes_pending) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; @@ -460,6 +536,9 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, folio_zero_range(folio, poff, plen); iomap_set_range_uptodate(folio, poff, plen); } else { + if (!*bytes_pending) + iomap_read_init(folio); + *bytes_pending += plen; iomap_bio_read_folio_range(iter, ctx, pos, plen); } @@ -482,17 +561,18 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) struct iomap_read_folio_ctx ctx = { .cur_folio = folio, }; + size_t bytes_pending = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx, + &bytes_pending); iomap_bio_submit_read(&ctx); - if (!ctx.cur_folio_in_bio) - folio_unlock(folio); + iomap_read_end(folio, bytes_pending); /* * Just like mpage_readahead and block_read_full_folio, we always @@ -504,24 +584,23 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_read_folio); static int iomap_readahead_iter(struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx) + struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_pending) { int ret; while (iomap_length(iter)) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos) == 0) { - if (!ctx->cur_folio_in_bio) - folio_unlock(ctx->cur_folio); + iomap_read_end(ctx->cur_folio, *cur_bytes_pending); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); if (WARN_ON_ONCE(!ctx->cur_folio)) return -EINVAL; - ctx->cur_folio_in_bio = false; + *cur_bytes_pending = 0; } - ret = iomap_read_folio_iter(iter, ctx); + ret = iomap_read_folio_iter(iter, ctx, cur_bytes_pending); if (ret) return ret; } @@ -554,16 +633,18 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) struct 
iomap_read_folio_ctx ctx = { .rac = rac, }; + size_t cur_bytes_pending; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx, + &cur_bytes_pending); iomap_bio_submit_read(&ctx); - if (ctx.cur_folio && !ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); + if (ctx.cur_folio) + iomap_read_end(ctx.cur_folio, cur_bytes_pending); } EXPORT_SYMBOL_GPL(iomap_readahead); From fb7a10ac4704fe2df12fcc560c0b49a69e016af0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:03 -0700 Subject: [PATCH 08/14] iomap: set accurate iter->pos when reading folio ranges Advance iter to the correct position before calling an IO helper to read in a folio range. This allows the helper to reliably use iter->pos to determine the starting offset for reading. This will simplify the interface for reading in folio ranges when iomap read/readahead supports caller-provided callbacks. Signed-off-by: Joanne Koong Suggested-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f8b17ce549eb..12b23ff97000 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -375,10 +375,11 @@ static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) } static void iomap_bio_read_folio_range(const struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx, loff_t pos, size_t plen) + struct iomap_read_folio_ctx *ctx, size_t plen) { struct folio *folio = ctx->cur_folio; const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; size_t poff = offset_in_folio(folio, pos); loff_t length = iomap_length(iter); sector_t sector; @@ -506,7 +507,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; size_t poff, plen; - loff_t count; + loff_t pos_diff; int ret; if (iomap->type == IOMAP_INLINE) { @@ -524,12 +525,16 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); - count = pos - iter->pos + plen; - if (WARN_ON_ONCE(count > length)) + pos_diff = pos - iter->pos; + if (WARN_ON_ONCE(pos_diff + plen > length)) return -EIO; + ret = iomap_iter_advance(iter, pos_diff); + if (ret) + return ret; + if (plen == 0) - return iomap_iter_advance(iter, count); + return 0; /* zero post-eof blocks as the page may be mapped */ if (iomap_block_needs_zeroing(iter, pos)) { @@ -539,13 +544,13 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, if (!*bytes_pending) iomap_read_init(folio); *bytes_pending += plen; - iomap_bio_read_folio_range(iter, ctx, pos, plen); + iomap_bio_read_folio_range(iter, ctx, plen); } - ret = iomap_iter_advance(iter, count); + ret = iomap_iter_advance(iter, plen); if (ret) return ret; - length -= count; + length -= pos_diff + plen; pos = iter->pos; } return 0; From b2f35ac4146d32d4424aaa941bbc681f12c1b9e6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:04 -0700 Subject: [PATCH 09/14] iomap: add caller-provided callbacks for read and readahead Add caller-provided callbacks for read and readahead so that it can be used generically, especially by filesystems that are not block-based. 
In particular, this:

* Modifies the read and readahead interface to take in a struct
  iomap_read_folio_ctx that is publicly defined as:

      struct iomap_read_folio_ctx {
              const struct iomap_read_ops *ops;
              struct folio *cur_folio;
              struct readahead_control *rac;
              void *read_ctx;
      };

  where struct iomap_read_ops is defined as:

      struct iomap_read_ops {
              int (*read_folio_range)(const struct iomap_iter *iter,
                              struct iomap_read_folio_ctx *ctx, size_t len);
              void (*submit_read)(struct iomap_read_folio_ctx *ctx);
      };

  read_folio_range() reads in the folio range and must be provided by
  the caller. submit_read() is optional and is used for submitting any
  pending read requests.

* Modifies existing filesystems that use iomap for read and readahead
  to use the new API, through the new statically inlined helpers
  iomap_bio_read_folio() and iomap_bio_readahead(). There is no change
  in functionality for those filesystems.

Signed-off-by: Joanne Koong
Signed-off-by: Christian Brauner
---
 .../filesystems/iomap/operations.rst | 44 +++++++++++++
 block/fops.c                         |  5 +-
 fs/erofs/data.c                      |  5 +-
 fs/gfs2/aops.c                       |  6 +-
 fs/iomap/buffered-io.c               | 55 ++++++++--------
 fs/xfs/xfs_aops.c                    |  5 +-
 fs/zonefs/file.c                     |  5 +-
 include/linux/iomap.h                | 63 ++++++++++++++++++-
 8 files changed, 149 insertions(+), 39 deletions(-)

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 387fd9cc72ca..c88205132039 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -135,6 +135,28 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
 
 * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
 
+``struct iomap_read_ops``
+--------------------------
+
+.. code-block:: c
+
+ struct iomap_read_ops {
+     int (*read_folio_range)(const struct iomap_iter *iter,
+             struct iomap_read_folio_ctx *ctx, size_t len);
+     void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+ };
+
+iomap calls these functions:
+
+  - ``read_folio_range``: Called to read in the range. This must be
+    provided by the caller. The caller is responsible for calling
+    iomap_finish_folio_read() after reading in the folio range. This
+    should be done even if an error is encountered during the read.
+    This returns 0 on success or a negative error on failure.
+
+  - ``submit_read``: Submit any pending read requests. This function is
+    optional.
+
 Internal per-Folio State
 ------------------------
 
@@ -182,6 +204,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
 The pagecache takes whatever locks it needs before calling the
 filesystem.
 
+Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
+iomap_read_folio_ctx``:
+
+.. code-block:: c
+
+ struct iomap_read_folio_ctx {
+     const struct iomap_read_ops *ops;
+     struct folio *cur_folio;
+     struct readahead_control *rac;
+     void *read_ctx;
+ };
+
+``iomap_readahead`` must set:
+ * ``ops->read_folio_range()`` and ``rac``
+
+``iomap_read_folio`` must set:
+ * ``ops->read_folio_range()`` and ``cur_folio``
+
+``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
+pass in any custom data the caller needs accessible in the ops callbacks for
+fulfilling reads.
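
For illustration, here is a minimal sketch of how a filesystem without
CONFIG_BLOCK might wire these ops into its ->read_folio address_space
operation. The myfs_* names and the synchronous myfs_sync_read_range()
helper are hypothetical, not part of this series; fuse's conversion
later in the series is the real in-tree example.

	/*
	 * Read one folio range and report completion to iomap.
	 * iomap_finish_folio_read() must be called even on error so
	 * that the pending-read accounting stays balanced.
	 */
	static int myfs_read_folio_range(const struct iomap_iter *iter,
			struct iomap_read_folio_ctx *ctx, size_t len)
	{
		struct folio *folio = ctx->cur_folio;
		size_t off = offset_in_folio(folio, iter->pos);
		int err;

		/* hypothetical helper filling folio bytes [off, off + len) */
		err = myfs_sync_read_range(folio, iter->pos, len);
		iomap_finish_folio_read(folio, off, len, err);
		return err;
	}

	static const struct iomap_read_ops myfs_read_ops = {
		.read_folio_range	= myfs_read_folio_range,
		/* .submit_read is optional; unneeded for synchronous reads */
	};

	static int myfs_read_folio(struct file *file, struct folio *folio)
	{
		struct iomap_read_folio_ctx ctx = {
			.ops		= &myfs_read_ops,
			.cur_folio	= folio,
		};

		/* myfs_iomap_ops: this filesystem's (hypothetical) iomap_ops */
		iomap_read_folio(&myfs_iomap_ops, &ctx);
		return 0;
	}

Once every range completion has gone through iomap_finish_folio_read(),
iomap ends the folio read (unlocking the folio, and marking it uptodate
on success), whether the ranges completed synchronously as above or from
an asynchronous completion path.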
+ Buffered Writes --------------- diff --git a/block/fops.c b/block/fops.c index 5e3db9fead77..4dad9c2d5796 100644 --- a/block/fops.c +++ b/block/fops.c @@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = { #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &blkdev_iomap_ops); + iomap_bio_read_folio(folio, &blkdev_iomap_ops); + return 0; } static void blkdev_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &blkdev_iomap_ops); + iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8ca29962a3dd..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio) { trace_erofs_read_folio(folio, true); - return iomap_read_folio(folio, &erofs_iomap_ops); + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) @@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac) trace_erofs_readahead(rac->mapping->host, readahead_index(rac), readahead_count(rac), true); - return iomap_readahead(rac, &erofs_iomap_ops); + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..38d4f343187a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -424,11 +424,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int error = 0; if (!gfs2_is_jdata(ip) || (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { - error = iomap_read_folio(folio, &gfs2_iomap_ops); + iomap_bio_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_read_folio(ip, folio); } else { @@ -503,7 +503,7 @@ static void gfs2_readahead(struct readahead_control *rac) else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); else - iomap_readahead(rac, &gfs2_iomap_ops); + iomap_bio_readahead(rac, &gfs2_iomap_ops); } /** diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 12b23ff97000..d7100a5f953a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -328,8 +328,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, } #ifdef CONFIG_BLOCK -static void iomap_finish_folio_read(struct folio *folio, size_t off, - size_t len, int error) +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; @@ -349,6 +349,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (finished) folio_end_read(folio, uptodate); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_read); static void iomap_read_end_io(struct bio *bio) { @@ -360,12 +361,6 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } -struct iomap_read_folio_ctx { - struct folio *cur_folio; - void *read_ctx; - struct readahead_control *rac; -}; - static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; @@ -374,7 +369,7 @@ static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) submit_bio(bio); } -static void iomap_bio_read_folio_range(const struct 
iomap_iter *iter, +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { struct folio *folio = ctx->cur_folio; @@ -412,8 +407,15 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, bio_add_folio_nofail(bio, folio, plen, poff); ctx->read_ctx = bio; } + return 0; } +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + static void iomap_read_init(struct folio *folio) { struct iomap_folio_state *ifs = folio->private; @@ -544,7 +546,9 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, if (!*bytes_pending) iomap_read_init(folio); *bytes_pending += plen; - iomap_bio_read_folio_range(iter, ctx, plen); + ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret) + return ret; } ret = iomap_iter_advance(iter, plen); @@ -556,26 +560,25 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, return 0; } -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct folio *folio = ctx->cur_folio; struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; - struct iomap_read_folio_ctx ctx = { - .cur_folio = folio, - }; size_t bytes_pending = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx, - &bytes_pending); + iter.status = iomap_read_folio_iter(&iter, ctx, &bytes_pending); - iomap_bio_submit_read(&ctx); + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); iomap_read_end(folio, bytes_pending); @@ -615,8 +618,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, /** * iomap_readahead - Attempt to read pages from a file. - * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. + * @ctx: The ctx used for issuing readahead. * * This function is for filesystems to call to implement their readahead * address_space operation. @@ -628,28 +631,28 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. 
*/ -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct readahead_control *rac = ctx->rac; struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - struct iomap_read_folio_ctx ctx = { - .rac = rac, - }; size_t cur_bytes_pending; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx, + iter.status = iomap_readahead_iter(&iter, ctx, &cur_bytes_pending); - iomap_bio_submit_read(&ctx); + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - if (ctx.cur_folio) - iomap_read_end(ctx.cur_folio, cur_bytes_pending); + if (ctx->cur_folio) + iomap_read_end(ctx->cur_folio, cur_bytes_pending); } EXPORT_SYMBOL_GPL(iomap_readahead); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..0c2ed00733f2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 90e2ad8ee5f4..c1e5e30e90a0 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = { static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_read_iomap_ops); + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); + return 0; } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_read_iomap_ops); + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); } /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4469b2318b08..37435b912755 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -16,6 +16,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -337,8 +338,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -465,6 +468,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len); void 
iomap_finish_folio_write(struct inode *inode, struct folio *folio, @@ -473,6 +478,34 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * The caller is responsible for calling iomap_finish_folio_read() after + * reading in the folio range. This should be done even if an error is + * encountered during the read. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -538,4 +571,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ From c2b1adc4624573f9d5a35ccb8ebe31eb1bdb0d22 Mon Sep 17 00:00:00 2001 From: "Christoph Hellwig [1]" Date: Thu, 25 Sep 2025 17:26:05 -0700 Subject: [PATCH 10/14] iomap: move buffered io bio logic into new file Move bio logic in the buffered io code into its own file and remove CONFIG_BLOCK gating for iomap read/readahead. [1] https://lore.kernel.org/linux-fsdevel/aMK2GuumUf93ep99@infradead.org/ Signed-off-by: Christoph Hellwig Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/Makefile | 3 +- fs/iomap/bio.c | 88 ++++++++++++++++++++++++++++++++++++++++++ fs/iomap/buffered-io.c | 88 +----------------------------------------- fs/iomap/internal.h | 12 ++++++ 4 files changed, 103 insertions(+), 88 deletions(-) create mode 100644 fs/iomap/bio.c diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index f7e1c8534c46..a572b8808524 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -14,5 +14,6 @@ iomap-y += trace.o \ iomap-$(CONFIG_BLOCK) += direct-io.o \ ioend.o \ fiemap.o \ - seek.o + seek.o \ + bio.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c new file mode 100644 index 000000000000..fc045f2e4c45 --- /dev/null +++ b/fs/iomap/bio.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2016-2023 Christoph Hellwig. 
+ */ +#include +#include +#include "internal.h" +#include "trace.h" + +static void iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); + bio_put(bio); +} + +static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +{ + struct bio *bio = ctx->read_ctx; + + if (bio) + submit_bio(bio); +} + +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen) +{ + struct folio *folio = ctx->cur_folio; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + size_t poff = offset_in_folio(folio, pos); + loff_t length = iomap_length(iter); + sector_t sector; + struct bio *bio = ctx->read_ctx; + + sector = iomap_sector(iomap, pos); + if (!bio || bio_end_sector(bio) != sector || + !bio_add_folio(bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); + + if (bio) + submit_bio(bio); + + if (ctx->rac) /* same as readahead_gfp_mask */ + gfp |= __GFP_NORETRY | __GFP_NOWARN; + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_read_folio does. + */ + if (!bio) + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + if (ctx->rac) + bio->bi_opf |= REQ_RAHEAD; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = iomap_read_end_io; + bio_add_folio_nofail(bio, folio, plen, poff); + ctx->read_ctx = bio; + } + return 0; +} + +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct bio_vec bvec; + struct bio bio; + + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); + return submit_bio_wait(&bio); +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d7100a5f953a..0d88a4f3c791 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -8,6 +8,7 @@ #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -327,7 +328,6 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, return 0; } -#ifdef CONFIG_BLOCK void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { @@ -351,71 +351,6 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, } EXPORT_SYMBOL_GPL(iomap_finish_folio_read); -static void iomap_read_end_io(struct bio *bio) -{ - int error = blk_status_to_errno(bio->bi_status); - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) - iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); - bio_put(bio); -} - -static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) -{ - struct bio *bio = ctx->read_ctx; - - if (bio) - submit_bio(bio); -} - -static int iomap_bio_read_folio_range(const struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx, size_t plen) -{ - struct folio *folio = ctx->cur_folio; - const 
struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - size_t poff = offset_in_folio(folio, pos); - loff_t length = iomap_length(iter); - sector_t sector; - struct bio *bio = ctx->read_ctx; - - sector = iomap_sector(iomap, pos); - if (!bio || bio_end_sector(bio) != sector || - !bio_add_folio(bio, folio, plen, poff)) { - gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - gfp_t orig_gfp = gfp; - unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - - iomap_bio_submit_read(ctx); - - if (ctx->rac) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; - bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, - gfp); - /* - * If the bio_alloc fails, try it again for a single page to - * avoid having to deal with partial page reads. This emulates - * what do_mpage_read_folio does. - */ - if (!bio) - bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); - if (ctx->rac) - bio->bi_opf |= REQ_RAHEAD; - bio->bi_iter.bi_sector = sector; - bio->bi_end_io = iomap_read_end_io; - bio_add_folio_nofail(bio, folio, plen, poff); - ctx->read_ctx = bio; - } - return 0; -} - -const struct iomap_read_ops iomap_bio_read_ops = { - .read_folio_range = iomap_bio_read_folio_range, - .submit_read = iomap_bio_submit_read, -}; -EXPORT_SYMBOL_GPL(iomap_bio_read_ops); - static void iomap_read_init(struct folio *folio) { struct iomap_folio_state *ifs = folio->private; @@ -656,27 +591,6 @@ void iomap_readahead(const struct iomap_ops *ops, } EXPORT_SYMBOL_GPL(iomap_readahead); -static int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct bio_vec bvec; - struct bio bio; - - bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); - bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); -} -#else -static int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - WARN_ON_ONCE(1); - return -EIO; -} -#endif /* CONFIG_BLOCK */ - /* * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index d05cb3aed96e..3a4e4aad2bd1 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -6,4 +6,16 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); +#ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len); +#else +static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + WARN_ON_ONCE(1); + return -EIO; +} +#endif /* CONFIG_BLOCK */ + #endif /* _IOMAP_INTERNAL_H */ From d4e88bb08e5f7e6eb4e9c3685894b9b57bfdfb08 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:06 -0700 Subject: [PATCH 11/14] iomap: make iomap_read_folio() a void return No errors are propagated in iomap_read_folio(). Change iomap_read_folio() to a void return to make this clearer to callers. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. 
Wong
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 9 +--------
 include/linux/iomap.h  | 2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0d88a4f3c791..1dbcac17fefd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -495,7 +495,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 	return 0;
 }
 
-int iomap_read_folio(const struct iomap_ops *ops,
+void iomap_read_folio(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx)
 {
 	struct folio *folio = ctx->cur_folio;
@@ -516,13 +516,6 @@ int iomap_read_folio(const struct iomap_ops *ops,
 		ctx->ops->submit_read(ctx);
 
 	iomap_read_end(folio, bytes_pending);
-
-	/*
-	 * Just like mpage_readahead and block_read_full_folio, we always
-	 * return 0 and just set the folio error flag on errors. This
-	 * should be cleaned up throughout the stack eventually.
-	 */
-	return 0;
 }
 EXPORT_SYMBOL_GPL(iomap_read_folio);
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 37435b912755..6d864b446b6e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -338,7 +338,7 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(const struct iomap_ops *ops,
+void iomap_read_folio(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);
 void iomap_readahead(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);

From 03e9618e82402e2aa9cc747793343768ced61a55 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 25 Sep 2025 17:26:07 -0700
Subject: [PATCH 12/14] fuse: use iomap for read_folio

Read folio data into the page cache using iomap. This gives us granular
uptodate tracking for large folios, which optimizes how much data needs
to be read in. If some portions of the folio are already uptodate (eg
through a prior write), we only need to read in the non-uptodate
portions.

Signed-off-by: Joanne Koong
Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner --- fs/fuse/file.c | 80 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f1ef77a0be05..0e4f8734fe9d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -834,23 +834,69 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, return 0; } +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + +struct fuse_fill_read_data { + struct file *file; +}; + +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; + + /* + * for non-readahead read requests, do reads synchronously since + * it's not guaranteed that the server can handle out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + iomap_finish_folio_read(folio, off, len, ret); + return ret; +} + +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, +}; + static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; - int err; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - err = -EIO; - if (fuse_is_bad(inode)) - goto out; + }; - err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); - if (!err) - folio_mark_uptodate(folio); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; + } + iomap_read_folio(&fuse_iomap_ops, &ctx); fuse_invalidate_atime(inode); - out: - folio_unlock(folio); - return err; + return 0; } static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, @@ -1397,20 +1443,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = { .read_folio_range = fuse_iomap_read_folio_range, }; -static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - iomap->type = IOMAP_MAPPED; - iomap->length = length; - iomap->offset = offset; - return 0; -} - -static const struct iomap_ops fuse_iomap_ops = { - .iomap_begin = fuse_iomap_begin, -}; - static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; From 4ea907108a5ca800e91708f0dbbe01bc95f8f5af Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:08 -0700 Subject: [PATCH 13/14] fuse: use iomap for readahead Do readahead in fuse using iomap. This gives us granular uptodate tracking for large folios, which optimizes how much data needs to be read in. If some portions of the folio are already uptodate (eg through a prior write), we only need to read in the non-uptodate portions. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/fuse/file.c | 220 ++++++++++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 96 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0e4f8734fe9d..8275b6681b9b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -850,8 +850,65 @@ static const struct iomap_ops fuse_iomap_ops = { struct fuse_fill_read_data { struct file *file; + + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; }; +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args *ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + return -EAGAIN; + + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; + + return 0; +} + static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len) @@ -863,17 +920,39 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, struct file *file = data->file; int ret; - /* - * for non-readahead read requests, do reads synchronously since - * it's not guaranteed that the server can handle out-of-order reads - */ - ret = fuse_do_readfolio(file, folio, off, len); - iomap_finish_folio_read(folio, off, len, ret); + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + /* + * If fuse_handle_readahead was successful, fuse_readpages_end + * will do the iomap_finish_folio_read, else we need to call it + * here + */ + if (ret) + iomap_finish_folio_read(folio, off, len, ret); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + iomap_finish_folio_read(folio, off, len, ret); + } return ret; } +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} + static const struct iomap_read_ops fuse_iomap_read_ops = { .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, }; static int fuse_read_folio(struct file *file, struct folio *folio) @@ -933,7 +1012,8 @@ static void 
 	fuse_invalidate_atime(inode);
 
 	for (i = 0; i < ap->num_folios; i++) {
-		folio_end_read(ap->folios[i], !err);
+		iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
+					ap->descs[i].length, err);
 		folio_put(ap->folios[i]);
 	}
 	if (ia->ff)
@@ -943,7 +1023,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
 }
 
 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
-				unsigned int count)
+				unsigned int count, bool async)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_mount *fm = ff->fm;
@@ -965,7 +1045,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
 	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 
-	if (fm->fc->async_read) {
+	if (async) {
 		ia->ff = fuse_file_get(ff);
 		ap->args.end = fuse_readpages_end;
 		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -982,81 +1062,20 @@ static void fuse_readahead(struct readahead_control *rac)
 {
 	struct inode *inode = rac->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	unsigned int max_pages, nr_pages;
-	struct folio *folio = NULL;
+	struct fuse_fill_read_data data = {
+		.file = rac->file,
+		.fc = fc,
+	};
+	struct iomap_read_folio_ctx ctx = {
+		.ops = &fuse_iomap_read_ops,
+		.rac = rac,
+		.read_ctx = &data
+	};
 
 	if (fuse_is_bad(inode))
 		return;
 
-	max_pages = min_t(unsigned int, fc->max_pages,
-			  fc->max_read / PAGE_SIZE);
-
-	/*
-	 * This is only accurate the first time through, since
-	 * readahead_folio() doesn't update readahead_count() from the
-	 * previous folio until the next call. Grab nr_pages here so we know
-	 * how many pages we're going to have to process. This means that we
-	 * will exit here with readahead_count() == folio_nr_pages(last_folio),
-	 * but we will have consumed all of the folios, and read_pages() will
-	 * call readahead_folio() again which will clean up the rac.
-	 */
-	nr_pages = readahead_count(rac);
-
-	while (nr_pages) {
-		struct fuse_io_args *ia;
-		struct fuse_args_pages *ap;
-		unsigned cur_pages = min(max_pages, nr_pages);
-		unsigned int pages = 0;
-
-		if (fc->num_background >= fc->congestion_threshold &&
-		    rac->ra->async_size >= readahead_count(rac))
-			/*
-			 * Congested and only async pages left, so skip the
-			 * rest.
-			 */
-			break;
-
-		ia = fuse_io_alloc(NULL, cur_pages);
-		if (!ia)
-			break;
-		ap = &ia->ap;
-
-		while (pages < cur_pages) {
-			unsigned int folio_pages;
-
-			/*
-			 * This returns a folio with a ref held on it.
-			 * The ref needs to be held until the request is
-			 * completed, since the splice case (see
-			 * fuse_try_move_page()) drops the ref after it's
-			 * replaced in the page cache.
-			 */
-			if (!folio)
-				folio = __readahead_folio(rac);
-
-			folio_pages = folio_nr_pages(folio);
-			if (folio_pages > cur_pages - pages) {
-				/*
-				 * Large folios belonging to fuse will never
-				 * have more pages than max_pages.
-				 */
-				WARN_ON(!pages);
-				break;
-			}
-
-			ap->folios[ap->num_folios] = folio;
-			ap->descs[ap->num_folios].length = folio_size(folio);
-			ap->num_folios++;
-			pages += folio_pages;
-			folio = NULL;
-		}
-		fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
-		nr_pages -= pages;
-	}
-	if (folio) {
-		folio_end_read(folio, false);
-		folio_put(folio);
-	}
+	iomap_readahead(&fuse_iomap_ops, &ctx);
 }
 
 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -2079,7 +2098,7 @@ struct fuse_fill_wb_data {
 	struct fuse_file *ff;
 	unsigned int max_folios;
 	/*
-	 * nr_bytes won't overflow since fuse_writepage_need_send() caps
+	 * nr_bytes won't overflow since fuse_folios_need_send() caps
 	 * wb requests to never exceed fc->max_pages (which has an upper bound
 	 * of U16_MAX).
 	 */
@@ -2124,14 +2143,15 @@ static void fuse_writepages_send(struct inode *inode,
 	spin_unlock(&fi->lock);
 }
 
-static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
-				     unsigned len, struct fuse_args_pages *ap,
-				     struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+				  unsigned len, struct fuse_args_pages *ap,
+				  unsigned cur_bytes, bool write)
 {
 	struct folio *prev_folio;
 	struct fuse_folio_desc prev_desc;
-	unsigned bytes = data->nr_bytes + len;
+	unsigned bytes = cur_bytes + len;
 	loff_t prev_pos;
+	size_t max_bytes = write ? fc->max_write : fc->max_read;
 
 	WARN_ON(!ap->num_folios);
 
@@ -2139,8 +2159,7 @@
 	if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
 		return true;
 
-	/* Reached max write bytes */
-	if (bytes > fc->max_write)
+	if (bytes > max_bytes)
 		return true;
 
 	/* Discontinuity */
@@ -2150,11 +2169,6 @@
 	if (prev_pos != pos)
 		return true;
 
-	/* Need to grow the pages array? If so, did the expansion fail? */
-	if (ap->num_folios == data->max_folios &&
-	    !fuse_pages_realloc(data, fc->max_pages))
-		return true;
-
 	return false;
 }
 
@@ -2178,10 +2192,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
 		return -EIO;
 	}
 
-	if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
-		fuse_writepages_send(inode, data);
-		data->wpa = NULL;
-		data->nr_bytes = 0;
+	if (wpa) {
+		bool send = fuse_folios_need_send(fc, pos, len, ap,
+						  data->nr_bytes, true);
+
+		if (!send) {
+			/*
+			 * Need to grow the pages array? If so, did the
+			 * expansion fail?
+			 */
+			send = (ap->num_folios == data->max_folios) &&
+				!fuse_pages_realloc(data, fc->max_pages);
+		}
+
+		if (send) {
+			fuse_writepages_send(inode, data);
+			data->wpa = NULL;
+			data->nr_bytes = 0;
+		}
 	}
 
 	if (data->wpa == NULL) {
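fuse_handle_readahead() above accumulates contiguous folio ranges into one FUSE read request and flushes the batch when fuse_folios_need_send() reports a limit has been hit: the page-count cap, the byte cap (max_read for reads, max_write for writes), or a discontinuity in file position. The flush decision can be modeled in isolation; in this sketch the constants and helper names are assumptions invented for the example, not values taken from the patch:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096U
#define MAX_PAGES	4U			/* assumed fc->max_pages */
#define MAX_READ	(4U * PAGE_SIZE)	/* assumed fc->max_read */

struct batch {
	unsigned long long next_pos;	/* end of the last queued range */
	unsigned int nr_bytes;
	unsigned int nr_ranges;
};

/* mirrors the three checks fuse_folios_need_send() applies to reads */
static bool need_send(const struct batch *b, unsigned long long pos,
		      unsigned int len)
{
	unsigned int bytes = b->nr_bytes + len;

	if (!b->nr_ranges)
		return false;			/* nothing queued yet */
	if ((bytes + PAGE_SIZE - 1) / PAGE_SIZE > MAX_PAGES)
		return true;			/* too many pages */
	if (bytes > MAX_READ)
		return true;			/* past the byte cap */
	return b->next_pos != pos;		/* discontiguous */
}

static void send(struct batch *b)
{
	printf("send %u range(s), %u bytes\n", b->nr_ranges, b->nr_bytes);
	b->nr_bytes = 0;
	b->nr_ranges = 0;
}

static void queue_range(struct batch *b, unsigned long long pos,
			unsigned int len)
{
	if (need_send(b, pos, len))
		send(b);
	b->next_pos = pos + len;
	b->nr_bytes += len;
	b->nr_ranges++;
}

int main(void)
{
	struct batch b = { 0 };

	queue_range(&b, 0, PAGE_SIZE);
	queue_range(&b, PAGE_SIZE, PAGE_SIZE);
	queue_range(&b, 8 * PAGE_SIZE, PAGE_SIZE);	/* gap: flushes */
	if (b.nr_ranges)
		send(&b);	/* final flush, like ->submit_read() */
	return 0;
}

The trailing flush is why the patch adds a ->submit_read hook: the last partially filled batch only becomes visible once the iterator has run out of folios.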
From 93570c652b80cd632dc49b590a35d3f0e268893b Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Thu, 25 Sep 2025 17:26:09 -0700
Subject: [PATCH 14/14] fuse: remove fc->blkbits workaround for partial writes

Now that fuse is integrated with iomap for read/readahead, we can remove
the workaround added in commit bd24d2108e9c ("fuse: fix fuseblk
i_blkbits for iomap partial writes"), which was needed to avoid a race
condition where an iomap partial write could be overwritten by a read if
blocksize < PAGE_SIZE. Iomap read/readahead tracks uptodate state at
block granularity, so a read can no longer overwrite partially written
blocks and the workaround is unnecessary.

Signed-off-by: Joanne Koong
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong
Signed-off-by: Christian Brauner
---
 fs/fuse/dir.c    |  2 +-
 fs/fuse/fuse_i.h |  8 --------
 fs/fuse/inode.c  | 13 +------------
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ecaec0fea3a1..316922d5dd13 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
 		if (attr->blksize != 0)
 			blkbits = ilog2(attr->blksize);
 		else
-			blkbits = fc->blkbits;
+			blkbits = inode->i_sb->s_blocksize_bits;
 
 		stat->blksize = 1 << blkbits;
 	}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c2f2a48156d6..f616c1991fed 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -981,14 +981,6 @@ struct fuse_conn {
 		/* Request timeout (in jiffies). 0 = no timeout */
 		unsigned int req_timeout;
 	} timeout;
-
-	/*
-	 * This is a workaround until fuse uses iomap for reads.
-	 * For fuseblk servers, this represents the blocksize passed in at
-	 * mount time and for regular fuse servers, this is equivalent to
-	 * inode->i_blkbits.
-	 */
-	u8 blkbits;
 };
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d1babf56f254..8ba29ca23244 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	if (attr->blksize)
 		fi->cached_i_blkbits = ilog2(attr->blksize);
 	else
-		fi->cached_i_blkbits = fc->blkbits;
+		fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
 
 	/*
 	 * Don't set the sticky bit in i_mode, unless we want the VFS
@@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 		err = -EINVAL;
 		if (!sb_set_blocksize(sb, ctx->blksize))
 			goto err;
-		/*
-		 * This is a workaround until fuse hooks into iomap for reads.
-		 * Use PAGE_SIZE for the blocksize else if the writeback cache
-		 * is enabled, buffered writes go through iomap and a read may
-		 * overwrite partially written data if blocksize < PAGE_SIZE
-		 */
-		fc->blkbits = sb->s_blocksize_bits;
-		if (ctx->blksize != PAGE_SIZE &&
-		    !sb_set_blocksize(sb, PAGE_SIZE))
-			goto err;
 #endif
 		fc->sync_fs = 1;
 	} else {
 		sb->s_blocksize = PAGE_SIZE;
 		sb->s_blocksize_bits = PAGE_SHIFT;
-		fc->blkbits = sb->s_blocksize_bits;
 	}
 	sb->s_subtype = ctx->subtype;
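The invariant that makes this removal safe is small enough to show directly: once uptodate state is tracked per block, a read fills only blocks that no prior write marked uptodate, so a partial write can never be clobbered by a racing read-in. Below is an illustrative userspace model of that invariant, with a made-up four-block folio; it is not kernel code:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_BLOCKS 4

struct folio_model {
	char data[NR_BLOCKS];		/* one byte stands in for one block */
	bool uptodate[NR_BLOCKS];	/* per-block state, like iomap's ifs */
};

/* a buffered write marks the blocks it fills uptodate */
static void partial_write(struct folio_model *f, int block, char c)
{
	f->data[block] = c;
	f->uptodate[block] = true;
}

/* a read fills only blocks that are not yet uptodate */
static void read_folio(struct folio_model *f)
{
	for (int i = 0; i < NR_BLOCKS; i++) {
		if (f->uptodate[i])
			continue;	/* never overwrites written data */
		f->data[i] = 'd';	/* "data from the server" */
		f->uptodate[i] = true;
	}
}

int main(void)
{
	struct folio_model f;

	memset(&f, 0, sizeof(f));
	partial_write(&f, 1, 'W');	/* dirty block before the read */
	read_folio(&f);
	for (int i = 0; i < NR_BLOCKS; i++)
		putchar(f.data[i]);	/* prints dWdd: the write survives */
	putchar('\n');
	return 0;
}

With only folio-granular tracking, the read would have had to fill the whole folio and would have replaced block 1; that is exactly the race the old PAGE_SIZE-blocksize workaround papered over.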