From 7ee19a59a75e3d5b9ec00499b86af8e2a46fbe86 Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris@bur.io>
Date: Mon, 1 Dec 2025 12:47:14 -0800
Subject: [PATCH 1/9] btrfs: fix qgroup_snapshot_quick_inherit() squota bug

qgroup_snapshot_quick_inherit() detects conditions where the snapshot
destination would land in the same parent qgroup as the snapshot source
subvolume. In this case we can avoid costly qgroup calculations and just
add the nodesize of the new snapshot to the parent.

However, in the case of squotas this is actually a double count, and
also an undercount for deeper qgroup nestings.

The following annotated script shows the issue:

  btrfs quota enable --simple "$mnt"

  # Create 2-level qgroup hierarchy
  btrfs qgroup create 2/100 "$mnt"  # Q2 (level 2)
  btrfs qgroup create 1/100 "$mnt"  # Q1 (level 1)
  btrfs qgroup assign 1/100 2/100 "$mnt"

  # Create base subvolume
  btrfs subvolume create "$mnt/base" >/dev/null
  base_id=$(btrfs subvolume show "$mnt/base" | grep 'Subvolume ID:' | awk '{print $3}')

  # Create intermediate snapshot and add to Q1
  btrfs subvolume snapshot "$mnt/base" "$mnt/intermediate" >/dev/null
  inter_id=$(btrfs subvolume show "$mnt/intermediate" | grep 'Subvolume ID:' | awk '{print $3}')
  btrfs qgroup assign "0/$inter_id" 1/100 "$mnt"

  # Create working snapshot with --inherit (auto-adds to Q1)
  # src=intermediate (in only Q1)
  # dst=snap (inheriting only into Q1)
  # This double counts the 16k nodesize of the snapshot in Q1, and
  # undercounts it in Q2.
  btrfs subvolume snapshot -i 1/100 "$mnt/intermediate" "$mnt/snap" >/dev/null
  snap_id=$(btrfs subvolume show "$mnt/snap" | grep 'Subvolume ID:' | awk '{print $3}')

  # Fully complete snapshot creation
  sync

  # Delete working snapshot
  # Q1 and Q2 will lose the full snap usage
  btrfs subvolume delete "$mnt/snap" >/dev/null

  # Delete intermediate and remove from Q1
  # Q1 and Q2 will lose the full intermediate usage
  btrfs qgroup remove "0/$inter_id" 1/100 "$mnt"
  btrfs subvolume delete "$mnt/intermediate" >/dev/null

  # Q1 should be at 0, but still has 16k. Q2 is "correct" at 0 (for now...)

  # Trigger cleaner, wait for deletions
  mount -o remount,sync=1 "$mnt"
  btrfs subvolume sync "$mnt" "$snap_id"
  btrfs subvolume sync "$mnt" "$inter_id"

  # Remove Q1 from Q2
  # Frees 16k more from Q2, underflowing it to 16EiB
  btrfs qgroup remove 1/100 2/100 "$mnt"

  # And show the bad state:
  btrfs qgroup show -pc "$mnt"

        Qgroupid    Referenced    Exclusive Parent   Child   Path
        --------    ----------    --------- ------   -----   ----
        0/5           16.00KiB     16.00KiB -        -       <toplevel>
        0/256         16.00KiB     16.00KiB -        -       base
        1/100         16.00KiB     16.00KiB -        -       <0 member qgroups>
        2/100         16.00EiB     16.00EiB -        -       <0 member qgroups>

Fix this by simply not doing this quick inheritance with squotas.

I suspect that it is also wrong in normal qgroups to not recurse up the
qgroup tree in the quick inherit case, though other consistency checks
will likely fix it anyway.

Fixes: b20fe56cd285 ("btrfs: qgroup: allow quick inherit if snapshot is created and added to the same parent")
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/qgroup.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d9d8d9968a58..904d2a05e63a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3211,6 +3211,9 @@ static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
 	struct btrfs_qgroup_list *list;
 	int nr_parents = 0;
 
+	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_FULL)
+		return 0;
+
 	src = find_qgroup_rb(fs_info, srcid);
 	if (!src)
 		return -ENOENT;

From 68d4b3fa18d72b7f649e83012e7e08f1881f6b75 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 4 Dec 2025 14:38:23 +1030
Subject: [PATCH 2/9] btrfs: qgroup: update all parent qgroups when doing quick
 inherit

[BUG]
There is a bug that if a subvolume has multi-level parent qgroups, and
is able to do a quick inherit, only the direct parent qgroup got
updated:

  mkfs.btrfs  -f -O quota $dev
  mount $dev $mnt
  btrfs subv create $mnt/subv1
  btrfs qgroup create 1/100 $mnt
  btrfs qgroup create 2/100 $mnt
  btrfs qgroup assign 1/100 2/100 $mnt
  btrfs qgroup assign 0/256 1/100 $mnt
  btrfs qgroup show -p --sync $mnt

  Qgroupid    Referenced    Exclusive Parent     Path
  --------    ----------    --------- ------     ----
  0/5           16.00KiB     16.00KiB -          <toplevel>
  0/256         16.00KiB     16.00KiB 1/100      subv1
  1/100         16.00KiB     16.00KiB 2/100      2/100<1 member qgroup>
  2/100         16.00KiB     16.00KiB -          <0 member qgroups>

  btrfs subv snap -i 1/100 $mnt/subv1 $mnt/snap1
  btrfs qgroup show -p --sync $mnt

  Qgroupid    Referenced    Exclusive Parent     Path
  --------    ----------    --------- ------     ----
  0/5           16.00KiB     16.00KiB -          <toplevel>
  0/256         16.00KiB     16.00KiB 1/100      subv1
  0/257         16.00KiB     16.00KiB 1/100      snap1
  1/100         32.00KiB     32.00KiB 2/100      2/100<1 member qgroup>
  2/100         16.00KiB     16.00KiB -          <0 member qgroups>
  # Note that 2/100 is not updated, and qgroup numbers are inconsistent

  umount $mnt

[CAUSE]
If the snapshot source subvolume belongs to a parent qgroup, and the new
snapshot target is also added to the new same parent qgroup, we allow a
quick update without marking qgroup inconsistent.

But that quick update only update the parent qgroup, without checking if
there is any more parent qgroups.

[FIX]
Iterate through all parent qgroups during the quick inherit.

Reported-by: Boris Burkov <boris@bur.io>
Fixes: b20fe56cd285 ("btrfs: qgroup: allow quick inherit if snapshot is created and added to the same parent")
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/qgroup.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 904d2a05e63a..206587820fec 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3208,7 +3208,10 @@ static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_qgroup *src;
 	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup *qgroup;
 	struct btrfs_qgroup_list *list;
+	LIST_HEAD(qgroup_list);
+	const u32 nodesize = fs_info->nodesize;
 	int nr_parents = 0;
 
 	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_FULL)
@@ -3248,8 +3251,19 @@ static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
 	if (parent->excl != parent->rfer)
 		return 1;
 
-	parent->excl += fs_info->nodesize;
-	parent->rfer += fs_info->nodesize;
+	qgroup_iterator_add(&qgroup_list, parent);
+	list_for_each_entry(qgroup, &qgroup_list, iterator) {
+		qgroup->rfer += nodesize;
+		qgroup->rfer_cmpr += nodesize;
+		qgroup->excl += nodesize;
+		qgroup->excl_cmpr += nodesize;
+		qgroup_dirty(fs_info, qgroup);
+
+		/* Append parent qgroups to @qgroup_list. */
+		list_for_each_entry(list, &qgroup->groups, next_group)
+			qgroup_iterator_add(&qgroup_list, list->group);
+	}
+	qgroup_iterator_clean(&qgroup_list);
 	return 0;
 }
 

From f157dd661339fc6f5f2b574fe2429c43bd309534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= <mssola@mssola.com>
Date: Tue, 21 Oct 2025 11:11:25 +0200
Subject: [PATCH 3/9] btrfs: fix NULL dereference on root when tracing inode
 eviction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When evicting an inode the first thing we do is to setup tracing for it,
which implies fetching the root's id. But in btrfs_evict_inode() the
root might be NULL, as implied in the next check that we do in
btrfs_evict_inode().

Hence, we either should set the ->root_objectid to 0 in case the root is
NULL, or we move tracing setup after checking that the root is not
NULL. Setting the rootid to 0 at least gives us the possibility to trace
this call even in the case when the root is NULL, so that's the solution
taken here.

Fixes: 1abe9b8a138c ("Btrfs: add initial tracepoint support for btrfs")
Reported-by: syzbot+d991fea1b4b23b1f6bf8@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d991fea1b4b23b1f6bf8
Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/trace/events/btrfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 7e418f065b94..125bdc166bfe 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -224,7 +224,8 @@ DECLARE_EVENT_CLASS(btrfs__inode,
 		__entry->generation = BTRFS_I(inode)->generation;
 		__entry->last_trans = BTRFS_I(inode)->last_trans;
 		__entry->logged_trans = BTRFS_I(inode)->logged_trans;
-		__entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root);
+		__entry->root_objectid = BTRFS_I(inode)->root ?
+					 btrfs_root_id(BTRFS_I(inode)->root) : 0;
 	),
 
 	TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu "

From 5037b342825df7094a4906d1e2a9674baab50cb2 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Thu, 11 Dec 2025 13:30:33 +0800
Subject: [PATCH 4/9] btrfs: fix deadlock in wait_current_trans() due to
 ignored transaction type

When wait_current_trans() is called during start_transaction(), it
currently waits for a blocked transaction without considering whether
the given transaction type actually needs to wait for that particular
transaction state. The btrfs_blocked_trans_types[] array already defines
which transaction types should wait for which transaction states, but
this check was missing in wait_current_trans().

This can lead to a deadlock scenario involving two transactions and
pending ordered extents:

  1. Transaction A is in TRANS_STATE_COMMIT_DOING state

  2. A worker processing an ordered extent calls start_transaction()
     with TRANS_JOIN

  3. join_transaction() returns -EBUSY because Transaction A is in
     TRANS_STATE_COMMIT_DOING

  4. Transaction A moves to TRANS_STATE_UNBLOCKED and completes

  5. A new Transaction B is created (TRANS_STATE_RUNNING)

  6. The ordered extent from step 2 is added to Transaction B's
     pending ordered extents

  7. Transaction B immediately starts commit by another task and
     enters TRANS_STATE_COMMIT_START

  8. The worker finally reaches wait_current_trans(), sees Transaction B
     in TRANS_STATE_COMMIT_START (a blocked state), and waits
     unconditionally

  9. However, TRANS_JOIN should NOT wait for TRANS_STATE_COMMIT_START
     according to btrfs_blocked_trans_types[]

  10. Transaction B is waiting for pending ordered extents to complete

  11. Deadlock: Transaction B waits for ordered extent, ordered extent
      waits for Transaction B

This can be illustrated by the following call stacks:
  CPU0                              CPU1
                                    btrfs_finish_ordered_io()
                                      start_transaction(TRANS_JOIN)
                                        join_transaction()
                                          # -EBUSY (Transaction A is
                                          # TRANS_STATE_COMMIT_DOING)
  # Transaction A completes
  # Transaction B created
  # ordered extent added to
  # Transaction B's pending list
  btrfs_commit_transaction()
    # Transaction B enters
    # TRANS_STATE_COMMIT_START
    # waiting for pending ordered
    # extents
                                        wait_current_trans()
                                          # waits for Transaction B
                                          # (should not wait!)

Task bstore_kv_sync in btrfs_commit_transaction waiting for ordered
extents:

  __schedule+0x2e7/0x8a0
  schedule+0x64/0xe0
  btrfs_commit_transaction+0xbf7/0xda0 [btrfs]
  btrfs_sync_file+0x342/0x4d0 [btrfs]
  __x64_sys_fdatasync+0x4b/0x80
  do_syscall_64+0x33/0x40
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Task kworker in wait_current_trans waiting for transaction commit:

  Workqueue: btrfs-syno_nocow btrfs_work_helper [btrfs]
  __schedule+0x2e7/0x8a0
  schedule+0x64/0xe0
  wait_current_trans+0xb0/0x110 [btrfs]
  start_transaction+0x346/0x5b0 [btrfs]
  btrfs_finish_ordered_io.isra.0+0x49b/0x9c0 [btrfs]
  btrfs_work_helper+0xe8/0x350 [btrfs]
  process_one_work+0x1d3/0x3c0
  worker_thread+0x4d/0x3e0
  kthread+0x12d/0x150
  ret_from_fork+0x1f/0x30

Fix this by passing the transaction type to wait_current_trans() and
checking btrfs_blocked_trans_types[cur_trans->state] against the given
type before deciding to wait. This ensures that transaction types which
are allowed to join during certain blocked states will not unnecessarily
wait and cause deadlocks.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 05ee4391c83a..bd03f465e2d3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -520,13 +520,14 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
  */
-static void wait_current_trans(struct btrfs_fs_info *fs_info)
+static void wait_current_trans(struct btrfs_fs_info *fs_info, unsigned int type)
 {
 	struct btrfs_transaction *cur_trans;
 
 	spin_lock(&fs_info->trans_lock);
 	cur_trans = fs_info->running_transaction;
-	if (cur_trans && is_transaction_blocked(cur_trans)) {
+	if (cur_trans && is_transaction_blocked(cur_trans) &&
+	    (btrfs_blocked_trans_types[cur_trans->state] & type)) {
 		refcount_inc(&cur_trans->use_count);
 		spin_unlock(&fs_info->trans_lock);
 
@@ -701,12 +702,12 @@ again:
 		sb_start_intwrite(fs_info->sb);
 
 	if (may_wait_transaction(fs_info, type))
-		wait_current_trans(fs_info);
+		wait_current_trans(fs_info, type);
 
 	do {
 		ret = join_transaction(fs_info, type);
 		if (ret == -EBUSY) {
-			wait_current_trans(fs_info);
+			wait_current_trans(fs_info, type);
 			if (unlikely(type == TRANS_ATTACH ||
 				     type == TRANS_JOIN_NOSTART))
 				ret = -ENOENT;
@@ -1003,7 +1004,7 @@ out:
 
 void btrfs_throttle(struct btrfs_fs_info *fs_info)
 {
-	wait_current_trans(fs_info);
+	wait_current_trans(fs_info, TRANS_START);
 }
 
 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)

From e9e3b22ddfa760762b696ac6417c8d6edd182e49 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 11 Dec 2025 12:45:17 +1030
Subject: [PATCH 5/9] btrfs: fix beyond-EOF write handling

[BUG]
For the following write sequence with 64K page size and 4K fs block size,
it will lead to file extent items to be inserted without any data
checksum:

  mkfs.btrfs -s 4k -f $dev > /dev/null
  mount $dev $mnt
  xfs_io -f -c "pwrite 0 16k" -c "pwrite 32k 4k" -c pwrite "60k 64K" \
            -c "truncate 16k" $mnt/foobar
  umount $mnt

This will result the following 2 file extent items to be inserted (extra
trace point added to insert_ordered_extent_file_extent()):

  btrfs_finish_one_ordered: root=5 ino=257 file_off=61440 num_bytes=4096 csum_bytes=0
  btrfs_finish_one_ordered: root=5 ino=257 file_off=0 num_bytes=16384 csum_bytes=16384

Note for file offset 60K, we're inserting a file extent without any
data checksum.

Also note that range [32K, 36K) didn't reach
insert_ordered_extent_file_extent(), which is the correct behavior as
that OE is fully truncated, should not result any file extent.

Although file extent at 60K will be later dropped by btrfs_truncate(),
if the transaction got committed after file extent inserted but before
the file extent dropping, we will have a small window where we have a
file extent beyond EOF and without any data checksum.

That will cause "btrfs check" to report error.

[CAUSE]
The sequence happens like this:

- Buffered write dirtied the page cache and updated isize

  Now the inode size is 64K, with the following page cache layout:

  0             16K             32K              48K           64K
  |/////////////|               |//|                        |//|

- Truncate the inode to 16K
  Which will trigger writeback through:

  btrfs_setsize()
  |- truncate_setsize()
  |  Now the inode size is set to 16K
  |
  |- btrfs_truncate()
     |- btrfs_wait_ordered_range() for [16K, u64(-1)]
        |- btrfs_fdatawrite_range() for [16K, u64(-1)}
	   |- extent_writepage() for folio 0
	      |- writepage_delalloc()
	      |  Generated OE for [0, 16K), [32K, 36K] and [60K, 64K)
	      |
	      |- extent_writepage_io()

  Then inside extent_writepage_io(), the dirty fs blocks are handled
  differently:

  - Submit write for range [0, 16K)
    As they are still inside the inode size (16K).

  - Mark OE [32K, 36K) as truncated
    Since we only call btrfs_lookup_first_ordered_range() once, which
    returned the first OE after file offset 16K.

  - Mark all OEs inside range [16K, 64K) as finished
    Which will mark OE ranges [32K, 36K) and [60K, 64K) as finished.

    For OE [32K, 36K) since it's already marked as truncated, and its
    truncated length is 0, no file extent will be inserted.

    For OE [60K, 64K) it has never been submitted thus has no data
    checksum, and we insert the file extent as usual.
    This is the root cause of file extent at 60K to be inserted without
    any data checksum.

  - Clear dirty flags for range [16K, 64K)
    It is the function btrfs_folio_clear_dirty() which searches and clears
    any dirty blocks inside that range.

[FIX]
The bug itself was introduced a long time ago, way before subpage and
large folio support.

At that time, fs block size must match page size, thus the range
[cur, end) is just one fs block.

But later with subpage and large folios, the same range [cur, end)
can have multiple blocks and ordered extents.

Later commit 18de34daa7c6 ("btrfs: truncate ordered extent when skipping
writeback past i_size") was fixing a bug related to subpage/large
folios, but it's still utilizing the old range [cur, end), meaning only
the first OE will be marked as truncated.

The proper fix here is to make EOF handling block-by-block, not trying
to handle the whole range to @end.

By this we always locate and truncate the OE for every dirty block.

CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2d32dfc34ae3..97748d0d54d9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1728,7 +1728,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 			struct btrfs_ordered_extent *ordered;
 
 			ordered = btrfs_lookup_first_ordered_range(inode, cur,
-								   folio_end - cur);
+								   fs_info->sectorsize);
 			/*
 			 * We have just run delalloc before getting here, so
 			 * there must be an ordered extent.
@@ -1742,7 +1742,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 			btrfs_put_ordered_extent(ordered);
 
 			btrfs_mark_ordered_io_finished(inode, folio, cur,
-						       end - cur, true);
+						       fs_info->sectorsize, true);
 			/*
 			 * This range is beyond i_size, thus we don't need to
 			 * bother writing back.
@@ -1751,8 +1751,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 			 * writeback the sectors with subpage dirty bits,
 			 * causing writeback without ordered extent.
 			 */
-			btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur);
-			break;
+			btrfs_folio_clear_dirty(fs_info, folio, cur, fs_info->sectorsize);
+			continue;
 		}
 		ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
 		if (unlikely(ret < 0)) {

From 7ba0b6461bc4edb3005ea6e00cdae189bcf908a5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Dec 2025 15:06:26 +0000
Subject: [PATCH 6/9] btrfs: always detect conflicting inodes when logging
 inode refs

After rename exchanging (either with the rename exchange operation or
regular renames in multiple non-atomic steps) two inodes and at least
one of them is a directory, we can end up with a log tree that contains
only of the inodes and after a power failure that can result in an attempt
to delete the other inode when it should not because it was not deleted
before the power failure. In some case that delete attempt fails when
the target inode is a directory that contains a subvolume inside it, since
the log replay code is not prepared to deal with directory entries that
point to root items (only inode items).

1) We have directories "dir1" (inode A) and "dir2" (inode B) under the
   same parent directory;

2) We have a file (inode C) under directory "dir1" (inode A);

3) We have a subvolume inside directory "dir2" (inode B);

4) All these inodes were persisted in a past transaction and we are
   currently at transaction N;

5) We rename the file (inode C), so at btrfs_log_new_name() we update
   inode C's last_unlink_trans to N;

6) We get a rename exchange for "dir1" (inode A) and "dir2" (inode B),
   so after the exchange "dir1" is inode B and "dir2" is inode A.
   During the rename exchange we call btrfs_log_new_name() for inodes
   A and B, but because they are directories, we don't update their
   last_unlink_trans to N;

7) An fsync against the file (inode C) is done, and because its inode
   has a last_unlink_trans with a value of N we log its parent directory
   (inode A) (through btrfs_log_all_parents(), called from
   btrfs_log_inode_parent()).

8) So we end up with inode B not logged, which now has the old name
   of inode A. At copy_inode_items_to_log(), when logging inode A, we
   did not check if we had any conflicting inode to log because inode
   A has a generation lower than the current transaction (created in
   a past transaction);

9) After a power failure, when replaying the log tree, since we find that
   inode A has a new name that conflicts with the name of inode B in the
   fs tree, we attempt to delete inode B... this is wrong since that
   directory was never deleted before the power failure, and because there
   is a subvolume inside that directory, attempting to delete it will fail
   since replay_dir_deletes() and btrfs_unlink_inode() are not prepared
   to deal with dir items that point to roots instead of inodes.

   When that happens the mount fails and we get a stack trace like the
   following:

   [87.2314] BTRFS info (device dm-0): start tree-log replay
   [87.2318] BTRFS critical (device dm-0): failed to delete reference to subvol, root 5 inode 256 parent 259
   [87.2332] ------------[ cut here ]------------
   [87.2338] BTRFS: Transaction aborted (error -2)
   [87.2346] WARNING: CPU: 1 PID: 638968 at fs/btrfs/inode.c:4345 __btrfs_unlink_inode+0x416/0x440 [btrfs]
   [87.2368] Modules linked in: btrfs loop dm_thin_pool (...)
   [87.2470] CPU: 1 UID: 0 PID: 638968 Comm: mount Tainted: G        W           6.18.0-rc7-btrfs-next-218+ #2 PREEMPT(full)
   [87.2489] Tainted: [W]=WARN
   [87.2494] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
   [87.2514] RIP: 0010:__btrfs_unlink_inode+0x416/0x440 [btrfs]
   [87.2538] Code: c0 89 04 24 (...)
   [87.2568] RSP: 0018:ffffc0e741f4b9b8 EFLAGS: 00010286
   [87.2574] RAX: 0000000000000000 RBX: ffff9d3ec8a6cf60 RCX: 0000000000000000
   [87.2582] RDX: 0000000000000002 RSI: ffffffff84ab45a1 RDI: 00000000ffffffff
   [87.2591] RBP: ffff9d3ec8a6ef20 R08: 0000000000000000 R09: ffffc0e741f4b840
   [87.2599] R10: ffff9d45dc1fffa8 R11: 0000000000000003 R12: ffff9d3ee26d77e0
   [87.2608] R13: ffffc0e741f4ba98 R14: ffff9d4458040800 R15: ffff9d44b6b7ca10
   [87.2618] FS:  00007f7b9603a840(0000) GS:ffff9d4658982000(0000) knlGS:0000000000000000
   [87.2629] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   [87.2637] CR2: 00007ffc9ec33b98 CR3: 000000011273e003 CR4: 0000000000370ef0
   [87.2648] Call Trace:
   [87.2651]  <TASK>
   [87.2654]  btrfs_unlink_inode+0x15/0x40 [btrfs]
   [87.2661]  unlink_inode_for_log_replay+0x27/0xf0 [btrfs]
   [87.2669]  check_item_in_log+0x1ea/0x2c0 [btrfs]
   [87.2676]  replay_dir_deletes+0x16b/0x380 [btrfs]
   [87.2684]  fixup_inode_link_count+0x34b/0x370 [btrfs]
   [87.2696]  fixup_inode_link_counts+0x41/0x160 [btrfs]
   [87.2703]  btrfs_recover_log_trees+0x1ff/0x7c0 [btrfs]
   [87.2711]  ? __pfx_replay_one_buffer+0x10/0x10 [btrfs]
   [87.2719]  open_ctree+0x10bb/0x15f0 [btrfs]
   [87.2726]  btrfs_get_tree.cold+0xb/0x16c [btrfs]
   [87.2734]  ? fscontext_read+0x15c/0x180
   [87.2740]  ? rw_verify_area+0x50/0x180
   [87.2746]  vfs_get_tree+0x25/0xd0
   [87.2750]  vfs_cmd_create+0x59/0xe0
   [87.2755]  __do_sys_fsconfig+0x4f6/0x6b0
   [87.2760]  do_syscall_64+0x50/0x1220
   [87.2764]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
   [87.2770] RIP: 0033:0x7f7b9625f4aa
   [87.2775] Code: 73 01 c3 48 (...)
   [87.2803] RSP: 002b:00007ffc9ec35b08 EFLAGS: 00000246 ORIG_RAX: 00000000000001af
   [87.2817] RAX: ffffffffffffffda RBX: 0000558bfa91ac20 RCX: 00007f7b9625f4aa
   [87.2829] RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003
   [87.2842] RBP: 0000558bfa91b120 R08: 0000000000000000 R09: 0000000000000000
   [87.2854] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
   [87.2864] R13: 00007f7b963f1580 R14: 00007f7b963f326c R15: 00007f7b963d8a23
   [87.2877]  </TASK>
   [87.2882] ---[ end trace 0000000000000000 ]---
   [87.2891] BTRFS: error (device dm-0 state A) in __btrfs_unlink_inode:4345: errno=-2 No such entry
   [87.2904] BTRFS: error (device dm-0 state EAO) in do_abort_log_replay:191: errno=-2 No such entry
   [87.2915] BTRFS critical (device dm-0 state EAO): log tree (for root 5) leaf currently being processed (slot 7 key (258 12 257)):
   [87.2929] BTRFS info (device dm-0 state EAO): leaf 30736384 gen 10 total ptrs 7 free space 15712 owner 18446744073709551610
   [87.2929] BTRFS info (device dm-0 state EAO): refs 3 lock_owner 0 current 638968
   [87.2929]      item 0 key (257 INODE_ITEM 0) itemoff 16123 itemsize 160
   [87.2929]              inode generation 9 transid 10 size 0 nbytes 0
   [87.2929]              block group 0 mode 40755 links 1 uid 0 gid 0
   [87.2929]              rdev 0 sequence 7 flags 0x0
   [87.2929]              atime 1765464494.678070921
   [87.2929]              ctime 1765464494.686606513
   [87.2929]              mtime 1765464494.686606513
   [87.2929]              otime 1765464494.678070921
   [87.2929]      item 1 key (257 INODE_REF 256) itemoff 16109 itemsize 14
   [87.2929]              index 4 name_len 4
   [87.2929]      item 2 key (257 DIR_LOG_INDEX 2) itemoff 16101 itemsize 8
   [87.2929]              dir log end 2
   [87.2929]      item 3 key (257 DIR_LOG_INDEX 3) itemoff 16093 itemsize 8
   [87.2929]              dir log end 18446744073709551615
   [87.2930]      item 4 key (257 DIR_INDEX 3) itemoff 16060 itemsize 33
   [87.2930]              location key (258 1 0) type 1
   [87.2930]              transid 10 data_len 0 name_len 3
   [87.2930]      item 5 key (258 INODE_ITEM 0) itemoff 15900 itemsize 160
   [87.2930]              inode generation 9 transid 10 size 0 nbytes 0
   [87.2930]              block group 0 mode 100644 links 1 uid 0 gid 0
   [87.2930]              rdev 0 sequence 2 flags 0x0
   [87.2930]              atime 1765464494.678456467
   [87.2930]              ctime 1765464494.686606513
   [87.2930]              mtime 1765464494.678456467
   [87.2930]              otime 1765464494.678456467
   [87.2930]      item 6 key (258 INODE_REF 257) itemoff 15887 itemsize 13
   [87.2930]              index 3 name_len 3
   [87.2930] BTRFS critical (device dm-0 state EAO): log replay failed in unlink_inode_for_log_replay:1045 for root 5, stage 3, with error -2: failed to unlink inode 256 parent dir 259 name subvol root 5
   [87.2963] BTRFS: error (device dm-0 state EAO) in btrfs_recover_log_trees:7743: errno=-2 No such entry
   [87.2981] BTRFS: error (device dm-0 state EAO) in btrfs_replay_log:2083: errno=-2 No such entry (Failed to recover log tr

So fix this by changing copy_inode_items_to_log() to always detect if
there are conflicting inodes for the ref/extref of the inode being logged
even if the inode was created in a past transaction.

A test case for fstests will follow soon.

CC: stable@vger.kernel.org # 6.1+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 31edc93a383e..5831754bb01c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6341,10 +6341,8 @@ again:
 			 * and no keys greater than that, so bail out.
 			 */
 			break;
-		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
-			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
-			   (inode->generation == trans->transid ||
-			    ctx->logging_conflict_inodes)) {
+		} else if (min_key->type == BTRFS_INODE_REF_KEY ||
+			   min_key->type == BTRFS_INODE_EXTREF_KEY) {
 			u64 other_ino = 0;
 			u64 other_parent = 0;
 

From 83f59076a1ae6f5c6845d6f7ed3a1a373d883684 Mon Sep 17 00:00:00 2001
From: Leo Martins <loemra.dev@gmail.com>
Date: Fri, 12 Dec 2025 17:26:26 -0800
Subject: [PATCH 7/9] btrfs: fix use-after-free warning in
 btrfs_get_or_create_delayed_node()

Previously, btrfs_get_or_create_delayed_node() set the delayed_node's
refcount before acquiring the root->delayed_nodes lock.
Commit e8513c012de7 ("btrfs: implement ref_tracker for delayed_nodes")
moved refcount_set inside the critical section, which means there is
no longer a memory barrier between setting the refcount and setting
btrfs_inode->delayed_node.

Without that barrier, the stores to node->refs and
btrfs_inode->delayed_node may become visible out of order. Another
thread can then read btrfs_inode->delayed_node and attempt to
increment a refcount that hasn't been set yet, leading to a
refcounting bug and a use-after-free warning.

The fix is to move refcount_set back to where it was to take
advantage of the implicit memory barrier provided by lock
acquisition.

Because the allocations now happen outside of the lock's critical
section, they can use GFP_NOFS instead of GFP_ATOMIC.

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202511262228.6dda231e-lkp@intel.com
Fixes: e8513c012de7 ("btrfs: implement ref_tracker for delayed_nodes")
Tested-by: kernel test robot <oliver.sang@intel.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Leo Martins <loemra.dev@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ce6e9f8812e0..4b7d9015e0da 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -152,37 +152,39 @@ again:
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
 
+	/* Cached in the inode and can be accessed. */
+	refcount_set(&node->refs, 2);
+	btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
+	btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_NOFS);
+
 	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
 	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
-	if (ret == -ENOMEM) {
-		btrfs_delayed_node_ref_tracker_dir_exit(node);
-		kmem_cache_free(delayed_node_cache, node);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (ret == -ENOMEM)
+		goto cleanup;
+
 	xa_lock(&root->delayed_nodes);
 	ptr = xa_load(&root->delayed_nodes, ino);
 	if (ptr) {
 		/* Somebody inserted it, go back and read it. */
 		xa_unlock(&root->delayed_nodes);
-		btrfs_delayed_node_ref_tracker_dir_exit(node);
-		kmem_cache_free(delayed_node_cache, node);
-		node = NULL;
-		goto again;
+		goto cleanup;
 	}
 	ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
 	ASSERT(xa_err(ptr) != -EINVAL);
 	ASSERT(xa_err(ptr) != -ENOMEM);
 	ASSERT(ptr == NULL);
-
-	/* Cached in the inode and can be accessed. */
-	refcount_set(&node->refs, 2);
-	btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
-	btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC);
-
 	btrfs_inode->delayed_node = node;
 	xa_unlock(&root->delayed_nodes);
 
 	return node;
+cleanup:
+	btrfs_delayed_node_ref_tracker_free(node, tracker);
+	btrfs_delayed_node_ref_tracker_free(node, &node->inode_cache_tracker);
+	btrfs_delayed_node_ref_tracker_dir_exit(node);
+	kmem_cache_free(delayed_node_cache, node);
+	if (ret)
+		return ERR_PTR(ret);
+	goto again;
 }
 
 /*

From f8da41de0bff9eb1d774a7253da0c9f637c4470a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 12 Dec 2025 17:10:10 +0000
Subject: [PATCH 8/9] btrfs: do not free data reservation in fallback from
 inline due to -ENOSPC

If we fail to create an inline extent due to -ENOSPC, we will attempt to
go through the normal COW path, reserve an extent, create an ordered
extent, etc. However we were always freeing the reserved qgroup data,
which is wrong since we will use data. Fix this by freeing the reserved
qgroup data in __cow_file_range_inline() only if we are not doing the
fallback (ret is <= 0).

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6633b3dc9314..c4c370b6aae9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -673,8 +673,12 @@ out:
 	 * it won't count as data extent, free them directly here.
 	 * And at reserve time, it's always aligned to page size, so
 	 * just free one page here.
+	 *
+	 * If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need
+	 * to keep the data reservation.
 	 */
-	btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
+	if (ret <= 0)
+		btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
 	btrfs_free_path(path);
 	btrfs_end_transaction(trans);
 	return ret;

From c1c050f92d8f6aac4e17f7f2230160794fceef0c Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 12 Dec 2025 17:18:25 +0000
Subject: [PATCH 9/9] btrfs: fix reservation leak in some error paths when
 inserting inline extent

If we fail to allocate a path or join a transaction, we return from
__cow_file_range_inline() without freeing the reserved qgroup data,
resulting in a leak. Fix this by ensuring we call btrfs_qgroup_free_data()
in such cases.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c4c370b6aae9..03337fa7a61c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -617,19 +617,22 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
 	struct btrfs_drop_extents_args drop_args = { 0 };
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	u64 data_len = (compressed_size ?: size);
 	int ret;
 	struct btrfs_path *path;
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
-		return PTR_ERR(trans);
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
 	}
 	trans->block_rsv = &inode->block_rsv;
 
@@ -680,7 +683,8 @@ out:
 	if (ret <= 0)
 		btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
 	btrfs_free_path(path);
-	btrfs_end_transaction(trans);
+	if (trans)
+		btrfs_end_transaction(trans);
 	return ret;
 }