mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-11 17:10:13 +00:00

Compare commits


29 Commits

Author SHA1 Message Date
Linus Torvalds
40fbbd64bb a couple of shmem rename fixes - recent regression from tree-in-dcache
series and older breakage from stable directory offsets stuff.
 
 Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Merge tag 'pull-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull shmem rename fixes from Al Viro:
 "A couple of shmem rename fixes - recent regression from tree-in-dcache
  series and older breakage from stable directory offsets stuff"

* tag 'pull-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  shmem: fix recovery on rename failures
  shmem_whiteout(): fix regression from tree-in-dcache series
2025-12-16 19:44:36 +12:00
Linus Torvalds
53ec4a79ff seven smb3 server fixes

Merge tag 'v6.19-rc1-ksmbd-server-fixes' of git://git.samba.org/ksmbd

Pull smb server fixes from Steve French:

 - Fix set xattr name validation

 - Fix session refcount leak

 - Minor cleanup

 - smbdirect (RDMA) fixes: improve receive completion and connect handling

* tag 'v6.19-rc1-ksmbd-server-fixes' of git://git.samba.org/ksmbd:
  ksmbd: fix buffer validation by including null terminator size in EA length
  ksmbd: Fix refcount leak when invalid session is found on session lookup
  ksmbd: remove redundant DACL check in smb_check_perm_dacl
  ksmbd: convert comma to semicolon
  smb: server: defer the initial recv completion logic to smb_direct_negotiate_recv_work()
  smb: server: initialize recv_io->cqe.done = recv_done just once
  smb: smbdirect: introduce smbdirect_socket.connect.{lock,work}
2025-12-16 19:34:09 +12:00
Linus Torvalds
115fada16b for-6.19-rc1-tag

Merge tag 'for-6.19-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix missing btrfs_path release after printing a relocation error
   message

 - fix extent changeset leak on mmap write after failure to reserve
   metadata

 - fix fs devices list structure freeing; it could potentially be leaked
   under some circumstances

 - tree log fixes:
     - fix incremental directory logging where inodes for new dentries
       were incorrectly skipped
     - don't log conflicting inode if it's a directory moved in the
       current transaction

 - regression fixes:
     - fix incorrect btrfs_path freeing when it's auto-cleaned
     - revert commit simplifying preallocation of temporary structures
       in qgroup functions, some cases were not handled properly

* tag 'for-6.19-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix changeset leak on mmap write after failure to reserve metadata
  btrfs: fix memory leak of fs_devices in degraded seed device path
  btrfs: fix a potential path leak in print_data_reloc_error()
  Revert "btrfs: add ASSERTs on prealloc in qgroup functions"
  btrfs: do not skip logging new dentries when logging a new name
  btrfs: don't log conflicting inode if it's a dir moved in the current transaction
  btrfs: tests: fix double btrfs_path free in remove_extent_ref()
2025-12-16 19:28:20 +12:00
Linus Torvalds
dbf89321bf sched_ext: Fixes for v6.19-rc1
- Fix memory leak when destroying helper kthread workers during scheduler
   disable.
 
 - Fix bypass depth accounting on scx_enable() failure which could leave
   the system permanently in bypass mode.
 
 - Fix missing preemption handling when moving tasks to local DSQs via
   scx_bpf_dsq_move().
 
 - Misc fixes including NULL check for put_prev_task(), flushing stdout in
   selftests, and removing unused code.

Merge tag 'sched_ext-for-6.19-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix memory leak when destroying helper kthread workers during
   scheduler disable

 - Fix bypass depth accounting on scx_enable() failure which could leave
   the system permanently in bypass mode

 - Fix missing preemption handling when moving tasks to local DSQs via
   scx_bpf_dsq_move()

 - Misc fixes including NULL check for put_prev_task(), flushing stdout
   in selftests, and removing unused code

* tag 'sched_ext-for-6.19-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Remove unused code in the do_pick_task_scx()
  selftests/sched_ext: flush stdout before test to avoid log spam
  sched_ext: Fix missing post-enqueue handling in move_local_task_to_local_dsq()
  sched_ext: Factor out local_dsq_post_enq() from dispatch_enqueue()
  sched_ext: Fix bypass depth leak on scx_enable() failure
  sched/ext: Avoid null ptr traversal when ->put_prev_task() is called with NULL next
  sched_ext: Fix the memleak for sch->helper objects
2025-12-16 19:24:35 +12:00
Linus Torvalds
6b63f90fa2 cgroup: Fixes for v6.19-rc1
- Fix a race condition in css_rstat_updated() where CMPXCHG without LOCK
   prefix could cause lnode corruption when the flusher runs concurrently
   on another CPU. The issue was introduced in 6.17 and causes memcg stats
   to become corrupted in production.

Merge tag 'cgroup-for-6.19-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fix from Tejun Heo:

 - Fix a race condition in css_rstat_updated() where CMPXCHG without
   LOCK prefix could cause lnode corruption when the flusher runs
   concurrently on another CPU. The issue was introduced in 6.17 and
   causes memcg stats to become corrupted in production.

* tag 'cgroup-for-6.19-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: rstat: use LOCK CMPXCHG in css_rstat_updated
2025-12-16 19:21:17 +12:00
Al Viro
e1b4c6a583 shmem: fix recovery on rename failures
maple_tree insertions can fail if we are seriously short on memory;
simple_offset_rename() does not recover well if it runs into that.
The same goes for simple_offset_rename_exchange().

Moreover, shmem_whiteout() expects that if it succeeds, the caller will
progress to d_move(), i.e. that shmem_rename2() won't fail past the
successful call of shmem_whiteout().

Not hard to fix, fortunately - mtree_store() can't fail if the index we
are trying to store into is already present in the tree as a singleton.

For simple_offset_rename_exchange() that's enough - we just need to be
careful about the order of operations.

For simple_offset_rename() the solution is to preinsert the target into the
tree for new_dir; the rest can be done without any potentially failing
operations.

That preinsertion has to be done in shmem_rename2() rather than in
simple_offset_rename() itself - otherwise we'd need to deal with the
possibility of failure after successful shmem_whiteout().
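As an aside, a minimal sketch of the maple_tree property the fix relies
on (illustrative only, using the documented mtree_insert()/mtree_store()
API; not the patch itself):

  #include <linux/maple_tree.h>

  static DEFINE_MTREE(mt);

  /* The initial insert may need node allocations and can fail with
   * -ENOMEM, but overwriting an index that is already present as a
   * singleton allocates nothing, so the second store cannot fail. */
  static int preinsert_then_swap(void *placeholder, void *final)
  {
          int ret = mtree_insert(&mt, 42, placeholder, GFP_KERNEL);

          if (ret)        /* -ENOMEM is possible here... */
                  return ret;

          /* ...but not here: index 42 is already a singleton */
          return mtree_store(&mt, 42, final, GFP_KERNEL);
  }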

Fixes: a2e459555c5f ("shmem: stable directory offsets")
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-12-16 00:57:29 -05:00
Zqiang
bb27226f0d sched_ext: Remove unused code in the do_pick_task_scx()
The kick_idle variable is no longer used. This commit therefore removes
it and the associated code in do_pick_task_scx().

Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-15 05:53:49 -10:00
Namjae Jeon
95d7a890e4 ksmbd: fix buffer validation by including null terminator size in EA length
The smb2_set_ea function, which handles Extended Attributes (EA),
was performing buffer validation checks that incorrectly omitted the size
of the null terminating character (+1 byte) for EA Name.
This patch fixes the issue by explicitly adding '+ 1' to EaNameLength where
the null terminator is expected to be present in the buffer, ensuring
the validation accurately reflects the total required buffer size.
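For reference, the corrected bound (matching the smb2_set_ea() hunk in
the diff at the bottom of this page) reads:

  /* buf_len must cover the fixed header, the EA name, its null
   * terminator, and the EA value */
  if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + 1 +
                le16_to_cpu(eabuf->EaValueLength))
          return -EINVAL;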

Cc: stable@vger.kernel.org
Reported-by: Roger <roger.andersen@protonmail.com>
Reported-by: Stanislas Polu <spolu@dust.tt>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:56 -06:00
Namjae Jeon
cafb57f7bd ksmbd: Fix refcount leak when invalid session is found on session lookup
When a session is found but its state is not SMB2_SESSION_VALID, it
indicates that no valid session was found, but the code misses decrementing
the reference count acquired by the session lookup, which results in
a reference count leak. This patch fixes the issue by explicitly calling
ksmbd_user_session_put() to release the reference to the session.

Cc: stable@vger.kernel.org
Reported-by: Alexandre <roger.andersen@protonmail.com>
Reported-by: Stanislas Polu <spolu@dust.tt>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:56 -06:00
Alexey Velichayshiy
8dd2e58b62 ksmbd: remove redundant DACL check in smb_check_perm_dacl
A zero value of pdacl->num_aces is already handled at the start of
smb_check_perm_dacl() so the second check is useless.

Drop the unreachable code block, no functional impact intended.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Signed-off-by: Alexey Velichayshiy <a.velichayshiy@ispras.ru>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:56 -06:00
Chen Ni
0446356e9f ksmbd: convert comma to semicolon
Replace comma between expressions with semicolons.

Using a ',' in place of a ';' can have unintended side effects.
Although that is not the case here, it seems best to use ';'
unless ',' is intended.
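A small illustration of the kind of surprise the comma invites
(hypothetical code, not from this patch): with ',' two assignments form
a single expression statement, so control flow binds to both at once:

  if (cond)
          a = 1,          /* comma: b = 2 is also under the if */
          b = 2;

  if (cond)
          a = 1;          /* semicolon: only this is conditional */
  b = 2;                  /* always runs */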

Found by inspection.
No functional change intended.
Compile tested only.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:56 -06:00
Stefan Metzmacher
d180b1d9c7 smb: server: defer the initial recv completion logic to smb_direct_negotiate_recv_work()
The previous change to relax WARN_ON_ONCE(SMBDIRECT_SOCKET_*) checks in
recv_done() and smb_direct_cm_handler() seems to work around the
problem that the order of initial recv completion and
RDMA_CM_EVENT_ESTABLISHED is random, but it's still
a bit ugly.

This implements a better solution, deferring the recv completion
processing to smb_direct_negotiate_recv_work(), which is queued
only once both events have arrived.

In order to avoid more basic changes to the main recv_done
callback, I introduced smb_direct_negotiate_recv_done(), which is
only used for the first PDU; this will allow further cleanups and
simplifications of recv_done() in a future patch.

smb_direct_negotiate_recv_work() is deliberately minimal, doing
only basic error checking and the transition
from SMBDIRECT_SOCKET_NEGOTIATE_NEEDED to
SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, which allows
smb_direct_prepare() to continue as before.

Cc: Tom Talpey <tom@talpey.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:55 -06:00
Stefan Metzmacher
c1fb124f2a smb: server: initialize recv_io->cqe.done = recv_done just once
smbdirect_recv_io structures are pre-allocated so we can set the
callback function just once.

This will make it easy to move smb_direct_post_recv to common code
soon.

Cc: Tom Talpey <tom@talpey.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:55 -06:00
Stefan Metzmacher
49ca214774 smb: smbdirect: introduce smbdirect_socket.connect.{lock,work}
This will first be used by the server in order to defer
the processing of the initial recv of the negotiation
request.

But in the future it will also be used by the client in order
to implement an async connect.

Cc: Tom Talpey <tom@talpey.com>
Cc: Long Li <longli@microsoft.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-12-14 18:35:55 -06:00
Al Viro
3010f06c52 shmem_whiteout(): fix regression from tree-in-dcache series
Now that shmem_mknod() hashes the new dentry, d_rehash() in
shmem_whiteout() should be removed.

X-paperbag: brown
Reported-by: Hugh Dickins <hughd@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Tested-by: Hugh Dickins <hughd@google.com>
Fixes: 2313598222f9 ("convert ramfs and tmpfs")
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2025-12-13 22:28:53 -05:00
Emil Tsalapatis
579a3297b2 selftests/sched_ext: flush stdout before test to avoid log spam
The sched_ext selftests runner runs each test in the same process,
with each test possibly forking multiple times. When the main runner
has not flushed its stdout, the children inherit the buffered output
for previous tests and emit it during exit. This causes log spam.

Make sure stdout/stderr are fully flushed before each test.
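The essence of the change, as a sketch (the runner is C code; the
function name here is illustrative, not the selftest's actual API):

  #include <stdio.h>
  #include <unistd.h>

  static void run_one_test(void (*test_fn)(void))
  {
          /* flush inherited stdio buffers so a fork()ed child doesn't
           * re-emit earlier tests' buffered output when it exits */
          fflush(stdout);
          fflush(stderr);

          if (fork() == 0) {
                  test_fn();
                  _exit(0);
          }
  }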

Cc: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-12 08:18:32 -10:00
Tejun Heo
f5e1e5ec20 sched_ext: Fix missing post-enqueue handling in move_local_task_to_local_dsq()
move_local_task_to_local_dsq() is used when moving a task from a non-local
DSQ to a local DSQ on the same CPU. It directly manipulates the local DSQ
without going through dispatch_enqueue() and was missing the post-enqueue
handling that triggers preemption when SCX_ENQ_PREEMPT is set or the idle
task is running.

The function is used by move_task_between_dsqs() which backs
scx_bpf_dsq_move() and may be called while the CPU is busy.

Add local_dsq_post_enq() call to move_local_task_to_local_dsq(). As the
dispatch path doesn't need post-enqueue handling, add SCX_RQ_IN_BALANCE
early exit to keep consume_dispatch_q() behavior unchanged and avoid
triggering unnecessary resched when scx_bpf_dsq_move() is used from the
dispatch path.

Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()")
Cc: stable@vger.kernel.org # v6.12+
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-12 06:26:42 -10:00
Tejun Heo
530b6637c7 sched_ext: Factor out local_dsq_post_enq() from dispatch_enqueue()
Factor out local_dsq_post_enq() which performs post-enqueue handling for
local DSQs - triggering resched_curr() if SCX_ENQ_PREEMPT is specified or if
the current CPU is idle. No functional change.

This will be used by the next patch to fix move_local_task_to_local_dsq().
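A hedged sketch of the helper's shape, inferred from the description
above (the in-tree version operates on the DSQ/rq state and may differ
in detail):

  /* post-enqueue handling for local DSQs: preempt the current task if
   * the enqueue asked for it or the CPU is sitting idle */
  static void local_dsq_post_enq(struct rq *rq, struct task_struct *p,
                                 u64 enq_flags)
  {
          if ((enq_flags & SCX_ENQ_PREEMPT) || is_idle_task(rq->curr))
                  resched_curr(rq);
  }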

Cc: stable@vger.kernel.org # v6.12+
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-12 06:26:07 -10:00
Filipe Manana
37343524f0 btrfs: fix changeset leak on mmap write after failure to reserve metadata
If the call to btrfs_delalloc_reserve_metadata() fails we jump to the
'out_noreserve' label and there we never free the extent_changeset
allocated by the previous call to btrfs_check_data_free_space() (if
qgroups are enabled). Fix this by calling extent_changeset_free() under
the 'out_noreserve' label.

Fixes: 6599716de2d6 ("btrfs: fix -ENOSPC mmap write failure on NOCOW files/extents")
Reported-by: syzbot+2f8aa76e6acc9fce6638@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/693a635a.a70a0220.33cd7b.0029.GAE@google.com/
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-12 16:33:18 +01:00
Deepanshu Kartikey
b57f2ddd28 btrfs: fix memory leak of fs_devices in degraded seed device path
In open_seed_devices(), when find_fsid() fails and we're in DEGRADED
mode, a new fs_devices is allocated via alloc_fs_devices() but is never
added to the seed_list before returning. This contrasts with the normal
path where fs_devices is properly added via list_add().

If any error occurs later in read_one_dev() or btrfs_read_chunk_tree(),
the cleanup code iterates seed_list to free seed devices, but this
orphaned fs_devices is never found and never freed, causing a memory
leak. Any devices allocated via add_missing_dev() and attached to this
fs_devices are also leaked.

Fix this by adding the newly allocated fs_devices to seed_list in the
degraded path, consistent with the normal path.

Fixes: 5f37583569442 ("Btrfs: move the missing device to its own fs device list")
Reported-by: syzbot+eadd98df8bceb15d7fed@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=eadd98df8bceb15d7fed
Tested-by: syzbot+eadd98df8bceb15d7fed@syzkaller.appspotmail.com
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-12 16:33:12 +01:00
Tejun Heo
9f769637a9 sched_ext: Fix bypass depth leak on scx_enable() failure
scx_enable() calls scx_bypass(true) to initialize in bypass mode and then
scx_bypass(false) on success to exit. If scx_enable() fails during task
initialization - e.g. scx_cgroup_init() or scx_init_task() returns an error -
it jumps to err_disable while bypass is still active. scx_disable_workfn()
then calls scx_bypass(true/false) for its own bypass, leaving the bypass depth
at 1 instead of 0. This causes the system to remain permanently in bypass mode
after a failed scx_enable().

Failures after task initialization is complete - e.g. scx_tryset_enable_state()
at the end - already call scx_bypass(false) before reaching the error path and
are not affected. This only affects a subset of failure modes.

Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool and
having scx_disable_workfn() call an extra scx_bypass(false) to clear it. This
is a temporary measure as the bypass depth will be moved into the sched
instance, which will make this tracking unnecessary.
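A sketch of the described tracking (the variable name is hypothetical;
the actual patch wires this through the disable path):

  static bool scx_bypassed_for_enable;

  /* scx_enable(): */
  scx_bypass(true);
  scx_bypassed_for_enable = true;
  /* ... on the success path, before returning: */
  scx_bypass(false);
  scx_bypassed_for_enable = false;

  /* scx_disable_workfn(), after its own scx_bypass(true/false) pair: */
  if (scx_bypassed_for_enable) {
          scx_bypassed_for_enable = false;
          scx_bypass(false);      /* drop the depth a failed enable left */
  }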

Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Chris Mason <clm@meta.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Link: https://lore.kernel.org/stable/286e6f7787a81239e1ce2989b52391ce%40kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-11 06:27:35 -10:00
Qu Wenruo
313ef70a9f btrfs: fix a potential path leak in print_data_reloc_error()
Inside print_data_reloc_error(), if extent_from_logical() failed we
return immediately.

However there are the following cases where extent_from_logical() can
return an error but still hold a path:

- btrfs_search_slot() returned 0

- No backref item found in extent tree

- No flags_ret provided
  This is not possible in this call site though.

So for the above two cases, we can return without releasing the path,
causing extent buffer leaks.

Fixes: b9a9a85059cd ("btrfs: output affected files when relocation fails")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-09 04:35:28 +01:00
Qu Wenruo
428e1b114c Revert "btrfs: add ASSERTs on prealloc in qgroup functions"
This reverts commit 252877a8701530fde861a4f27710c1e718e97caa.

Commit 252877a87015 ("btrfs: add ASSERTs on prealloc in qgroup
functions") tries to remove the kfree() on preallocated qgroup during
several call sites, but this cannot work as intended:

- btrfs_quota_enable()
- btrfs_create_qgroup()
  If add_qgroup_item() failed, we go out_free_path() and at that time
  prealloc is not yet utilized and will trigger the new ASSERT().

- btrfs_qgroup_inherit()
  If qgroup_auto_inherit() failed, prealloc is not yet utilized and
  will trigger the new ASSERT().

Reported-by: syzbot+b44d4a4885bc82af2a06@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/69369331.a70a0220.38f243.009e.GAE@google.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-09 04:32:46 +01:00
Filipe Manana
5630f7557d btrfs: do not skip logging new dentries when logging a new name
When we are logging a directory and the log context indicates that we
are logging a new name for some other file (that is or was inside that
directory), we skip logging the inodes for new dentries in the directory.

This is ok most of the time, but if after the rename or link operation
that triggered the logging of that directory, we have an explicit fsync
of that directory without the directory inode being evicted and reloaded,
we end up never logging the inodes for the new dentries that we found
during the new name logging, as the next directory fsync will only process
dentries that were added after the last time we logged the directory (we
do incremental directory logging).

So make sure we always log new dentries for a directory even if we are
in a context of logging a new name.

We started skipping logging inodes for new dentries as of commit
c48792c6ee7a ("btrfs: do not log new dentries when logging that a new name
exists") and it was fine back then, because when logging a directory we
always iterated over all the directory entries (for leaves changed in the
current transaction) so a subsequent fsync would always log anything that
was previously skipped while logging a directory when logging a new name
(with btrfs_log_new_name()). But later support for incrementally logging
a directory was added in commit dc2872247ec0 ("btrfs: keep track of the
last logged keys when logging a directory"), to avoid checking all dir
items every time we log a directory, so the check to skip dentry logging
added in the first commit should have been removed when the incremental
support for logging a directory was added.

A test case for fstests will follow soon.

Reported-by: Vyacheslav Kovalevsky <slava.kovalevskiy.2014@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/84c4e713-85d6-42b9-8dcf-0722ed26cb05@gmail.com/
Fixes: dc2872247ec0 ("btrfs: keep track of the last logged keys when logging a directory")
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-08 21:46:56 +01:00
Filipe Manana
266273eaf4 btrfs: don't log conflicting inode if it's a dir moved in the current transaction
We can't log a conflicting inode if it's a directory and it was moved
from one parent directory to another parent directory in the current
transaction, as this can result in an attempt to have a directory with
two hard links during log replay, one for the old parent directory and
another for the new parent directory.

The following scenario triggers that issue:

1) We have directories "dir1" and "dir2" created in a past transaction.
   Directory "dir1" has inode A as its parent directory;

2) We move "dir1" to some other directory;

3) We create a file with the name "dir1" in directory inode A;

4) We fsync the new file. This results in logging the inode of the new file
   and the inode for the directory "dir1" that was previously moved in the
   current transaction. So the log tree has the INODE_REF item for the
   new location of "dir1";

5) We move the new file to some other directory. This results in updating
   the log tree to include the new INODE_REF for the new location of the
   file and remove the INODE_REF for the old location. This happens
   during the rename when we call btrfs_log_new_name();

6) We fsync the file, and that persists the log tree changes done in the
   previous step (btrfs_log_new_name() only updates the log tree in
   memory);

7) We have a power failure;

8) Next time the fs is mounted, log replay happens and when processing
   the inode for directory "dir1" we find a new INODE_REF and add that
   link, but we don't remove the old link of the inode since we have
   not logged the old parent directory of the directory inode "dir1".

As a result, after log replay finishes, when we trigger writeback of the
subvolume tree's extent buffers, the tree checker will detect that we have
a directory with a hard link count of 2 and we get a mount failure.
The errors and stack traces reported in dmesg/syslog are like this:

   [ 3845.729764] BTRFS info (device dm-0): start tree-log replay
   [ 3845.730304] page: refcount:3 mapcount:0 mapping:000000005c8a3027 index:0x1d00 pfn:0x11510c
   [ 3845.731236] memcg:ffff9264c02f4e00
   [ 3845.731751] aops:btree_aops [btrfs] ino:1
   [ 3845.732300] flags: 0x17fffc00000400a(uptodate|private|writeback|node=0|zone=2|lastcpupid=0x1ffff)
   [ 3845.733346] raw: 017fffc00000400a 0000000000000000 dead000000000122 ffff9264d978aea8
   [ 3845.734265] raw: 0000000000001d00 ffff92650e6d4738 00000003ffffffff ffff9264c02f4e00
   [ 3845.735305] page dumped because: eb page dump
   [ 3845.735981] BTRFS critical (device dm-0): corrupt leaf: root=5 block=30408704 slot=6 ino=257, invalid nlink: has 2 expect no more than 1 for dir
   [ 3845.737786] BTRFS info (device dm-0): leaf 30408704 gen 10 total ptrs 17 free space 14881 owner 5
   [ 3845.737789] BTRFS info (device dm-0): refs 4 lock_owner 0 current 30701
   [ 3845.737792] 	item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160
   [ 3845.737794] 		inode generation 3 transid 9 size 16 nbytes 16384
   [ 3845.737795] 		block group 0 mode 40755 links 1 uid 0 gid 0
   [ 3845.737797] 		rdev 0 sequence 2 flags 0x0
   [ 3845.737798] 		atime 1764259517.0
   [ 3845.737800] 		ctime 1764259517.572889464
   [ 3845.737801] 		mtime 1764259517.572889464
   [ 3845.737802] 		otime 1764259517.0
   [ 3845.737803] 	item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12
   [ 3845.737805] 		index 0 name_len 2
   [ 3845.737807] 	item 2 key (256 DIR_ITEM 2363071922) itemoff 16077 itemsize 34
   [ 3845.737808] 		location key (257 1 0) type 2
   [ 3845.737810] 		transid 9 data_len 0 name_len 4
   [ 3845.737811] 	item 3 key (256 DIR_ITEM 2676584006) itemoff 16043 itemsize 34
   [ 3845.737813] 		location key (258 1 0) type 2
   [ 3845.737814] 		transid 9 data_len 0 name_len 4
   [ 3845.737815] 	item 4 key (256 DIR_INDEX 2) itemoff 16009 itemsize 34
   [ 3845.737816] 		location key (257 1 0) type 2
   [ 3845.737818] 		transid 9 data_len 0 name_len 4
   [ 3845.737819] 	item 5 key (256 DIR_INDEX 3) itemoff 15975 itemsize 34
   [ 3845.737820] 		location key (258 1 0) type 2
   [ 3845.737821] 		transid 9 data_len 0 name_len 4
   [ 3845.737822] 	item 6 key (257 INODE_ITEM 0) itemoff 15815 itemsize 160
   [ 3845.737824] 		inode generation 9 transid 10 size 6 nbytes 0
   [ 3845.737825] 		block group 0 mode 40755 links 2 uid 0 gid 0
   [ 3845.737826] 		rdev 0 sequence 1 flags 0x0
   [ 3845.737827] 		atime 1764259517.572889464
   [ 3845.737828] 		ctime 1764259517.572889464
   [ 3845.737830] 		mtime 1764259517.572889464
   [ 3845.737831] 		otime 1764259517.572889464
   [ 3845.737832] 	item 7 key (257 INODE_REF 256) itemoff 15801 itemsize 14
   [ 3845.737833] 		index 2 name_len 4
   [ 3845.737834] 	item 8 key (257 INODE_REF 258) itemoff 15787 itemsize 14
   [ 3845.737836] 		index 2 name_len 4
   [ 3845.737837] 	item 9 key (257 DIR_ITEM 2507850652) itemoff 15754 itemsize 33
   [ 3845.737838] 		location key (259 1 0) type 1
   [ 3845.737839] 		transid 10 data_len 0 name_len 3
   [ 3845.737840] 	item 10 key (257 DIR_INDEX 2) itemoff 15721 itemsize 33
   [ 3845.737842] 		location key (259 1 0) type 1
   [ 3845.737843] 		transid 10 data_len 0 name_len 3
   [ 3845.737844] 	item 11 key (258 INODE_ITEM 0) itemoff 15561 itemsize 160
   [ 3845.737846] 		inode generation 9 transid 10 size 8 nbytes 0
   [ 3845.737847] 		block group 0 mode 40755 links 1 uid 0 gid 0
   [ 3845.737848] 		rdev 0 sequence 1 flags 0x0
   [ 3845.737849] 		atime 1764259517.572889464
   [ 3845.737850] 		ctime 1764259517.572889464
   [ 3845.737851] 		mtime 1764259517.572889464
   [ 3845.737852] 		otime 1764259517.572889464
   [ 3845.737853] 	item 12 key (258 INODE_REF 256) itemoff 15547 itemsize 14
   [ 3845.737855] 		index 3 name_len 4
   [ 3845.737856] 	item 13 key (258 DIR_ITEM 1843588421) itemoff 15513 itemsize 34
   [ 3845.737857] 		location key (257 1 0) type 2
   [ 3845.737858] 		transid 10 data_len 0 name_len 4
   [ 3845.737860] 	item 14 key (258 DIR_INDEX 2) itemoff 15479 itemsize 34
   [ 3845.737861] 		location key (257 1 0) type 2
   [ 3845.737862] 		transid 10 data_len 0 name_len 4
   [ 3845.737863] 	item 15 key (259 INODE_ITEM 0) itemoff 15319 itemsize 160
   [ 3845.737865] 		inode generation 10 transid 10 size 0 nbytes 0
   [ 3845.737866] 		block group 0 mode 100600 links 1 uid 0 gid 0
   [ 3845.737867] 		rdev 0 sequence 2 flags 0x0
   [ 3845.737868] 		atime 1764259517.580874966
   [ 3845.737869] 		ctime 1764259517.586121869
   [ 3845.737870] 		mtime 1764259517.580874966
   [ 3845.737872] 		otime 1764259517.580874966
   [ 3845.737873] 	item 16 key (259 INODE_REF 257) itemoff 15306 itemsize 13
   [ 3845.737874] 		index 2 name_len 3
   [ 3845.737875] BTRFS error (device dm-0): block=30408704 write time tree block corruption detected
   [ 3845.739448] ------------[ cut here ]------------
   [ 3845.740092] WARNING: CPU: 5 PID: 30701 at fs/btrfs/disk-io.c:335 btree_csum_one_bio+0x25a/0x270 [btrfs]
   [ 3845.741439] Modules linked in: btrfs dm_flakey crc32c_cryptoapi (...)
   [ 3845.750626] CPU: 5 UID: 0 PID: 30701 Comm: mount Tainted: G        W           6.18.0-rc6-btrfs-next-218+ #1 PREEMPT(full)
   [ 3845.752414] Tainted: [W]=WARN
   [ 3845.752828] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
   [ 3845.754499] RIP: 0010:btree_csum_one_bio+0x25a/0x270 [btrfs]
   [ 3845.755460] Code: 31 f6 48 89 (...)
   [ 3845.758685] RSP: 0018:ffffa8d9c5677678 EFLAGS: 00010246
   [ 3845.759450] RAX: 0000000000000000 RBX: ffff92650e6d4738 RCX: 0000000000000000
   [ 3845.760309] RDX: 0000000000000000 RSI: ffffffff9aab45b9 RDI: ffff9264c4748000
   [ 3845.761239] RBP: ffff9264d4324000 R08: 0000000000000000 R09: ffffa8d9c5677468
   [ 3845.762607] R10: ffff926bdc1fffa8 R11: 0000000000000003 R12: ffffa8d9c5677680
   [ 3845.764099] R13: 0000000000004000 R14: ffff9264dd624000 R15: ffff9264d978aba8
   [ 3845.765094] FS:  00007f751fa5a840(0000) GS:ffff926c42a82000(0000) knlGS:0000000000000000
   [ 3845.766226] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   [ 3845.766970] CR2: 0000558df1815380 CR3: 000000010ed88003 CR4: 0000000000370ef0
   [ 3845.768009] Call Trace:
   [ 3845.768392]  <TASK>
   [ 3845.768714]  btrfs_submit_bbio+0x6ee/0x7f0 [btrfs]
   [ 3845.769640]  ? write_one_eb+0x28e/0x340 [btrfs]
   [ 3845.770588]  btree_write_cache_pages+0x2f0/0x550 [btrfs]
   [ 3845.771286]  ? alloc_extent_state+0x19/0x100 [btrfs]
   [ 3845.771967]  ? merge_next_state+0x1a/0x90 [btrfs]
   [ 3845.772586]  ? set_extent_bit+0x233/0x8b0 [btrfs]
   [ 3845.773198]  ? xas_load+0x9/0xc0
   [ 3845.773589]  ? xas_find+0x14d/0x1a0
   [ 3845.773969]  do_writepages+0xc6/0x160
   [ 3845.774367]  filemap_fdatawrite_wbc+0x48/0x60
   [ 3845.775003]  __filemap_fdatawrite_range+0x5b/0x80
   [ 3845.775902]  btrfs_write_marked_extents+0x61/0x170 [btrfs]
   [ 3845.776707]  btrfs_write_and_wait_transaction+0x4e/0xc0 [btrfs]
   [ 3845.777379]  ? _raw_spin_unlock_irqrestore+0x23/0x40
   [ 3845.777923]  btrfs_commit_transaction+0x5ea/0xd20 [btrfs]
   [ 3845.778551]  ? _raw_spin_unlock+0x15/0x30
   [ 3845.778986]  ? release_extent_buffer+0x34/0x160 [btrfs]
   [ 3845.779659]  btrfs_recover_log_trees+0x7a3/0x7c0 [btrfs]
   [ 3845.780416]  ? __pfx_replay_one_buffer+0x10/0x10 [btrfs]
   [ 3845.781499]  open_ctree+0x10bb/0x15f0 [btrfs]
   [ 3845.782194]  btrfs_get_tree.cold+0xb/0x16c [btrfs]
   [ 3845.782764]  ? fscontext_read+0x15c/0x180
   [ 3845.783202]  ? rw_verify_area+0x50/0x180
   [ 3845.783667]  vfs_get_tree+0x25/0xd0
   [ 3845.784047]  vfs_cmd_create+0x59/0xe0
   [ 3845.784458]  __do_sys_fsconfig+0x4f6/0x6b0
   [ 3845.784914]  do_syscall_64+0x50/0x1220
   [ 3845.785340]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
   [ 3845.785980] RIP: 0033:0x7f751fc7f4aa
   [ 3845.786759] Code: 73 01 c3 48 (...)
   [ 3845.789951] RSP: 002b:00007ffcdba45dc8 EFLAGS: 00000246 ORIG_RAX: 00000000000001af
   [ 3845.791402] RAX: ffffffffffffffda RBX: 000055ccc8291c20 RCX: 00007f751fc7f4aa
   [ 3845.792688] RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003
   [ 3845.794308] RBP: 000055ccc8292120 R08: 0000000000000000 R09: 0000000000000000
   [ 3845.795829] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
   [ 3845.797183] R13: 00007f751fe11580 R14: 00007f751fe1326c R15: 00007f751fdf8a23
   [ 3845.798633]  </TASK>
   [ 3845.799067] ---[ end trace 0000000000000000 ]---
   [ 3845.800215] BTRFS: error (device dm-0) in btrfs_commit_transaction:2553: errno=-5 IO failure (Error while writing out transaction)
   [ 3845.801860] BTRFS warning (device dm-0 state E): Skipping commit of aborted transaction.
   [ 3845.802815] BTRFS error (device dm-0 state EA): Transaction aborted (error -5)
   [ 3845.803728] BTRFS: error (device dm-0 state EA) in cleanup_transaction:2036: errno=-5 IO failure
   [ 3845.805374] BTRFS: error (device dm-0 state EA) in btrfs_replay_log:2083: errno=-5 IO failure (Failed to recover log tree)
   [ 3845.807919] BTRFS error (device dm-0 state EA): open_ctree failed: -5

Fix this by never logging a conflicting inode that is a directory and was
moved in the current transaction (its last_unlink_trans equals the current
transaction) and instead falling back to a transaction commit.

A test case for fstests will follow soon.

Reported-by: Vyacheslav Kovalevsky <slva.kovalevskiy.2014@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/7bbc9419-5c56-450a-b5a0-efeae7457113@gmail.com/
CC: stable@vger.kernel.org # 6.1+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-08 21:46:47 +01:00
Dan Carpenter
564d59410c btrfs: tests: fix double btrfs_path free in remove_extent_ref()
We converted this code to use auto free cleanup.h magic but one
remaining free was accidentally left behind, which leads to a double free
bug.

Fixes: a320476ca8a3 ("btrfs: tests: do trivial BTRFS_PATH_AUTO_FREE conversions")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-12-08 21:46:43 +01:00
Shakeel Butt
3309b63a22 cgroup: rstat: use LOCK CMPXCHG in css_rstat_updated
On x86-64, this_cpu_cmpxchg() uses CMPXCHG without the LOCK prefix, which
means it is only safe for the local CPU and not across multiple CPUs.
Recently, commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi
safe") made css_rstat_updated() lockless and used a lockless list to allow
reentrancy. Since css_rstat_updated() can be invoked from process context,
IRQ and NMI, it uses this_cpu_cmpxchg() to select the winner which will
insert the lockless lnode into the global per-cpu lockless list.

However, the commit missed one case where the lockless node of a cgroup
can be accessed and modified by another CPU doing the flushing: the
llist_del_first_init() in css_process_update_tree().

On a cursory look, it can be questioned how css_process_update_tree()
can see a lockless node in the global lockless list while the updater is at
this_cpu_cmpxchg() and before the llist_add() call in css_rstat_updated().
This can indeed happen in the presence of IRQs/NMIs.

Consider this scenario: an updater for cgroup stat C on CPU A in process
context is past the llist_on_list() check and before this_cpu_cmpxchg() in
css_rstat_updated() when it gets interrupted by an IRQ/NMI. In the IRQ/NMI
context, a new updater calls css_rstat_updated() for the same cgroup C and
successfully inserts rstatc_pcpu->lnode.

Now, concurrently, CPU B is running the flusher and calls
llist_del_first_init() for CPU A, getting cgroup C's rstatc_pcpu->lnode
which was added by the IRQ/NMI updater.

Now imagine CPU B calling init_llist_node() on cgroup C's
rstatc_pcpu->lnode of CPU A and on CPU A, the process context updater
calling this_cpu_cmpxchg(rstatc_pcpu->lnode) concurrently.

The CMPXCHG without LOCK on CPU A is not safe and thus we need the
LOCK prefix.

In Meta's fleet running the kernel with commit 36df6e3dbd7e, we are
observing on some machines that the memcg stats get skewed by more
than the actual memory on the system. On close inspection, we noticed
that the lockless node for a workload on a specific CPU was in a bad state
and thus all the updates on that CPU for that cgroup were being lost.

To confirm that this skew was indeed due to the CMPXCHG without LOCK in
css_rstat_updated(), we created a repro (using AI) at [1] which shows
that CMPXCHG without LOCK creates almost the same lnode corruption as
seen in Meta's fleet, while with LOCK CMPXCHG the issue does not
reproduce.
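The distinction at the heart of the fix, in general form (an illustrative
fragment with a made-up per-cpu variable, not the exact patch):

  static DEFINE_PER_CPU(unsigned long, pcp_slot);

  static void illustrate(unsigned long expected, unsigned long newval)
  {
          unsigned long old;

          /* CMPXCHG without LOCK: atomic only against the local CPU's
           * own interrupts/NMIs */
          old = this_cpu_cmpxchg(pcp_slot, expected, newval);

          /* LOCK CMPXCHG: atomic against all CPUs, required once a
           * remote flusher can write the same per-cpu location */
          old = cmpxchg(this_cpu_ptr(&pcp_slot), expected, newval);
          (void)old;
  }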

Link: http://lore.kernel.org/efiagdwmzfwpdzps74fvcwq3n4cs36q33ij7eebcpssactv3zu@se4hqiwxcfxq [1]
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: stable@vger.kernel.org # v6.17+
Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe")
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-08 08:26:56 -10:00
John Stultz
12b5cd99a0 sched/ext: Avoid null ptr traversal when ->put_prev_task() is called with NULL next
Early when trying to get sched_ext and proxy-exe working together,
I kept tripping over NULL ptr in put_prev_task_scx() on the line:
  if (sched_class_above(&ext_sched_class, next->sched_class)) {

Which was due to put_prev_task() passing a NULL next, calling:
  prev->sched_class->put_prev_task(rq, prev, NULL);

put_prev_task_scx() already guards for a NULL next in the
switch_class case, but doesn't seem to have a guard for the
sched_class_above() check.

I can't say I understand why this doesn't trip usually without
proxy-exec. And in newer kernels there are way fewer
put_prev_task(), and I can't easily reproduce the issue now
even with proxy-exec.

But we still have one put_prev_task() call left in core.c that
seems like it could trip this, so I wanted to send this out for
consideration.

tj: put_prev_task() can be called with NULL @next; however, when @p is
queued, that doesn't happen, so this condition shouldn't currently be
triggerable. The connection isn't straightforward or necessarily reliable,
so add the NULL check even if it can't currently be triggered.
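The resulting guard, per the description above (a sketch; the body shown
is the switch_class handling the message refers to, which may differ in
the actual tree):

  if (next && sched_class_above(&ext_sched_class, next->sched_class))
          switch_class(rq, next);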

Link: http://lkml.kernel.org/r/20251206022218.1541878-1-jstultz@google.com
Signed-off-by: John Stultz <jstultz@google.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-08 07:18:13 -10:00
Zqiang
517a44d185 sched_ext: Fix the memleak for sch->helper objects
This commit uses kthread_destroy_worker() to release the sch->helper
objects and fix the following kmemleak:

unreferenced object 0xffff888121ec7b00 (size 128):
  comm "scx_simple", pid 1197, jiffies 4295884415
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 ad 4e ad de  .............N..
    ff ff ff ff 00 00 00 00 ff ff ff ff ff ff ff ff  ................
  backtrace (crc 587b3352):
    kmemleak_alloc+0x62/0xa0
    __kmalloc_cache_noprof+0x28d/0x3e0
    kthread_create_worker_on_node+0xd5/0x1f0
    scx_enable.isra.210+0x6c2/0x25b0
    bpf_scx_reg+0x12/0x20
    bpf_struct_ops_link_create+0x2c3/0x3b0
    __sys_bpf+0x3102/0x4b00
    __x64_sys_bpf+0x79/0xc0
    x64_sys_call+0x15d9/0x1dd0
    do_syscall_64+0xf0/0x470
    entry_SYSCALL_64_after_hwframe+0x77/0x7f
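A sketch of the described fix (assuming sch->helper is the kthread_worker
allocated by the kthread_create_worker_on_node() call in the backtrace):

  /* stopping only the worker's task would leak the struct
   * kthread_worker allocation; destroying the worker stops the
   * kthread and frees the allocation itself */
  kthread_destroy_worker(sch->helper);
  sch->helper = NULL;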

Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched")
Cc: stable@vger.kernel.org # v6.16+
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-12-08 07:08:17 -10:00
18 changed files with 313 additions and 149 deletions

View File

@@ -2019,13 +2019,14 @@ out:
else
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space, true);
extent_changeset_free(data_reserved);
out_noreserve:
if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
sb_end_pagefault(inode->vfs_inode.i_sb);
extent_changeset_free(data_reserved);
if (ret < 0)
return vmf_error(ret);

View File

@@ -256,6 +256,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
if (ret < 0) {
btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
logical, ret);
btrfs_release_path(&path);
return;
}
eb = path.nodes[0];

View File

@@ -1243,14 +1243,7 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
/*
* At this point we either failed at allocating prealloc, or we
* succeeded and passed the ownership to it to add_qgroup_rb(). In any
* case, this needs to be NULL or there is something wrong.
*/
ASSERT(prealloc == NULL);
kfree(prealloc);
return ret;
}
@@ -1682,12 +1675,7 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* At this point we either failed at allocating prealloc, or we
* succeeded and passed the ownership to it to add_qgroup_rb(). In any
* case, this needs to be NULL or there is something wrong.
*/
ASSERT(prealloc == NULL);
kfree(prealloc);
return ret;
}
@@ -3279,7 +3267,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
struct btrfs_qgroup *prealloc = NULL;
struct btrfs_qgroup *prealloc;
struct btrfs_qgroup_list **qlist_prealloc = NULL;
bool free_inherit = false;
bool need_rescan = false;
@@ -3520,14 +3508,7 @@ out:
}
if (free_inherit)
kfree(inherit);
/*
* At this point we either failed at allocating prealloc, or we
* succeeded and passed the ownership to it to add_qgroup_rb(). In any
* case, this needs to be NULL or there is something wrong.
*/
ASSERT(prealloc == NULL);
kfree(prealloc);
return ret;
}

View File

@@ -187,7 +187,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
test_err("couldn't find backref %d", ret);
btrfs_free_path(path);
return ret;
}
btrfs_del_item(&trans, root, path);

View File

@@ -5865,14 +5865,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_inode *curr_inode = start_inode;
int ret = 0;
/*
* If we are logging a new name, as part of a link or rename operation,
* don't bother logging new dentries, as we just want to log the names
* of an inode and that any new parents exist.
*/
if (ctx->logging_new_name)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6051,6 +6043,33 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
return ret;
}
static bool can_log_conflicting_inode(const struct btrfs_trans_handle *trans,
const struct btrfs_inode *inode)
{
if (!S_ISDIR(inode->vfs_inode.i_mode))
return true;
if (inode->last_unlink_trans < trans->transid)
return true;
/*
* If this is a directory and its unlink_trans is not from a past
* transaction then we must fallback to a transaction commit in order
* to avoid getting a directory with 2 hard links after log replay.
*
* This happens if a directory A is renamed, moved from one parent
* directory to another one, a new file is created in the old parent
* directory with the old name of our directory A, the new file is
* fsynced, then we moved the new file to some other parent directory
* and fsync again the new file. This results in a log tree where we
* logged that directory A existed, with the INODE_REF item for the
* new location but without having logged its old parent inode, so
* that on log replay we add a new link for the new location but the
* old link remains, resulting in a link count of 2.
*/
return false;
}
static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -6154,6 +6173,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
return 0;
}
if (!can_log_conflicting_inode(trans, inode)) {
btrfs_add_delayed_iput(inode);
return BTRFS_LOG_FORCE_COMMIT;
}
btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
@@ -6218,6 +6242,12 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
break;
}
if (!can_log_conflicting_inode(trans, inode)) {
btrfs_add_delayed_iput(inode);
ret = BTRFS_LOG_FORCE_COMMIT;
break;
}
/*
* Always log the directory, we cannot make this
* conditional on need_log_inode() because the directory

View File

@@ -7128,6 +7128,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices->seeding = true;
fs_devices->opened = 1;
list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
return fs_devices;
}

View File

@@ -346,22 +346,22 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
* User space expects the directory offset value of the replaced
* (new) directory entry to be unchanged after a rename.
*
* Returns zero on success, a negative errno value on failure.
* Caller must have grabbed a slot for new_dentry in the maple_tree
* associated with new_dir, even if dentry is negative.
*/
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
long new_offset = dentry2offset(new_dentry);
simple_offset_remove(old_ctx, old_dentry);
if (WARN_ON(!new_offset))
return;
if (new_offset) {
offset_set(new_dentry, 0);
return simple_offset_replace(new_ctx, old_dentry, new_offset);
}
return simple_offset_add(new_ctx, old_dentry);
simple_offset_remove(old_ctx, old_dentry);
offset_set(new_dentry, 0);
WARN_ON(simple_offset_replace(new_ctx, old_dentry, new_offset));
}
/**
@@ -388,31 +388,23 @@ int simple_offset_rename_exchange(struct inode *old_dir,
long new_index = dentry2offset(new_dentry);
int ret;
simple_offset_remove(old_ctx, old_dentry);
simple_offset_remove(new_ctx, new_dentry);
if (WARN_ON(!old_index || !new_index))
return -EINVAL;
ret = simple_offset_replace(new_ctx, old_dentry, new_index);
if (ret)
goto out_restore;
ret = mtree_store(&new_ctx->mt, new_index, old_dentry, GFP_KERNEL);
if (WARN_ON(ret))
return ret;
ret = simple_offset_replace(old_ctx, new_dentry, old_index);
if (ret) {
simple_offset_remove(new_ctx, old_dentry);
goto out_restore;
ret = mtree_store(&old_ctx->mt, old_index, new_dentry, GFP_KERNEL);
if (WARN_ON(ret)) {
mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
return ret;
}
ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
if (ret) {
simple_offset_remove(new_ctx, old_dentry);
simple_offset_remove(old_ctx, new_dentry);
goto out_restore;
}
offset_set(old_dentry, new_index);
offset_set(new_dentry, old_index);
simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
return 0;
out_restore:
(void)simple_offset_replace(old_ctx, old_dentry, old_index);
(void)simple_offset_replace(new_ctx, new_dentry, new_index);
return ret;
}
/**

View File

@@ -132,6 +132,14 @@ struct smbdirect_socket {
struct smbdirect_socket_parameters parameters;
/*
* The state for connect/negotiation
*/
struct {
spinlock_t lock;
struct work_struct work;
} connect;
/*
* The state for keepalive and timeout handling
*/
@@ -353,6 +361,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->disconnect_work);
spin_lock_init(&sc->connect.lock);
INIT_WORK(&sc->connect.work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->connect.work);
INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->idle.immediate_work);
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);

View File

@@ -325,8 +325,10 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
sess = ksmbd_session_lookup(conn, id);
if (!sess && conn->binding)
sess = ksmbd_session_lookup_slowpath(id);
if (sess && sess->state != SMB2_SESSION_VALID)
if (sess && sess->state != SMB2_SESSION_VALID) {
ksmbd_user_session_put(sess);
sess = NULL;
}
return sess;
}

View File

@@ -2363,7 +2363,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
int rc = 0;
unsigned int next = 0;
if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + 1 +
le16_to_cpu(eabuf->EaValueLength))
return -EINVAL;
@@ -2440,7 +2440,7 @@ next:
break;
}
if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + 1 +
le16_to_cpu(eabuf->EaValueLength)) {
rc = -EINVAL;
break;

View File

@@ -1307,9 +1307,6 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
granted |= le32_to_cpu(ace->access_req);
ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
}
if (!pdacl->num_aces)
granted = GENERIC_ALL_FLAGS;
}
if (!uid)

View File

@@ -242,6 +242,7 @@ static void smb_direct_disconnect_rdma_work(struct work_struct *work)
* disable[_delayed]_work_sync()
*/
disable_work(&sc->disconnect_work);
disable_work(&sc->connect.work);
disable_work(&sc->recv_io.posted.refill_work);
disable_delayed_work(&sc->idle.timer_work);
disable_work(&sc->idle.immediate_work);
@@ -297,6 +298,7 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
* not queued again but here we don't block and avoid
* disable[_delayed]_work_sync()
*/
disable_work(&sc->connect.work);
disable_work(&sc->recv_io.posted.refill_work);
disable_work(&sc->idle.immediate_work);
disable_delayed_work(&sc->idle.timer_work);
@@ -467,6 +469,7 @@ static void free_transport(struct smb_direct_transport *t)
*/
smb_direct_disconnect_wake_up_all(sc);
disable_work_sync(&sc->connect.work);
disable_work_sync(&sc->recv_io.posted.refill_work);
disable_delayed_work_sync(&sc->idle.timer_work);
disable_work_sync(&sc->idle.immediate_work);
@@ -635,28 +638,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (sc->recv_io.expected) {
case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) {
put_recvmsg(sc, recvmsg);
smb_direct_disconnect_rdma_connection(sc);
return;
}
sc->recv_io.reassembly.full_packet_received = true;
/*
* Some drivers (at least mlx5_ib) might post a
* recv completion before RDMA_CM_EVENT_ESTABLISHED,
* we need to adjust our expectation in that case.
*/
if (!sc->first_error && sc->status == SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) {
put_recvmsg(sc, recvmsg);
smb_direct_disconnect_rdma_connection(sc);
return;
}
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
enqueue_reassembly(sc, recvmsg, 0);
wake_up(&sc->status_wait);
return;
/* see smb_direct_negotiate_recv_done */
break;
case SMBDIRECT_EXPECT_DATA_TRANSFER: {
struct smbdirect_data_transfer *data_transfer =
(struct smbdirect_data_transfer *)recvmsg->packet;
@@ -742,6 +725,126 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
smb_direct_disconnect_rdma_connection(sc);
}
static void smb_direct_negotiate_recv_work(struct work_struct *work);
static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_recv_io *recv_io =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = recv_io->socket;
unsigned long flags;
/*
* reset the common recv_done for later reuse.
*/
recv_io->cqe.done = recv_done;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
put_recvmsg(sc, recv_io);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
smb_direct_disconnect_rdma_connection(sc);
}
return;
}
ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
ib_dma_sync_single_for_cpu(sc->ib.dev,
recv_io->sge.addr,
recv_io->sge.length,
DMA_FROM_DEVICE);
/*
* This is an internal error!
*/
if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) {
put_recvmsg(sc, recv_io);
smb_direct_disconnect_rdma_connection(sc);
return;
}
/*
* Don't reset timer to the keepalive interval in
* this will be done in smb_direct_negotiate_recv_work.
*/
/*
* Only remember the recv_io if it has enough bytes,
* this gives smb_direct_negotiate_recv_work enough
* information in order to disconnect if it was not
* valid.
*/
sc->recv_io.reassembly.full_packet_received = true;
if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req))
enqueue_reassembly(sc, recv_io, 0);
else
put_recvmsg(sc, recv_io);
/*
* Some drivers (at least mlx5_ib and irdma in roce mode)
* might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
* we need to adjust our expectation in that case.
*
* So we defer further processing of the negotiation
* to smb_direct_negotiate_recv_work().
*
* If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
* we queue the work directly otherwise
* smb_direct_cm_handler() will do it, when
* RDMA_CM_EVENT_ESTABLISHED arrived.
*/
spin_lock_irqsave(&sc->connect.lock, flags);
if (!sc->first_error) {
INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work);
if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)
queue_work(sc->workqueue, &sc->connect.work);
}
spin_unlock_irqrestore(&sc->connect.lock, flags);
}
static void smb_direct_negotiate_recv_work(struct work_struct *work)
{
struct smbdirect_socket *sc =
container_of(work, struct smbdirect_socket, connect.work);
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io;
if (sc->first_error)
return;
ksmbd_debug(RDMA, "Negotiate Recv Work running\n");
/*
* Reset timer to the keepalive interval in
* order to trigger our next keepalive message.
*/
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
msecs_to_jiffies(sp->keepalive_interval_msec));
/*
* If smb_direct_negotiate_recv_done() detected an
* invalid request we want to disconnect.
*/
recv_io = get_first_reassembly(sc);
if (!recv_io) {
smb_direct_disconnect_rdma_connection(sc);
return;
}
if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) {
smb_direct_disconnect_rdma_connection(sc);
return;
}
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
wake_up(&sc->status_wait);
}
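
The rearm above works because mod_delayed_work() moves the expiry of an already-pending delayed work instead of queueing a second instance. A one-function sketch of that idiom, with hypothetical names:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* Push the keepalive deadline out to "now + interval" on activity. */
static void my_rearm_keepalive(struct workqueue_struct *wq,
			       struct delayed_work *dw,
			       unsigned int interval_msec)
{
	mod_delayed_work(wq, dw, msecs_to_jiffies(interval_msec));
}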
static int smb_direct_post_recv(struct smbdirect_socket *sc,
struct smbdirect_recv_io *recvmsg)
{
@@ -758,7 +861,6 @@ static int smb_direct_post_recv(struct smbdirect_socket *sc,
return ret;
recvmsg->sge.length = sp->max_recv_size;
recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey;
recvmsg->cqe.done = recv_done;
wr.wr_cqe = &recvmsg->cqe;
wr.next = NULL;
@@ -1732,6 +1834,7 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
struct smbdirect_socket *sc = cm_id->context;
unsigned long flags;
ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
cm_id, rdma_event_msg(event->event), event->event);
@@ -1739,18 +1842,27 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED: {
/*
* Some drivers (at least mlx5_ib) might post a
* recv completion before RDMA_CM_EVENT_ESTABLISHED,
* Some drivers (at least mlx5_ib and irdma in roce mode)
* might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
* so we need to adjust our expectation in that case.
*
* As we already started the negotiation, we just
* ignore RDMA_CM_EVENT_ESTABLISHED here.
* If smb_direct_negotiate_recv_done() was called first,
* it initialized sc->connect.work and left queueing it
* to us, so that we move to
* SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
* smb_direct_negotiate_recv_work() runs.
*
* If smb_direct_negotiate_recv_done() hasn't happened
* yet, sc->connect.work is still disabled and
* queue_work() is a no-op.
*/
if (!sc->first_error && sc->status > SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)
break;
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
break;
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
spin_lock_irqsave(&sc->connect.lock, flags);
if (!sc->first_error)
queue_work(sc->workqueue, &sc->connect.work);
spin_unlock_irqrestore(&sc->connect.lock, flags);
wake_up(&sc->status_wait);
break;
}
@@ -1921,6 +2033,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
recvmsg = get_free_recvmsg(sc);
if (!recvmsg)
return -ENOMEM;
recvmsg->cqe.done = smb_direct_negotiate_recv_done;
ret = smb_direct_post_recv(sc, recvmsg);
if (ret) {
@@ -2339,6 +2452,7 @@ respond:
static int smb_direct_connect(struct smbdirect_socket *sc)
{
struct smbdirect_recv_io *recv_io;
int ret;
ret = smb_direct_init_params(sc);
@@ -2353,6 +2467,9 @@ static int smb_direct_connect(struct smbdirect_socket *sc)
return ret;
}
list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
recv_io->cqe.done = recv_done;
ret = smb_direct_create_qpair(sc);
if (ret) {
pr_err("Can't accept RDMA client: %d\n", ret);

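Taken together, the last two hunks pre-set the common recv_done handler on every free recv buffer at connect time and override only the first posted buffer with the negotiate handler. The dispatch relies on the CQ completion path invoking wc->wr_cqe->done(). A sketch of the override idiom, with hypothetical names (my_recv, my_data_done, my_negotiate_done), not the ksmbd code itself:

#include <linux/printk.h>
#include <rdma/ib_verbs.h>

struct my_recv {
	struct ib_cqe cqe;	/* cqe.done is the per-WR completion hook */
};

static void my_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct my_recv *r = container_of(wc->wr_cqe, struct my_recv, cqe);

	pr_debug("data recv %p\n", r);	/* steady-state handling elided */
}

static void my_negotiate_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct my_recv *r = container_of(wc->wr_cqe, struct my_recv, cqe);

	r->cqe.done = my_data_done;	/* one-shot: reuse falls back to the
					 * common handler */
	pr_debug("negotiate recv %p\n", r);	/* negotiate handling elided */
}

Setting cqe.done once at setup, and resetting it inside the one-shot handler, avoids re-assigning it on every repost.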

@@ -702,7 +702,7 @@ retry:
rd.old_parent = NULL;
rd.new_parent = new_path.dentry;
rd.flags = flags;
rd.delegated_inode = NULL,
rd.delegated_inode = NULL;
err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last);
if (err)
goto out_drop_write;


@@ -3247,7 +3247,7 @@ struct offset_ctx {
void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
struct dentry *old_dentry,


@@ -71,7 +71,6 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
struct llist_head *lhead;
struct css_rstat_cpu *rstatc;
struct css_rstat_cpu __percpu *rstatc_pcpu;
struct llist_node *self;
/*
@@ -104,18 +103,22 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
/*
* This function can be reentered by irqs and nmis for the same cgroup
* and may try to insert the same per-cpu lnode into the llist. Note
* that llist_add() does not protect against such scenarios.
* that llist_add() does not protect against such scenarios. In addition
* this same per-cpu lnode can be modified through init_llist_node()
* from css_rstat_flush() running on a different CPU.
*
* To protect against such stacked contexts of irqs/nmis, we use the
* fact that lnode points to itself when not on a list and then use
* this_cpu_cmpxchg() to atomically set to NULL to select the winner
* try_cmpxchg() to atomically set to NULL to select the winner
* which will call llist_add(). The losers can assume the insertion is
* successful and the winner will eventually add the per-cpu lnode to
* the llist.
*
* Please note that we cannot use this_cpu_cmpxchg() here as on some
* archs it is not safe against modifications from multiple CPUs.
*/
self = &rstatc->lnode;
rstatc_pcpu = css->rstat_cpu;
if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL))
return;
lhead = ss_lhead_cpu(css->ss, cpu);

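The hunk above leans on the fact that an off-list llist node points at itself (init_llist_node() parks it that way), so stacked irq/nmi contexts and concurrent updaters can all race to swap that self pointer to NULL, and exactly one of them wins. A minimal sketch of the claim idiom, with hypothetical names, separate from the cgroup code:

#include <linux/atomic.h>
#include <linux/llist.h>

/* Returns true for the single caller that claimed the node and added it. */
static bool my_claim_and_add(struct llist_node *node, struct llist_head *head)
{
	struct llist_node *self = node;	/* expected off-list value */

	if (!try_cmpxchg(&node->next, &self, NULL))
		return false;	/* lost the race; the winner will add it */

	llist_add(node, head);	/* sole winner inserts the node */
	return true;
}

Unlike this_cpu_cmpxchg(), try_cmpxchg() is atomic against modifications from other CPUs, which is the property the reworded comment calls out.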

@@ -41,6 +41,13 @@ static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
/*
* Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
* depth on enable failure. Will be removed when bypass depth is moved into the
* sched instance.
*/
static bool scx_bypassed_for_enable;
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -975,6 +982,30 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
u64 enq_flags)
{
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
bool preempt = false;
/*
* If @rq is in balance, the CPU is already vacant and looking for the
* next task to run. No need to preempt or trigger resched after moving
* @p into its local DSQ.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE)
return;
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
preempt = true;
}
if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
resched_curr(rq);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
struct task_struct *p, u64 enq_flags)
{
@@ -1086,22 +1117,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
if (enq_flags & SCX_ENQ_CLEAR_OPSS)
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
if (is_local) {
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
bool preempt = false;
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
preempt = true;
}
if (preempt || sched_class_above(&ext_sched_class,
rq->curr->sched_class))
resched_curr(rq);
} else {
if (is_local)
local_dsq_post_enq(dsq, p, enq_flags);
else
raw_spin_unlock(&dsq->lock);
}
}
static void task_unlink_from_dsq(struct task_struct *p,
@@ -1625,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_mod_nr(dst_dsq, 1);
p->scx.dsq = dst_dsq;
local_dsq_post_enq(dst_dsq, p, enq_flags);
}
/**
@@ -2402,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* ops.enqueue() that @p is the only one available for this cpu,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
@@ -2425,7 +2446,7 @@ static struct task_struct *
do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
struct task_struct *prev = rq->curr;
bool keep_prev, kick_idle = false;
bool keep_prev;
struct task_struct *p;
/* see kick_cpus_irq_workfn() */
@@ -2467,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
scx_kick_cpu(rcu_dereference_sched(scx_root),
cpu_of(rq), SCX_KICK_IDLE);
if (!p)
return NULL;
}
if (unlikely(!p->scx.slice)) {
struct scx_sched *sch = rcu_dereference_sched(scx_root);
@@ -3575,7 +3592,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
irq_work_sync(&sch->error_irq_work);
kthread_stop(sch->helper->task);
kthread_destroy_worker(sch->helper);
free_percpu(sch->pcpu);
@@ -4318,6 +4335,11 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_max_batch = 0;
free_kick_syncs();
if (scx_bypassed_for_enable) {
scx_bypassed_for_enable = false;
scx_bypass(false);
}
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4786,7 +4808,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
return sch;
err_stop_helper:
kthread_stop(sch->helper->task);
kthread_destroy_worker(sch->helper);
err_free_pcpu:
free_percpu(sch->pcpu);
err_free_gdsqs:
@@ -4970,6 +4992,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* Init in bypass mode to guarantee forward progress.
*/
scx_bypass(true);
scx_bypassed_for_enable = true;
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5067,6 +5090,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
scx_bypassed_for_enable = false;
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {

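The scx_bypassed_for_enable hunks pair up across three sites: the flag is set right after scx_bypass(true) in scx_enable(), cleared just before the balancing scx_bypass(false) on the success path, and consumed by scx_disable_workfn() when enable fails midway. A condensed sketch of that shape, with hypothetical names and a stub depth counter standing in for the real bypass machinery:

static int my_bypass_depth;

static void my_bypass(bool bypass)	/* stands in for scx_bypass() */
{
	my_bypass_depth += bypass ? 1 : -1;
}

static bool my_bypassed_for_enable;	/* serialized by the enable mutex */

static void my_disable(void)
{
	/* ... teardown ... */
	if (my_bypassed_for_enable) {
		my_bypassed_for_enable = false;
		my_bypass(false);	/* enable failed: re-balance depth */
	}
}

static int my_enable(void)
{
	my_bypass(true);		/* init under bypass for forward progress */
	my_bypassed_for_enable = true;

	/* ... setup that may fail and route through my_disable() ... */

	my_bypassed_for_enable = false;
	my_bypass(false);		/* success path balances here */
	return 0;
}

Whichever path runs, the depth ends where it started, which is the invariant the new flag protects.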

@@ -4019,22 +4019,10 @@ static int shmem_whiteout(struct mnt_idmap *idmap,
whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
if (!whiteout)
return -ENOMEM;
error = shmem_mknod(idmap, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
return error;
/*
* Cheat and hash the whiteout while the old dentry is still in
* place, instead of playing games with FS_RENAME_DOES_D_MOVE.
*
* d_lookup() will consistently find one of them at this point,
* not sure which one, but that isn't even important.
*/
d_rehash(whiteout);
return 0;
return error;
}
/*
@@ -4050,6 +4038,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
bool had_offset = false;
int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -4062,16 +4051,23 @@ static int shmem_rename2(struct mnt_idmap *idmap,
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
if (error)
error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry);
if (error == -EBUSY)
had_offset = true;
else if (unlikely(error))
return error;
if (flags & RENAME_WHITEOUT) {
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error) {
if (!had_offset)
simple_offset_remove(shmem_get_offset_ctx(new_dir),
new_dentry);
return error;
}
}
simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {


@@ -46,6 +46,14 @@ static void print_test_preamble(const struct scx_test *test, bool quiet)
if (!quiet)
printf("DESCRIPTION: %s\n", test->description);
printf("OUTPUT:\n");
/*
* The tests may fork with the preamble buffered
* in the children's stdout. Flush before the test
* to avoid printing the message multiple times.
*/
fflush(stdout);
fflush(stderr);
}
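
The flush matters because fork() duplicates the parent's stdio buffers: with stdout fully buffered (redirected to a file or pipe), an unflushed preamble is written out once per process as each one exits. A standalone userspace demonstration, separate from the selftest harness:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	printf("preamble\n");	/* sits in the stdio buffer while stdout
				 * is fully buffered */
	fflush(stdout);		/* remove this line, run with stdout
				 * redirected, and "preamble" appears twice */

	if (fork() == 0)
		exit(0);	/* exit() flushes the child's buffer copy */

	wait(NULL);
	return 0;
}

Running "./a.out > out" prints one "preamble" line; without the fflush() it prints two, one from each process.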
static const char *status_to_result(enum scx_test_status status)