mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-01-11 09:00:12 +00:00
ctx->tcxt_list holds the tasks using this ring, and it's currently protected by the normal ctx->uring_lock. However, this can cause a circular locking issue, as reported by syzbot, where cancelations off exec end up needing to remove an entry from this list: ====================================================== WARNING: possible circular locking dependency detected syzkaller #0 Tainted: G L ------------------------------------------------------ syz.0.9999/12287 is trying to acquire lock: ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 but task is already holding lock: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}: __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837 vfs_write+0x27e/0xb30 fs/read_write.c:684 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (sb_writers#3){.+.+}-{0:0}: percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline] percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline] __sb_start_write include/linux/fs/super.h:19 [inline] sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125 mnt_want_write+0x41/0x90 fs/namespace.c:499 open_last_lookups fs/namei.c:4529 [inline] path_openat+0xadd/0x3dd0 fs/namei.c:4784 do_filp_open+0x1fa/0x410 fs/namei.c:4814 io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143 __io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792 io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815 io_queue_sqe 
io_uring/io_uring.c:2042 [inline] io_submit_sqe io_uring/io_uring.c:2320 [inline] io_submit_sqes+0xbf4/0x2140 io_uring/io_uring.c:2434 __do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline] __se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&ctx->uring_lock){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: &ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sig->cred_guard_mutex); lock(sb_writers#3); lock(&sig->cred_guard_mutex); lock(&ctx->uring_lock); *** DEADLOCK *** 1 lock held by syz.0.9999/12287: #0: ffff88802db5a2e0 
(&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 stack backtrace: CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G L syzkaller #0 PREEMPT(full) Tainted: [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 Call Trace: <TASK> dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043 check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff3a8b8f749 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 
002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 00007ff3a8b8f749 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400 RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28 </TASK> Add a separate lock just for the tctx_list, tctx_lock. This can nest under ->uring_lock, where necessary, and be used separately for list manipulation. For the cancelation off exec side, this removes the need to grab ->uring_lock, hence fixing the circular locking dependency. Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
354 lines
7.7 KiB
C
354 lines
7.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/file.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/nospec.h>
|
|
#include <linux/io_uring.h>
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
#include "io_uring.h"
|
|
#include "tctx.h"
|
|
|
|
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
|
|
struct task_struct *task)
|
|
{
|
|
struct io_wq_hash *hash;
|
|
struct io_wq_data data;
|
|
unsigned int concurrency;
|
|
|
|
mutex_lock(&ctx->uring_lock);
|
|
hash = ctx->hash_map;
|
|
if (!hash) {
|
|
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
|
|
if (!hash) {
|
|
mutex_unlock(&ctx->uring_lock);
|
|
return ERR_PTR(-ENOMEM);
|
|
}
|
|
refcount_set(&hash->refs, 1);
|
|
init_waitqueue_head(&hash->wait);
|
|
ctx->hash_map = hash;
|
|
}
|
|
mutex_unlock(&ctx->uring_lock);
|
|
|
|
data.hash = hash;
|
|
data.task = task;
|
|
|
|
/* Do QD, or 4 * CPUS, whatever is smallest */
|
|
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
|
|
|
|
return io_wq_create(concurrency, &data);
|
|
}
|
|
|
|
void __io_uring_free(struct task_struct *tsk)
|
|
{
|
|
struct io_uring_task *tctx = tsk->io_uring;
|
|
struct io_tctx_node *node;
|
|
unsigned long index;
|
|
|
|
/*
|
|
* Fault injection forcing allocation errors in the xa_store() path
|
|
* can lead to xa_empty() returning false, even though no actual
|
|
* node is stored in the xarray. Until that gets sorted out, attempt
|
|
* an iteration here and warn if any entries are found.
|
|
*/
|
|
xa_for_each(&tctx->xa, index, node) {
|
|
WARN_ON_ONCE(1);
|
|
break;
|
|
}
|
|
WARN_ON_ONCE(tctx->io_wq);
|
|
WARN_ON_ONCE(tctx->cached_refs);
|
|
|
|
percpu_counter_destroy(&tctx->inflight);
|
|
kfree(tctx);
|
|
tsk->io_uring = NULL;
|
|
}
|
|
|
|
__cold int io_uring_alloc_task_context(struct task_struct *task,
|
|
struct io_ring_ctx *ctx)
|
|
{
|
|
struct io_uring_task *tctx;
|
|
int ret;
|
|
|
|
tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
|
|
if (unlikely(!tctx))
|
|
return -ENOMEM;
|
|
|
|
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
|
|
if (unlikely(ret)) {
|
|
kfree(tctx);
|
|
return ret;
|
|
}
|
|
|
|
tctx->io_wq = io_init_wq_offload(ctx, task);
|
|
if (IS_ERR(tctx->io_wq)) {
|
|
ret = PTR_ERR(tctx->io_wq);
|
|
percpu_counter_destroy(&tctx->inflight);
|
|
kfree(tctx);
|
|
return ret;
|
|
}
|
|
|
|
tctx->task = task;
|
|
xa_init(&tctx->xa);
|
|
init_waitqueue_head(&tctx->wait);
|
|
atomic_set(&tctx->in_cancel, 0);
|
|
atomic_set(&tctx->inflight_tracked, 0);
|
|
task->io_uring = tctx;
|
|
init_llist_head(&tctx->task_list);
|
|
init_task_work(&tctx->task_work, tctx_task_work);
|
|
return 0;
|
|
}
|
|
|
|
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
|
|
{
|
|
struct io_uring_task *tctx = current->io_uring;
|
|
struct io_tctx_node *node;
|
|
int ret;
|
|
|
|
if (unlikely(!tctx)) {
|
|
ret = io_uring_alloc_task_context(current, ctx);
|
|
if (unlikely(ret))
|
|
return ret;
|
|
|
|
tctx = current->io_uring;
|
|
if (ctx->iowq_limits_set) {
|
|
unsigned int limits[2] = { ctx->iowq_limits[0],
|
|
ctx->iowq_limits[1], };
|
|
|
|
ret = io_wq_max_workers(tctx->io_wq, limits);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
}
|
|
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
|
|
node = kmalloc(sizeof(*node), GFP_KERNEL);
|
|
if (!node)
|
|
return -ENOMEM;
|
|
node->ctx = ctx;
|
|
node->task = current;
|
|
|
|
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
|
|
node, GFP_KERNEL));
|
|
if (ret) {
|
|
kfree(node);
|
|
return ret;
|
|
}
|
|
|
|
mutex_lock(&ctx->tctx_lock);
|
|
list_add(&node->ctx_node, &ctx->tctx_list);
|
|
mutex_unlock(&ctx->tctx_lock);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
|
|
{
|
|
int ret;
|
|
|
|
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
|
|
&& ctx->submitter_task != current)
|
|
return -EEXIST;
|
|
|
|
ret = __io_uring_add_tctx_node(ctx);
|
|
if (ret)
|
|
return ret;
|
|
|
|
current->io_uring->last = ctx;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Remove this io_uring_file -> task mapping.
|
|
*/
|
|
__cold void io_uring_del_tctx_node(unsigned long index)
|
|
{
|
|
struct io_uring_task *tctx = current->io_uring;
|
|
struct io_tctx_node *node;
|
|
|
|
if (!tctx)
|
|
return;
|
|
node = xa_erase(&tctx->xa, index);
|
|
if (!node)
|
|
return;
|
|
|
|
WARN_ON_ONCE(current != node->task);
|
|
WARN_ON_ONCE(list_empty(&node->ctx_node));
|
|
|
|
mutex_lock(&node->ctx->tctx_lock);
|
|
list_del(&node->ctx_node);
|
|
mutex_unlock(&node->ctx->tctx_lock);
|
|
|
|
if (tctx->last == node->ctx)
|
|
tctx->last = NULL;
|
|
kfree(node);
|
|
}
|
|
|
|
__cold void io_uring_clean_tctx(struct io_uring_task *tctx)
|
|
{
|
|
struct io_wq *wq = tctx->io_wq;
|
|
struct io_tctx_node *node;
|
|
unsigned long index;
|
|
|
|
xa_for_each(&tctx->xa, index, node) {
|
|
io_uring_del_tctx_node(index);
|
|
cond_resched();
|
|
}
|
|
if (wq) {
|
|
/*
|
|
* Must be after io_uring_del_tctx_node() (removes nodes under
|
|
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
|
|
*/
|
|
io_wq_put_and_exit(wq);
|
|
tctx->io_wq = NULL;
|
|
}
|
|
}
|
|
|
|
void io_uring_unreg_ringfd(void)
|
|
{
|
|
struct io_uring_task *tctx = current->io_uring;
|
|
int i;
|
|
|
|
for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
|
|
if (tctx->registered_rings[i]) {
|
|
fput(tctx->registered_rings[i]);
|
|
tctx->registered_rings[i] = NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
|
|
int start, int end)
|
|
{
|
|
int offset;
|
|
for (offset = start; offset < end; offset++) {
|
|
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
|
|
if (tctx->registered_rings[offset])
|
|
continue;
|
|
|
|
tctx->registered_rings[offset] = file;
|
|
return offset;
|
|
}
|
|
return -EBUSY;
|
|
}
|
|
|
|
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
|
|
int start, int end)
|
|
{
|
|
struct file *file;
|
|
int offset;
|
|
|
|
file = fget(fd);
|
|
if (!file) {
|
|
return -EBADF;
|
|
} else if (!io_is_uring_fops(file)) {
|
|
fput(file);
|
|
return -EOPNOTSUPP;
|
|
}
|
|
offset = io_ring_add_registered_file(tctx, file, start, end);
|
|
if (offset < 0)
|
|
fput(file);
|
|
return offset;
|
|
}
|
|
|
|
/*
|
|
* Register a ring fd to avoid fdget/fdput for each io_uring_enter()
|
|
* invocation. User passes in an array of struct io_uring_rsrc_update
|
|
* with ->data set to the ring_fd, and ->offset given for the desired
|
|
* index. If no index is desired, application may set ->offset == -1U
|
|
* and we'll find an available index. Returns number of entries
|
|
* successfully processed, or < 0 on error if none were processed.
|
|
*/
|
|
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
|
|
unsigned nr_args)
|
|
{
|
|
struct io_uring_rsrc_update __user *arg = __arg;
|
|
struct io_uring_rsrc_update reg;
|
|
struct io_uring_task *tctx;
|
|
int ret, i;
|
|
|
|
if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
|
|
return -EINVAL;
|
|
|
|
mutex_unlock(&ctx->uring_lock);
|
|
ret = __io_uring_add_tctx_node(ctx);
|
|
mutex_lock(&ctx->uring_lock);
|
|
if (ret)
|
|
return ret;
|
|
|
|
tctx = current->io_uring;
|
|
for (i = 0; i < nr_args; i++) {
|
|
int start, end;
|
|
|
|
if (copy_from_user(®, &arg[i], sizeof(reg))) {
|
|
ret = -EFAULT;
|
|
break;
|
|
}
|
|
|
|
if (reg.resv) {
|
|
ret = -EINVAL;
|
|
break;
|
|
}
|
|
|
|
if (reg.offset == -1U) {
|
|
start = 0;
|
|
end = IO_RINGFD_REG_MAX;
|
|
} else {
|
|
if (reg.offset >= IO_RINGFD_REG_MAX) {
|
|
ret = -EINVAL;
|
|
break;
|
|
}
|
|
start = reg.offset;
|
|
end = start + 1;
|
|
}
|
|
|
|
ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
|
|
if (ret < 0)
|
|
break;
|
|
|
|
reg.offset = ret;
|
|
if (copy_to_user(&arg[i], ®, sizeof(reg))) {
|
|
fput(tctx->registered_rings[reg.offset]);
|
|
tctx->registered_rings[reg.offset] = NULL;
|
|
ret = -EFAULT;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return i ? i : ret;
|
|
}
|
|
|
|
/*
 * Unregister previously registered ring fds for the current task. User
 * passes in an array of struct io_uring_rsrc_update where ->offset is the
 * registered index to drop; ->resv and ->data must be zero and ->offset
 * must be below IO_RINGFD_REG_MAX. An index with nothing registered is
 * accepted and counted as processed.
 *
 * @nr_args must be between 1 and IO_RINGFD_REG_MAX. Returns 0 if the task
 * has no io_uring context, otherwise the number of entries processed, or
 * < 0 on error if none were processed.
 */
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
			 unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_task *tctx = current->io_uring;
	struct io_uring_rsrc_update reg;
	int ret = 0, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;
	if (!tctx)
		return 0;

	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}
		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
			ret = -EINVAL;
			break;
		}

		/* Range was checked above; clamp against speculation too */
		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[reg.offset]) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
		}
	}

	return i ? i : ret;
}
|