commit c265ae75f9 ("io_uring: introduce io_uring querying")
Author: Pavel Begunkov <asml.silence@gmail.com>
There are many parameters users might want to query about io_uring, such
as the available request types or the ring sizes. This patch introduces an
interface for such slow-path queries.

It was written with several requirements in mind:
- Can be used with or without an io_uring instance. Asking for supported
  setup flags before creating an instance as well as querying info about
  an already created ring are both valid use cases.
- Should be moderately fast. For example, users might use it to
  periodically retrieve ring attributes at runtime. As a consequence,
  it should be able to query multiple attributes in a single syscall.
- Backward and forward compatible.
- Should be reasonably easy to use.
- Reduce the kernel code size for introducing new query types.

It's implemented as a new registration opcode, IORING_REGISTER_QUERY.
The user passes one or more query structures linked together, each
represented by struct io_uring_query_hdr. The header stores the common
control fields needed for processing and points to the query type
specific information.

The header contains the following fields (a sketch of the layout follows
this list):
- The query type
- The result field, which on return contains the error code for the query
- A pointer to the query type specific information
- The size of the query structure. The kernel will only populate up to
  that size, which helps with backward compatibility. The kernel can also
  reduce the size, so if the running kernel is older than the interface
  the user tries to use, the user will only get the supported bits.
- The next_entry field, which is used to chain multiple queries.
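
To make the layout concrete, here is a minimal sketch of what such a header
could look like. The field names and ordering are illustrative, derived from
the description above; the authoritative definition is the one added to the
uapi header by this patch.

struct io_uring_query_hdr {
	__u64 next_entry;   /* user pointer to the next header, 0 ends the chain */
	__u64 query_data;   /* user pointer to the query type specific struct */
	__u32 query_op;     /* query type, e.g. IO_URING_QUERY_OPCODES */
	__u32 size;         /* size of the struct behind query_data, clamped by the kernel */
	__s32 result;       /* per-query error code, filled in on return */
	__u32 __resv[3];    /* reserved, must be zero (illustrative) */
};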

Apart from common registration syscall failures, it can only immediately
return an error code when the headers are incorrect or any other
addresses are invalid. That usually means that userspace is not using
the API correctly and should be fixed. All query type specific errors
are returned in the header's result field.

As an example, the patch adds a single query type for now,
IO_URING_QUERY_OPCODES, which reports which register / request / etc.
opcodes are supported; there are concrete plans to extend it.
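
For illustration, a userspace sketch of running that query without creating a
ring first (fd == -1 takes the blind registration path in register.c below).
Only IORING_REGISTER_QUERY, IO_URING_QUERY_OPCODES and the fd == -1 convention
come from this patch; the io_uring_query_opcode result layout, the header
field names and the nr_args value of 0 are assumptions made for the example.
If your kernel headers already provide these definitions, use those instead.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

/* Assumed result layout for IO_URING_QUERY_OPCODES; illustrative only. */
struct io_uring_query_opcode {
	__u32 nr_request_opcodes;	/* how many request opcodes this kernel knows */
	__u32 nr_register_opcodes;	/* how many register opcodes this kernel knows */
};

int main(void)
{
	struct io_uring_query_opcode ops;
	struct io_uring_query_hdr hdr;

	memset(&ops, 0, sizeof(ops));
	memset(&hdr, 0, sizeof(hdr));
	hdr.query_op = IO_URING_QUERY_OPCODES;
	hdr.query_data = (__u64)(unsigned long)&ops;
	hdr.size = sizeof(ops);
	/* hdr.next_entry == 0: this is the only query in the chain */

	/* fd == -1: no ring needed, handled by io_uring_register_blind() */
	if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY, &hdr, 0) < 0) {
		perror("io_uring_register");
		return 1;
	}
	if (hdr.result < 0) {
		fprintf(stderr, "query failed: %d\n", hdr.result);
		return 1;
	}
	printf("request opcodes: %u, register opcodes: %u\n",
	       ops.nr_request_opcodes, ops.nr_register_opcodes);
	return 0;
}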

Note: there is a request probing interface via IORING_REGISTER_PROBE,
but it's a mess. It requires the user to create a ring first, it only
works for requests, and requires dynamic allocations.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-09-08 08:06:37 -06:00


// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
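/*
 * Legacy opcode probing (IORING_REGISTER_PROBE): copy the user's probe
 * structure in, mark every request opcode this kernel supports with
 * IO_URING_OP_SUPPORTED, and copy the result back out.
 */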
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
size = struct_size(p, ops, nr_args);
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (io_uring_op_supported(i))
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
creds = xa_erase(&ctx->personalities, id);
if (creds) {
put_cred(creds);
return 0;
}
return -EINVAL;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds;
u32 id;
int ret;
creds = get_current_cred();
ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
if (ret < 0) {
put_cred(creds);
return ret;
}
return id;
}
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
struct io_restriction *restrictions)
{
struct io_uring_restriction *res;
size_t size;
int i, ret;
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
return -EINVAL;
size = array_size(nr_args, sizeof(*res));
if (size == SIZE_MAX)
return -EOVERFLOW;
res = memdup_user(arg, size);
if (IS_ERR(res))
return PTR_ERR(res);
ret = -EINVAL;
for (i = 0; i < nr_args; i++) {
switch (res[i].opcode) {
case IORING_RESTRICTION_REGISTER_OP:
if (res[i].register_op >= IORING_REGISTER_LAST)
goto err;
__set_bit(res[i].register_op, restrictions->register_op);
break;
case IORING_RESTRICTION_SQE_OP:
if (res[i].sqe_op >= IORING_OP_LAST)
goto err;
__set_bit(res[i].sqe_op, restrictions->sqe_op);
break;
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
restrictions->sqe_flags_allowed = res[i].sqe_flags;
break;
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
restrictions->sqe_flags_required = res[i].sqe_flags;
break;
default:
goto err;
}
}
ret = 0;
err:
kfree(res);
return ret;
}
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
void __user *arg, unsigned int nr_args)
{
int ret;
/* Restrictions allowed only if rings started disabled */
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
/* We allow only a single restrictions registration */
if (ctx->restrictions.registered)
return -EBUSY;
ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
/* Reset all restrictions if an error happened */
if (ret != 0)
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
else
ctx->restrictions.registered = true;
return ret;
}
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
/*
* Lazy activation attempts would fail if it was polled before
* submitter_task is set.
*/
if (wq_has_sleeper(&ctx->poll_wq))
io_activate_pollwq(ctx);
}
if (ctx->restrictions.registered)
ctx->restricted = 1;
ctx->flags &= ~IORING_SETUP_R_DISABLED;
if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
return 0;
}
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
cpumask_var_t new_mask)
{
int ret;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_wq_cpu_affinity(current->io_uring, new_mask);
} else {
mutex_unlock(&ctx->uring_lock);
ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len)
{
cpumask_var_t new_mask;
int ret;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
cpumask_clear(new_mask);
if (len > cpumask_size())
len = cpumask_size();
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = compat_get_bitmap(cpumask_bits(new_mask),
(const compat_ulong_t __user *)arg,
len * 8 /* CHAR_BIT */);
else
#endif
ret = copy_from_user(new_mask, arg, len);
if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
ret = __io_register_iowq_aff(ctx, new_mask);
free_cpumask_var(new_mask);
return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
return __io_register_iowq_aff(ctx, NULL);
}
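/*
 * Update the io-wq max worker counts (bounded/unbounded). For SQPOLL rings
 * the new limits are applied to the SQPOLL task's io-wq; otherwise they are
 * stored in the ctx and propagated to every task registered with this ring.
 */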
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
int i, ret;
if (copy_from_user(new_count, arg, sizeof(new_count)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i] > INT_MAX)
return -EINVAL;
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
struct task_struct *tsk;
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
tsk = sqpoll_task_locked(sqd);
if (tsk)
tctx = tsk->io_uring;
}
} else {
tctx = current->io_uring;
}
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
goto err;
} else {
memset(new_count, 0, sizeof(new_count));
}
if (sqd) {
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
mutex_lock(&ctx->uring_lock);
}
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
/* that's it for SQPOLL, only the SQPOLL task creates requests */
if (sqd)
return 0;
/* now propagate the restriction to all registered users */
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
tctx = node->task->io_uring;
if (WARN_ON_ONCE(!tctx->io_wq))
continue;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
new_count[i] = ctx->iowq_limits[i];
/* ignore errors, it always returns zero anyway */
(void)io_wq_max_workers(tctx->io_wq, new_count);
}
return 0;
err:
if (sqd) {
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static int io_register_clock(struct io_ring_ctx *ctx,
struct io_uring_clock_register __user *arg)
{
struct io_uring_clock_register reg;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
switch (reg.clockid) {
case CLOCK_MONOTONIC:
ctx->clock_offset = 0;
break;
case CLOCK_BOOTTIME:
ctx->clock_offset = TK_OFFS_BOOT;
break;
default:
return -EINVAL;
}
ctx->clockid = reg.clockid;
return 0;
}
/*
* State to maintain until we can swap. Both new and old state, used for
* either mapping or freeing.
*/
struct io_ring_ctx_rings {
struct io_rings *rings;
struct io_uring_sqe *sq_sqes;
struct io_mapped_region sq_region;
struct io_mapped_region ring_region;
};
static void io_register_free_rings(struct io_ring_ctx *ctx,
struct io_uring_params *p,
struct io_ring_ctx_rings *r)
{
io_free_region(ctx, &r->sq_region);
io_free_region(ctx, &r->ring_region);
}
#define swap_old(ctx, o, n, field) \
do { \
(o).field = (ctx)->field; \
(ctx)->field = (n).field; \
} while (0)
#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
IORING_SETUP_CQE_MIXED)
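/*
 * Resize the rings of an existing ring (DEFER_TASKRUN only): allocate new
 * SQ/CQ regions, copy any pending entries across, and swap the new regions
 * in under ctx->mmap_lock and ctx->completion_lock so concurrent mmap or
 * completion posting cannot observe a half-switched state.
 */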
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_region_desc rd;
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
size_t size, sq_array_offset;
unsigned i, tail, old_head;
struct io_uring_params p;
int ret;
/* limited to DEFER_TASKRUN for now */
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
return -EINVAL;
if (copy_from_user(&p, arg, sizeof(p)))
return -EFAULT;
if (p.flags & ~RESIZE_FLAGS)
return -EINVAL;
/* properties that are always inherited */
p.flags |= (ctx->flags & COPY_FLAGS);
ret = io_uring_fill_params(p.sq_entries, &p);
if (unlikely(ret))
return ret;
/* nothing to do, but copy params back */
if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
if (copy_to_user(arg, &p, sizeof(p)))
return -EFAULT;
return 0;
}
size = rings_size(p.flags, p.sq_entries, p.cq_entries,
&sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.cq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.rings = io_region_get_ptr(&n.ring_region);
/*
* At this point n.rings is shared with userspace, just like o.rings
* is as well. While we don't expect userspace to modify it while
* a resize is in progress, and it's most likely that userspace will
* shoot itself in the foot if it does, we can't always assume good
* intent... Use read/write once helpers from here on to indicate the
* shared nature of it.
*/
WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
if (copy_to_user(arg, &p, sizeof(p))) {
io_register_free_rings(ctx, &p, &n);
return -EFAULT;
}
if (p.flags & IORING_SETUP_SQE128)
size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
else
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
if (size == SIZE_MAX) {
io_register_free_rings(ctx, &p, &n);
return -EOVERFLOW;
}
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.sq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.sq_sqes = io_region_get_ptr(&n.sq_region);
/*
* If using SQPOLL, park the thread
*/
if (ctx->sq_data) {
mutex_unlock(&ctx->uring_lock);
io_sq_thread_park(ctx->sq_data);
mutex_lock(&ctx->uring_lock);
}
/*
* We'll do the swap. Grab the ctx->mmap_lock, which will exclude
* any new mmap's on the ring fd. Clear out existing mappings to prevent
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
* existing rings beyond this point will fail. Not that it could proceed
at this point anyway, as the io_uring mmap side needs to grab the
* ctx->mmap_lock as well. Likewise, hold the completion lock over the
* duration of the actual swap.
*/
mutex_lock(&ctx->mmap_lock);
spin_lock(&ctx->completion_lock);
o.rings = ctx->rings;
ctx->rings = NULL;
o.sq_sqes = ctx->sq_sqes;
ctx->sq_sqes = NULL;
/*
* Now copy SQ and CQ entries, if any. If either of the destination
* rings can't hold what is already there, then fail the operation.
*/
tail = READ_ONCE(o.rings->sq.tail);
old_head = READ_ONCE(o.rings->sq.head);
if (tail - old_head > p.sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->sq_entries - 1);
unsigned dst_head = i & (p.sq_entries - 1);
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
tail = READ_ONCE(o.rings->cq.tail);
old_head = READ_ONCE(o.rings->cq.head);
if (tail - old_head > p.cq_entries) {
overflow:
/* restore old rings, and return -EOVERFLOW via cleanup path */
ctx->rings = o.rings;
ctx->sq_sqes = o.sq_sqes;
to_free = &n;
ret = -EOVERFLOW;
goto out;
}
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->cq_entries - 1);
unsigned dst_head = i & (p.cq_entries - 1);
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);
/* invalidate cached cqe refill */
ctx->cqe_cached = ctx->cqe_sentinel = NULL;
WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
/* all done, store old pointers and assign new ones */
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
ctx->sq_entries = p.sq_entries;
ctx->cq_entries = p.cq_entries;
ctx->rings = n.rings;
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
to_free = &o;
ret = 0;
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock);
io_register_free_rings(ctx, &p, to_free);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
return ret;
}
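/*
 * Register a user-provided parameter memory region. Only one such region
 * may be registered per ring, and using it as the CQ wait argument area
 * (IORING_MEM_REGION_REG_WAIT_ARG) is only allowed while the ring is still
 * disabled.
 */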
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
struct io_uring_mem_region_reg __user *reg_uptr = uarg;
struct io_uring_mem_region_reg reg;
struct io_uring_region_desc __user *rd_uptr;
struct io_uring_region_desc rd;
int ret;
if (io_region_is_set(&ctx->param_region))
return -EBUSY;
if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
return -EFAULT;
rd_uptr = u64_to_user_ptr(reg.region_uptr);
if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
return -EINVAL;
/*
* This ensures there are no waiters. Waiters are unlocked and it's
* hard to synchronise with them, especially if we need to initialise
* the region.
*/
if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EINVAL;
ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
IORING_MAP_OFF_PARAM_REGION);
if (ret)
return ret;
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
io_free_region(ctx, &ctx->param_region);
return -EFAULT;
}
if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
ctx->cq_wait_size = rd.size;
}
return 0;
}
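/*
 * Dispatch a single io_uring_register() opcode with ctx->uring_lock held,
 * after checking the single-issuer rule and any registered restrictions.
 */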
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;
/*
* We don't quiesce the refs for register anymore and so it can't be
* dying as we're holding a file ref here.
*/
if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
return -ENXIO;
if (ctx->submitter_task && ctx->submitter_task != current)
return -EEXIST;
if (ctx->restricted) {
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
}
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_files_unregister(ctx);
break;
case IORING_REGISTER_FILES_UPDATE:
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 0);
break;
case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_eventfd_unregister(ctx);
break;
case IORING_REGISTER_PROBE:
ret = -EINVAL;
if (!arg || nr_args > 256)
break;
ret = io_probe(ctx, arg, nr_args);
break;
case IORING_REGISTER_PERSONALITY:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_personality(ctx);
break;
case IORING_UNREGISTER_PERSONALITY:
ret = -EINVAL;
if (arg)
break;
ret = io_unregister_personality(ctx, nr_args);
break;
case IORING_REGISTER_ENABLE_RINGS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_enable_rings(ctx);
break;
case IORING_REGISTER_RESTRICTIONS:
ret = io_register_restrictions(ctx, arg, nr_args);
break;
case IORING_REGISTER_FILES2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
break;
case IORING_REGISTER_FILES_UPDATE2:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_FILE);
break;
case IORING_REGISTER_BUFFERS2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_BUFFERS_UPDATE:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_IOWQ_AFF:
ret = -EINVAL;
if (!arg || !nr_args)
break;
ret = io_register_iowq_aff(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_IOWQ_AFF:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_unregister_iowq_aff(ctx);
break;
case IORING_REGISTER_IOWQ_MAX_WORKERS:
ret = -EINVAL;
if (!arg || nr_args != 2)
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
case IORING_REGISTER_RING_FDS:
ret = io_ringfd_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
case IORING_REGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_ring(ctx, arg);
break;
case IORING_UNREGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_unregister_pbuf_ring(ctx, arg);
break;
case IORING_REGISTER_SYNC_CANCEL:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_sync_cancel(ctx, arg);
break;
case IORING_REGISTER_FILE_ALLOC_RANGE:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_file_alloc_range(ctx, arg);
break;
case IORING_REGISTER_PBUF_STATUS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_status(ctx, arg);
break;
case IORING_REGISTER_NAPI:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_napi(ctx, arg);
break;
case IORING_UNREGISTER_NAPI:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_unregister_napi(ctx, arg);
break;
case IORING_REGISTER_CLOCK:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_clock(ctx, arg);
break;
case IORING_REGISTER_CLONE_BUFFERS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_clone_buffers(ctx, arg);
break;
case IORING_REGISTER_ZCRX_IFQ:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_zcrx_ifq(ctx, arg);
break;
case IORING_REGISTER_RESIZE_RINGS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_resize_rings(ctx, arg);
break;
case IORING_REGISTER_MEM_REGION:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_mem_region(ctx, arg);
break;
case IORING_REGISTER_QUERY:
ret = io_query(ctx, arg, nr_args);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
/*
* Given an 'fd' value, return the ctx associated with it. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
struct file *file;
if (registered) {
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
if (file)
get_file(file);
} else {
file = fget(fd);
}
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
struct io_uring_sqe sqe;
if (!arg || nr_args != 1)
return -EINVAL;
if (copy_from_user(&sqe, arg, sizeof(sqe)))
return -EFAULT;
/* no flags supported */
if (sqe.flags)
return -EINVAL;
if (sqe.opcode != IORING_OP_MSG_RING)
return -EINVAL;
return io_uring_sync_msg_ring(&sqe);
}
/*
* "blind" registration opcodes are ones where there's no ring given, and
* hence the source fd must be -1.
*/
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
unsigned int nr_args)
{
switch (opcode) {
case IORING_REGISTER_SEND_MSG_RING:
return io_uring_register_send_msg_ring(arg, nr_args);
case IORING_REGISTER_QUERY:
return io_query(NULL, arg, nr_args);
}
return -EINVAL;
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
if (fd == -1)
return io_uring_register_blind(opcode, arg, nr_args);
file = io_uring_register_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
ctx->buf_table.nr, ret);
mutex_unlock(&ctx->uring_lock);
fput(file);
return ret;
}