commit c265ae75f9 ("io_uring: introduce io_uring querying")
Author: Pavel Begunkov <asml.silence@gmail.com>
There are many parameters users might want to query about io_uring, such
as the available request types or the ring sizes. This patch introduces an
interface for such slow-path queries.

It was written with several requirements in mind:
- Can be used with or without an io_uring instance. Asking for supported
  setup flags before creating an instance as well as querying info about
  an already created ring are both valid use cases.
- Should be moderately fast. For example, users might use it to
  periodically retrieve ring attributes at runtime. As a consequence,
  it should be able to query multiple attributes in a single syscall.
- Backward and forward compatible.
- Should be reasonably easy to use.
- Reduce the kernel code size for introducing new query types.

It's implemented as a new registration opcode, IORING_REGISTER_QUERY.
The user passes one or more query structures linked together, each
represented by struct io_uring_query_hdr. The header stores the common
control fields needed for processing and points to the query type
specific information.

The header contains the following fields (a sketch of the layout follows
this list):
- The query type
- The result field, which on return contains the error code for the query
- A pointer to the query type specific information
- The size of the query structure. The kernel will only populate up to
  that size, which helps with backward compatibility. The kernel can also
  reduce the size, so if the running kernel is older than the interface
  the user tries to use, the user will only get the supported bits.
- The next_entry field, which is used to chain multiple queries.
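
To make the layout concrete, here is a minimal sketch of what such a header
could look like. The field names and ordering are illustrative, derived from
the description above; the authoritative definition is the one added to the
uapi header by this patch.

struct io_uring_query_hdr {
	__u64 next_entry;   /* user pointer to the next header, 0 ends the chain */
	__u64 query_data;   /* user pointer to the query type specific struct */
	__u32 query_op;     /* query type, e.g. IO_URING_QUERY_OPCODES */
	__u32 size;         /* size of the struct behind query_data, clamped by the kernel */
	__s32 result;       /* per-query error code, filled in on return */
	__u32 __resv[3];    /* reserved, must be zero (illustrative) */
};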

Apart from common registration syscall failures, it can only immediately
return an error code when the headers are incorrect or any other
addresses are invalid. That usually means that userspace is not using
the API correctly and should be fixed. All query type specific errors
are returned in the header's result field.

As an example, the patch adds a single query type for now,
IO_URING_QUERY_OPCODES, which reports which register / request / etc.
opcodes are supported; there are concrete plans to extend it.
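
For illustration, a userspace sketch of running that query without creating a
ring first (fd == -1 takes the blind registration path in register.c below).
Only IORING_REGISTER_QUERY, IO_URING_QUERY_OPCODES and the fd == -1 convention
come from this patch; the io_uring_query_opcode result layout, the header
field names and the nr_args value of 0 are assumptions made for the example.
If your kernel headers already provide these definitions, use those instead.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

/* Assumed result layout for IO_URING_QUERY_OPCODES; illustrative only. */
struct io_uring_query_opcode {
	__u32 nr_request_opcodes;	/* how many request opcodes this kernel knows */
	__u32 nr_register_opcodes;	/* how many register opcodes this kernel knows */
};

int main(void)
{
	struct io_uring_query_opcode ops;
	struct io_uring_query_hdr hdr;

	memset(&ops, 0, sizeof(ops));
	memset(&hdr, 0, sizeof(hdr));
	hdr.query_op = IO_URING_QUERY_OPCODES;
	hdr.query_data = (__u64)(unsigned long)&ops;
	hdr.size = sizeof(ops);
	/* hdr.next_entry == 0: this is the only query in the chain */

	/* fd == -1: no ring needed, handled by io_uring_register_blind() */
	if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY, &hdr, 0) < 0) {
		perror("io_uring_register");
		return 1;
	}
	if (hdr.result < 0) {
		fprintf(stderr, "query failed: %d\n", hdr.result);
		return 1;
	}
	printf("request opcodes: %u, register opcodes: %u\n",
	       ops.nr_request_opcodes, ops.nr_register_opcodes);
	return 0;
}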

Note: there is a request probing interface via IORING_REGISTER_PROBE,
but it's a mess. It requires the user to create a ring first, it only
works for requests, and requires dynamic allocations.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-09-08 08:06:37 -06:00


// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
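/*
 * Legacy opcode probing (IORING_REGISTER_PROBE): copy the user's probe
 * structure in, mark every request opcode this kernel supports with
 * IO_URING_OP_SUPPORTED, and copy the result back out.
 */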
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
size = struct_size(p, ops, nr_args);
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (io_uring_op_supported(i))
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
creds = xa_erase(&ctx->personalities, id);
if (creds) {
put_cred(creds);
return 0;
}
return -EINVAL;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds;
u32 id;
int ret;
creds = get_current_cred();
ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
if (ret < 0) {
put_cred(creds);
return ret;
}
return id;
}
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
struct io_restriction *restrictions)
{
struct io_uring_restriction *res;
size_t size;
int i, ret;
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
return -EINVAL;
size = array_size(nr_args, sizeof(*res));
if (size == SIZE_MAX)
return -EOVERFLOW;
res = memdup_user(arg, size);
if (IS_ERR(res))
return PTR_ERR(res);
ret = -EINVAL;
for (i = 0; i < nr_args; i++) {
switch (res[i].opcode) {
case IORING_RESTRICTION_REGISTER_OP:
if (res[i].register_op >= IORING_REGISTER_LAST)
goto err;
__set_bit(res[i].register_op, restrictions->register_op);
break;
case IORING_RESTRICTION_SQE_OP:
if (res[i].sqe_op >= IORING_OP_LAST)
goto err;
__set_bit(res[i].sqe_op, restrictions->sqe_op);
break;
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
restrictions->sqe_flags_allowed = res[i].sqe_flags;
break;
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
restrictions->sqe_flags_required = res[i].sqe_flags;
break;
default:
goto err;
}
}
ret = 0;
err:
kfree(res);
return ret;
}
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
void __user *arg, unsigned int nr_args)
{
int ret;
/* Restrictions allowed only if rings started disabled */
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
/* We allow only a single restrictions registration */
if (ctx->restrictions.registered)
return -EBUSY;
ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
/* Reset all restrictions if an error happened */
if (ret != 0)
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
else
ctx->restrictions.registered = true;
return ret;
}
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
/*
* Lazy activation attempts would fail if it was polled before
* submitter_task is set.
*/
if (wq_has_sleeper(&ctx->poll_wq))
io_activate_pollwq(ctx);
}
if (ctx->restrictions.registered)
ctx->restricted = 1;
ctx->flags &= ~IORING_SETUP_R_DISABLED;
if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
return 0;
}
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
cpumask_var_t new_mask)
{
int ret;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_wq_cpu_affinity(current->io_uring, new_mask);
} else {
mutex_unlock(&ctx->uring_lock);
ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len)
{
cpumask_var_t new_mask;
int ret;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
cpumask_clear(new_mask);
if (len > cpumask_size())
len = cpumask_size();
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = compat_get_bitmap(cpumask_bits(new_mask),
(const compat_ulong_t __user *)arg,
len * 8 /* CHAR_BIT */);
else
#endif
ret = copy_from_user(new_mask, arg, len);
if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
ret = __io_register_iowq_aff(ctx, new_mask);
free_cpumask_var(new_mask);
return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
return __io_register_iowq_aff(ctx, NULL);
}
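/*
 * Update the io-wq max worker counts (bounded/unbounded). For SQPOLL rings
 * the new limits are applied to the SQPOLL task's io-wq; otherwise they are
 * stored in the ctx and propagated to every task registered with this ring.
 */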
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
int i, ret;
if (copy_from_user(new_count, arg, sizeof(new_count)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i] > INT_MAX)
return -EINVAL;
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
struct task_struct *tsk;
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
tsk = sqpoll_task_locked(sqd);
if (tsk)
tctx = tsk->io_uring;
}
} else {
tctx = current->io_uring;
}
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
goto err;
} else {
memset(new_count, 0, sizeof(new_count));
}
if (sqd) {
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
mutex_lock(&ctx->uring_lock);
}
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
/* that's it for SQPOLL, only the SQPOLL task creates requests */
if (sqd)
return 0;
/* now propagate the restriction to all registered users */
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
tctx = node->task->io_uring;
if (WARN_ON_ONCE(!tctx->io_wq))
continue;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
new_count[i] = ctx->iowq_limits[i];
/* ignore errors, it always returns zero anyway */
(void)io_wq_max_workers(tctx->io_wq, new_count);
}
return 0;
err:
if (sqd) {
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static int io_register_clock(struct io_ring_ctx *ctx,
struct io_uring_clock_register __user *arg)
{
struct io_uring_clock_register reg;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
switch (reg.clockid) {
case CLOCK_MONOTONIC:
ctx->clock_offset = 0;
break;
case CLOCK_BOOTTIME:
ctx->clock_offset = TK_OFFS_BOOT;
break;
default:
return -EINVAL;
}
ctx->clockid = reg.clockid;
return 0;
}
/*
* State to maintain until we can swap. Both new and old state, used for
* either mapping or freeing.
*/
struct io_ring_ctx_rings {
struct io_rings *rings;
struct io_uring_sqe *sq_sqes;
struct io_mapped_region sq_region;
struct io_mapped_region ring_region;
};
static void io_register_free_rings(struct io_ring_ctx *ctx,
struct io_uring_params *p,
struct io_ring_ctx_rings *r)
{
io_free_region(ctx, &r->sq_region);
io_free_region(ctx, &r->ring_region);
}
#define swap_old(ctx, o, n, field) \
do { \
(o).field = (ctx)->field; \
(ctx)->field = (n).field; \
} while (0)
#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
IORING_SETUP_CQE_MIXED)
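/*
 * Resize the rings of an existing ring (DEFER_TASKRUN only): allocate new
 * SQ/CQ regions, copy any pending entries across, and swap the new regions
 * in under ctx->mmap_lock and ctx->completion_lock so concurrent mmap or
 * completion posting cannot observe a half-switched state.
 */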
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_region_desc rd;
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
size_t size, sq_array_offset;
unsigned i, tail, old_head;
struct io_uring_params p;
int ret;
/* limited to DEFER_TASKRUN for now */
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
return -EINVAL;
if (copy_from_user(&p, arg, sizeof(p)))
return -EFAULT;
if (p.flags & ~RESIZE_FLAGS)
return -EINVAL;
/* properties that are always inherited */
p.flags |= (ctx->flags & COPY_FLAGS);
ret = io_uring_fill_params(p.sq_entries, &p);
if (unlikely(ret))
return ret;
/* nothing to do, but copy params back */
if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
if (copy_to_user(arg, &p, sizeof(p)))
return -EFAULT;
return 0;
}
size = rings_size(p.flags, p.sq_entries, p.cq_entries,
&sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.cq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.rings = io_region_get_ptr(&n.ring_region);
/*
* At this point n.rings is shared with userspace, just like o.rings
* is as well. While we don't expect userspace to modify it while
* a resize is in progress, and it's most likely that userspace will
* shoot itself in the foot if it does, we can't always assume good
* intent... Use read/write once helpers from here on to indicate the
* shared nature of it.
*/
WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
if (copy_to_user(arg, &p, sizeof(p))) {
io_register_free_rings(ctx, &p, &n);
return -EFAULT;
}
if (p.flags & IORING_SETUP_SQE128)
size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
else
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
if (size == SIZE_MAX) {
io_register_free_rings(ctx, &p, &n);
return -EOVERFLOW;
}
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.sq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.sq_sqes = io_region_get_ptr(&n.sq_region);
/*
* If using SQPOLL, park the thread
*/
if (ctx->sq_data) {
mutex_unlock(&ctx->uring_lock);
io_sq_thread_park(ctx->sq_data);
mutex_lock(&ctx->uring_lock);
}
/*
* We'll do the swap. Grab the ctx->mmap_lock, which will exclude
* any new mmap's on the ring fd. Clear out existing mappings to prevent
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
* existing rings beyond this point will fail. Not that it could proceed
at this point anyway, as the io_uring mmap side needs to grab the
* ctx->mmap_lock as well. Likewise, hold the completion lock over the
* duration of the actual swap.
*/
mutex_lock(&ctx->mmap_lock);
spin_lock(&ctx->completion_lock);
o.rings = ctx->rings;
ctx->rings = NULL;
o.sq_sqes = ctx->sq_sqes;
ctx->sq_sqes = NULL;
/*
* Now copy SQ and CQ entries, if any. If either of the destination
* rings can't hold what is already there, then fail the operation.
*/
tail = READ_ONCE(o.rings->sq.tail);
old_head = READ_ONCE(o.rings->sq.head);
if (tail - old_head > p.sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->sq_entries - 1);
unsigned dst_head = i & (p.sq_entries - 1);
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
tail = READ_ONCE(o.rings->cq.tail);
old_head = READ_ONCE(o.rings->cq.head);
if (tail - old_head > p.cq_entries) {
overflow:
/* restore old rings, and return -EOVERFLOW via cleanup path */
ctx->rings = o.rings;
ctx->sq_sqes = o.sq_sqes;
to_free = &n;
ret = -EOVERFLOW;
goto out;
}
for (i = old_head; i < tail; i++) {
unsigned src_head = i & (ctx->cq_entries - 1);
unsigned dst_head = i & (p.cq_entries - 1);
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);
/* invalidate cached cqe refill */
ctx->cqe_cached = ctx->cqe_sentinel = NULL;
WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
/* all done, store old pointers and assign new ones */
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
ctx->sq_entries = p.sq_entries;
ctx->cq_entries = p.cq_entries;
ctx->rings = n.rings;
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
to_free = &o;
ret = 0;
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock);
io_register_free_rings(ctx, &p, to_free);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
return ret;
}
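/*
 * Register a user-provided parameter memory region. Only one such region
 * may be registered per ring, and using it as the CQ wait argument area
 * (IORING_MEM_REGION_REG_WAIT_ARG) is only allowed while the ring is still
 * disabled.
 */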
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
struct io_uring_mem_region_reg __user *reg_uptr = uarg;
struct io_uring_mem_region_reg reg;
struct io_uring_region_desc __user *rd_uptr;
struct io_uring_region_desc rd;
int ret;
if (io_region_is_set(&ctx->param_region))
return -EBUSY;
if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
return -EFAULT;
rd_uptr = u64_to_user_ptr(reg.region_uptr);
if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
return -EINVAL;
/*
* This ensures there are no waiters. Waiters are unlocked and it's
* hard to synchronise with them, especially if we need to initialise
* the region.
*/
if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EINVAL;
ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
IORING_MAP_OFF_PARAM_REGION);
if (ret)
return ret;
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
io_free_region(ctx, &ctx->param_region);
return -EFAULT;
}
if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
ctx->cq_wait_size = rd.size;
}
return 0;
}
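/*
 * Dispatch a single io_uring_register() opcode with ctx->uring_lock held,
 * after checking the single-issuer rule and any registered restrictions.
 */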
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;
/*
* We don't quiesce the refs for register anymore and so it can't be
* dying as we're holding a file ref here.
*/
if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
return -ENXIO;
if (ctx->submitter_task && ctx->submitter_task != current)
return -EEXIST;
if (ctx->restricted) {
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
}
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_files_unregister(ctx);
break;
case IORING_REGISTER_FILES_UPDATE:
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 0);
break;
case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_eventfd_unregister(ctx);
break;
case IORING_REGISTER_PROBE:
ret = -EINVAL;
if (!arg || nr_args > 256)
break;
ret = io_probe(ctx, arg, nr_args);
break;
case IORING_REGISTER_PERSONALITY:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_personality(ctx);
break;
case IORING_UNREGISTER_PERSONALITY:
ret = -EINVAL;
if (arg)
break;
ret = io_unregister_personality(ctx, nr_args);
break;
case IORING_REGISTER_ENABLE_RINGS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_enable_rings(ctx);
break;
case IORING_REGISTER_RESTRICTIONS:
ret = io_register_restrictions(ctx, arg, nr_args);
break;
case IORING_REGISTER_FILES2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
break;
case IORING_REGISTER_FILES_UPDATE2:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_FILE);
break;
case IORING_REGISTER_BUFFERS2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_BUFFERS_UPDATE:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_IOWQ_AFF:
ret = -EINVAL;
if (!arg || !nr_args)
break;
ret = io_register_iowq_aff(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_IOWQ_AFF:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_unregister_iowq_aff(ctx);
break;
case IORING_REGISTER_IOWQ_MAX_WORKERS:
ret = -EINVAL;
if (!arg || nr_args != 2)
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
case IORING_REGISTER_RING_FDS:
ret = io_ringfd_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
case IORING_REGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_ring(ctx, arg);
break;
case IORING_UNREGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_unregister_pbuf_ring(ctx, arg);
break;
case IORING_REGISTER_SYNC_CANCEL:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_sync_cancel(ctx, arg);
break;
case IORING_REGISTER_FILE_ALLOC_RANGE:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_file_alloc_range(ctx, arg);
break;
case IORING_REGISTER_PBUF_STATUS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_status(ctx, arg);
break;
case IORING_REGISTER_NAPI:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_napi(ctx, arg);
break;
case IORING_UNREGISTER_NAPI:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_unregister_napi(ctx, arg);
break;
case IORING_REGISTER_CLOCK:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_clock(ctx, arg);
break;
case IORING_REGISTER_CLONE_BUFFERS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_clone_buffers(ctx, arg);
break;
case IORING_REGISTER_ZCRX_IFQ:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_zcrx_ifq(ctx, arg);
break;
case IORING_REGISTER_RESIZE_RINGS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_resize_rings(ctx, arg);
break;
case IORING_REGISTER_MEM_REGION:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_mem_region(ctx, arg);
break;
case IORING_REGISTER_QUERY:
ret = io_query(ctx, arg, nr_args);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
/*
* Given an 'fd' value, return the ctx associated with it. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
struct file *file;
if (registered) {
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
if (file)
get_file(file);
} else {
file = fget(fd);
}
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
struct io_uring_sqe sqe;
if (!arg || nr_args != 1)
return -EINVAL;
if (copy_from_user(&sqe, arg, sizeof(sqe)))
return -EFAULT;
/* no flags supported */
if (sqe.flags)
return -EINVAL;
if (sqe.opcode != IORING_OP_MSG_RING)
return -EINVAL;
return io_uring_sync_msg_ring(&sqe);
}
/*
* "blind" registration opcodes are ones where there's no ring given, and
* hence the source fd must be -1.
*/
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
unsigned int nr_args)
{
switch (opcode) {
case IORING_REGISTER_SEND_MSG_RING:
return io_uring_register_send_msg_ring(arg, nr_args);
case IORING_REGISTER_QUERY:
return io_query(NULL, arg, nr_args);
}
return -EINVAL;
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
if (fd == -1)
return io_uring_register_blind(opcode, arg, nr_args);
file = io_uring_register_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
ctx->buf_table.nr, ret);
mutex_unlock(&ctx->uring_lock);
fput(file);
return ret;
}