From 0702e5364f643bc86683d9f585edfe76dbabae39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 16:56:21 -0600 Subject: [PATCH 001/172] io_uring: define a 'prep' and 'issue' handler for each opcode Rather than have two giant switches for doing request preparation and then for doing request issue, add a prep and issue handler for each of them in the io_op_defs[] request definition. Signed-off-by: Jens Axboe --- fs/io_uring.c | 838 ++++++++++++++++++++++---------------------------- 1 file changed, 365 insertions(+), 473 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e8e769be9ed058..63cad0e12d8b37 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1110,231 +1110,13 @@ struct io_op_def { unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; -}; -static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_connect), - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - }, - [IORING_OP_OPENAT] = {}, - [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - 
.audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_MADVISE] = {}, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_OPENAT2] = { - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - }, - [IORING_OP_RENAMEAT] = {}, - [IORING_OP_UNLINKAT] = {}, - [IORING_OP_MKDIRAT] = {}, - [IORING_OP_SYMLINKAT] = {}, - [IORING_OP_LINKAT] = {}, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_SETXATTR] = {}, - [IORING_OP_FGETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_GETXATTR] = {}, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .needs_async_setup = 1, - .async_size = uring_cmd_pdu_size(1), - }, + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); }; +static const struct io_op_def io_op_defs[]; + /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -8039,96 +7821,33 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->cqe.fd); + if (!def->needs_async_setup) + return 0; + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -EAGAIN; + switch (req->opcode) { - case IORING_OP_NOP: - return io_nop_prep(req, sqe); case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: + return io_readv_prep_async(req); case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - return io_prep_rw(req, sqe); - case IORING_OP_POLL_ADD: - return io_poll_add_prep(req, sqe); - case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); - case IORING_OP_FSYNC: - return io_fsync_prep(req, sqe); - case IORING_OP_SYNC_FILE_RANGE: - return io_sfr_prep(req, sqe); + return io_writev_prep_async(req); case IORING_OP_SENDMSG: - case IORING_OP_SEND: - return io_sendmsg_prep(req, sqe); + return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: - case IORING_OP_RECV: - return 
io_recvmsg_prep(req, sqe); + return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: - return io_connect_prep(req, sqe); - case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe); - case IORING_OP_TIMEOUT_REMOVE: - return io_timeout_remove_prep(req, sqe); - case IORING_OP_ASYNC_CANCEL: - return io_async_cancel_prep(req, sqe); - case IORING_OP_LINK_TIMEOUT: - return io_link_timeout_prep(req, sqe); - case IORING_OP_ACCEPT: - return io_accept_prep(req, sqe); - case IORING_OP_FALLOCATE: - return io_fallocate_prep(req, sqe); - case IORING_OP_OPENAT: - return io_openat_prep(req, sqe); - case IORING_OP_CLOSE: - return io_close_prep(req, sqe); - case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); - case IORING_OP_STATX: - return io_statx_prep(req, sqe); - case IORING_OP_FADVISE: - return io_fadvise_prep(req, sqe); - case IORING_OP_MADVISE: - return io_madvise_prep(req, sqe); - case IORING_OP_OPENAT2: - return io_openat2_prep(req, sqe); - case IORING_OP_EPOLL_CTL: - return io_epoll_ctl_prep(req, sqe); - case IORING_OP_SPLICE: - return io_splice_prep(req, sqe); - case IORING_OP_PROVIDE_BUFFERS: - return io_provide_buffers_prep(req, sqe); - case IORING_OP_REMOVE_BUFFERS: - return io_remove_buffers_prep(req, sqe); - case IORING_OP_TEE: - return io_tee_prep(req, sqe); - case IORING_OP_SHUTDOWN: - return io_shutdown_prep(req, sqe); - case IORING_OP_RENAMEAT: - return io_renameat_prep(req, sqe); - case IORING_OP_UNLINKAT: - return io_unlinkat_prep(req, sqe); - case IORING_OP_MKDIRAT: - return io_mkdirat_prep(req, sqe); - case IORING_OP_SYMLINKAT: - return io_symlinkat_prep(req, sqe); - case IORING_OP_LINKAT: - return io_linkat_prep(req, sqe); - case IORING_OP_MSG_RING: - return io_msg_ring_prep(req, sqe); - case IORING_OP_FSETXATTR: - return io_fsetxattr_prep(req, sqe); - case IORING_OP_SETXATTR: - return io_setxattr_prep(req, sqe); - case IORING_OP_FGETXATTR: - return io_fgetxattr_prep(req, sqe); - case IORING_OP_GETXATTR: - return io_getxattr_prep(req, sqe); - case IORING_OP_SOCKET: - return io_socket_prep(req, sqe); + return io_connect_prep_async(req); case IORING_OP_URING_CMD: - return io_uring_cmd_prep(req, sqe); + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -8136,39 +7855,6 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } -static int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", - req->opcode); - return -EFAULT; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -8335,141 +8021,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) 
audit_uring_entry(req->opcode); - switch (req->opcode) { - case IORING_OP_NOP: - ret = io_nop(req, issue_flags); - break; - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - ret = io_read(req, issue_flags); - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - ret = io_write(req, issue_flags); - break; - case IORING_OP_FSYNC: - ret = io_fsync(req, issue_flags); - break; - case IORING_OP_POLL_ADD: - ret = io_poll_add(req, issue_flags); - break; - case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); - break; - case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, issue_flags); - break; - case IORING_OP_SENDMSG: - ret = io_sendmsg(req, issue_flags); - break; - case IORING_OP_SEND: - ret = io_send(req, issue_flags); - break; - case IORING_OP_RECVMSG: - ret = io_recvmsg(req, issue_flags); - break; - case IORING_OP_RECV: - ret = io_recv(req, issue_flags); - break; - case IORING_OP_TIMEOUT: - ret = io_timeout(req, issue_flags); - break; - case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, issue_flags); - break; - case IORING_OP_ACCEPT: - ret = io_accept(req, issue_flags); - break; - case IORING_OP_CONNECT: - ret = io_connect(req, issue_flags); - break; - case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, issue_flags); - break; - case IORING_OP_FALLOCATE: - ret = io_fallocate(req, issue_flags); - break; - case IORING_OP_OPENAT: - ret = io_openat(req, issue_flags); - break; - case IORING_OP_CLOSE: - ret = io_close(req, issue_flags); - break; - case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, issue_flags); - break; - case IORING_OP_STATX: - ret = io_statx(req, issue_flags); - break; - case IORING_OP_FADVISE: - ret = io_fadvise(req, issue_flags); - break; - case IORING_OP_MADVISE: - ret = io_madvise(req, issue_flags); - break; - case IORING_OP_OPENAT2: - ret = io_openat2(req, issue_flags); - break; - case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, issue_flags); - break; - case IORING_OP_SPLICE: - ret = io_splice(req, issue_flags); - break; - case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, issue_flags); - break; - case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, issue_flags); - break; - case IORING_OP_TEE: - ret = io_tee(req, issue_flags); - break; - case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, issue_flags); - break; - case IORING_OP_RENAMEAT: - ret = io_renameat(req, issue_flags); - break; - case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, issue_flags); - break; - case IORING_OP_MKDIRAT: - ret = io_mkdirat(req, issue_flags); - break; - case IORING_OP_SYMLINKAT: - ret = io_symlinkat(req, issue_flags); - break; - case IORING_OP_LINKAT: - ret = io_linkat(req, issue_flags); - break; - case IORING_OP_MSG_RING: - ret = io_msg_ring(req, issue_flags); - break; - case IORING_OP_FSETXATTR: - ret = io_fsetxattr(req, issue_flags); - break; - case IORING_OP_SETXATTR: - ret = io_setxattr(req, issue_flags); - break; - case IORING_OP_FGETXATTR: - ret = io_fgetxattr(req, issue_flags); - break; - case IORING_OP_GETXATTR: - ret = io_getxattr(req, issue_flags); - break; - case IORING_OP_SOCKET: - ret = io_socket(req, issue_flags); - break; - case IORING_OP_URING_CMD: - ret = io_uring_cmd(req, issue_flags); - break; - default: - ret = -EINVAL; - break; - } + ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); @@ -8898,7 +8450,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags |= REQ_F_CREDS; } - 
return io_req_prep(req, sqe); + return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, @@ -13200,8 +12752,343 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + }, + [IORING_OP_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .prep = io_accept_prep, + .issue = io_accept, + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .async_size = sizeof(struct io_async_connect), + .prep 
= io_connect_prep, + .issue = io_connect, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .prep = io_openat_prep, + .issue = io_openat, + }, + [IORING_OP_CLOSE] = { + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .prep = io_statx_prep, + .issue = io_statx, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_sendmsg_prep, + .issue = io_send, + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_recvmsg_prep, + .issue = io_recv, + }, + [IORING_OP_OPENAT2] = { + .prep = io_openat2_prep, + .issue = io_openat2, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_epoll_ctl_prep, + .issue = io_epoll_ctl, + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .prep = io_shutdown_prep, + .issue = io_shutdown, + }, + [IORING_OP_RENAMEAT] = { + .prep = io_renameat_prep, + .issue = io_renameat, + }, + [IORING_OP_UNLINKAT] = { + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + }, + [IORING_OP_MKDIRAT] = { + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + }, + [IORING_OP_SYMLINKAT] = { + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + }, + [IORING_OP_LINKAT] = { + .prep = io_linkat_prep, + .issue = io_linkat, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + }, + [IORING_OP_SETXATTR] = { + .prep = io_setxattr_prep, + .issue = io_setxattr, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + }, + [IORING_OP_GETXATTR] = { + .prep = io_getxattr_prep, + .issue = io_getxattr, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .prep = io_socket_prep, + .issue = io_socket, + }, + 
[IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + }, +}; + static int __init io_uring_init(void) { + int i; + #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -13266,6 +13153,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + BUG_ON(!io_op_defs[i].issue); + } + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; From ed29b0b4fd835b058ddd151c49d021e28d631ee6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:05:03 -0600 Subject: [PATCH 002/172] io_uring: move to separate directory In preparation for splitting io_uring up a bit, move it into its own top level directory. It didn't really belong in fs/ anyway, as it's not a file system only API. This adds io_uring/ and moves the core files in there, and updates the MAINTAINERS file for the new location. Signed-off-by: Jens Axboe --- MAINTAINERS | 7 +------ Makefile | 1 + fs/Makefile | 2 -- io_uring/Makefile | 6 ++++++ {fs => io_uring}/io-wq.c | 0 {fs => io_uring}/io-wq.h | 0 {fs => io_uring}/io_uring.c | 2 +- kernel/sched/core.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 io_uring/Makefile rename {fs => io_uring}/io-wq.c (100%) rename {fs => io_uring}/io-wq.h (100%) rename {fs => io_uring}/io_uring.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 64379c699903bc..08620b9a44fc77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7773,9 +7773,6 @@ F: include/linux/fs.h F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h -X: fs/io-wq.c -X: fs/io-wq.h -X: fs/io_uring.c FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio @@ -10476,9 +10473,7 @@ L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing -F: fs/io-wq.c -F: fs/io-wq.h -F: fs/io_uring.c +F: io_uring/ F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h F: tools/io_uring/ diff --git a/Makefile b/Makefile index b79c1c18149d38..e231f70dc78a5b 100644 --- a/Makefile +++ b/Makefile @@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ +core-$(CONFIG_IO_URING) += io_uring/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/fs/Makefile b/fs/Makefile index 208a74e0b00e12..93b80529f8e827 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/io_uring/Makefile b/io_uring/Makefile new file mode 100644 index 00000000000000..3680425df9478b --- /dev/null +++ b/io_uring/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for io_uring + +obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/fs/io-wq.c b/io_uring/io-wq.c similarity index 100% 
rename from fs/io-wq.c rename to io_uring/io-wq.c diff --git a/fs/io-wq.h b/io_uring/io-wq.h similarity index 100% rename from fs/io-wq.h rename to io_uring/io-wq.h diff --git a/fs/io_uring.c b/io_uring/io_uring.c similarity index 99% rename from fs/io_uring.c rename to io_uring/io_uring.c index 63cad0e12d8b37..f429b68d1fc295 100644 --- a/fs/io_uring.c +++ b/io_uring/io_uring.c @@ -87,7 +87,7 @@ #include -#include "internal.h" +#include "../fs/internal.h" #include "io-wq.h" #define IORING_MAX_ENTRIES 32768 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecdcf..f35674e89621b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,7 @@ #include "stats.h" #include "../workqueue_internal.h" -#include "../../fs/io-wq.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" /* From dc919caff6b68dab1ae56cd341d96f50c2438aea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:30:37 -0600 Subject: [PATCH 003/172] io_uring: move req async preparation into opcode handler Define an io_op_def->prep_async() handler and push the async preparation to there. Since we now have that, we can drop ->needs_async_setup, as they mean the same thing. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f429b68d1fc295..f353822436c4ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1098,8 +1098,6 @@ struct io_op_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* do prep async if is going to be punted */ - unsigned needs_async_setup : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; /* skip auditing */ @@ -1113,6 +1111,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3916,7 +3915,7 @@ static inline bool io_alloc_async_data(struct io_kiocb *req) static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].needs_async_setup) + if (!force && !io_op_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; @@ -7828,31 +7827,14 @@ static int io_req_prep_async(struct io_kiocb *req) /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) + if (!def->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - return -EINVAL; + return def->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -12770,7 +12752,6 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - 
.needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12778,13 +12759,13 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_read, + .prep_async = io_readv_prep_async, }, [IORING_OP_WRITEV] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12792,6 +12773,7 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_write, + .prep_async = io_writev_prep_async, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12846,22 +12828,22 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12899,10 +12881,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, + .prep_async = io_connect_prep_async, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -13078,10 +13060,10 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .needs_async_setup = 1, .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, }, }; From f49eca21563b6d919d49828aaed9eab5d8090361 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 08:32:05 -0600 Subject: [PATCH 004/172] io_uring: add generic command payload type to struct io_kiocb Each opcode generally has a command structure in io_kiocb which it can use to store data associated with that request. In preparation for having the core layer not know about what's inside these fields, add a generic io_cmd_data type and put in the union as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f353822436c4ed..c596ffd92a0a00 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -954,14 +954,27 @@ enum { }; /* - * NOTE! Each of the iocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) + struct io_kiocb { union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct. 
+ */ struct file *file; + struct io_cmd_data cmd; struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; From 3c306fb2f9463fd0e0e896dde9e3e53097d0e41e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 06:57:44 -0600 Subject: [PATCH 005/172] io_uring: convert read/write path to use io_cmd_type Remove struct io_rw from io_kiocb, and convert the read/write path to use the io_cmd_type approach instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 194 ++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c596ffd92a0a00..db2880ccb22778 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -964,6 +964,7 @@ struct io_cmd_data { }; #define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) struct io_kiocb { union { @@ -975,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; @@ -3032,7 +3032,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); int ret; /* @@ -3043,7 +3043,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) @@ -3188,11 +3188,11 @@ static void kiocb_end_write(struct io_kiocb *req) #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; if (!req_has_async_data(req)) return !io_req_prep_async(req); - iov_iter_restore(&rw->s.iter, &rw->s.iter_state); + iov_iter_restore(&io->s.iter, &io->s.iter_state); return true; } @@ -3234,7 +3234,9 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) { + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); fsnotify_modify(req->file); } else { @@ -3276,7 +3278,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, static void io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (__io_complete_rw_common(req, res)) return; @@ -3287,7 +3290,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -3418,11 +3422,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); unsigned ioprio; int ret; - kiocb->ki_pos = 
READ_ONCE(sqe->off); + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); @@ -3444,14 +3448,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) return ret; - kiocb->ki_ioprio = ioprio; + rw->kiocb.ki_ioprio = ioprio; } else { - kiocb->ki_ioprio = get_current_ioprio(); + rw->kiocb.ki_ioprio = get_current_ioprio(); } - req->rw.addr = READ_ONCE(sqe->addr); - req->rw.len = READ_ONCE(sqe->len); - req->rw.flags = READ_ONCE(sqe->rw_flags); + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); return 0; } @@ -3478,18 +3482,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); - if (kiocb->ki_pos != -1) - return &kiocb->ki_pos; + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; if (!(req->file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; } - kiocb->ki_pos = 0; + rw->kiocb.ki_pos = 0; return NULL; } @@ -3497,6 +3501,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); /* add previously done IO, if any */ if (req_has_async_data(req) && io->bytes_done > 0) { @@ -3507,11 +3512,11 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = req->rw.kiocb.ki_pos; - if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) __io_complete_rw(req, ret, issue_flags); else - io_rw_done(&req->rw.kiocb, ret); + io_rw_done(&rw->kiocb, ret); if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; @@ -3522,11 +3527,12 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } } -static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - struct io_mapped_ubuf *imu) +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) { - size_t len = req->rw.len; - u64 buf_end, buf_addr = req->rw.addr; + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; size_t offset; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) @@ -3540,7 +3546,7 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter * and advance us to the beginning. 
*/ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* @@ -3682,12 +3688,13 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; size_t len; - uiov = u64_to_user_ptr(req->rw.addr); + uiov = u64_to_user_ptr(rw->addr); if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len)) @@ -3699,9 +3706,9 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; - req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = (compat_size_t) len; + rw->len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3709,7 +3716,8 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); void __user *buf; ssize_t len; @@ -3722,21 +3730,23 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; - req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = len; + rw->len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(req->rw.addr); - iov[0].iov_len = req->rw.len; + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; return 0; } - if (req->rw.len != 1) + if (rw->len != 1) return -EINVAL; #ifdef CONFIG_COMPAT @@ -3754,10 +3764,11 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; struct iovec *iovec; @@ -3766,25 +3777,25 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter, issue_flags); + ret = io_import_fixed(req, ddir, iter, issue_flags); if (ret) return ERR_PTR(ret); return NULL; } - buf = u64_to_user_ptr(req->rw.addr); - sqe_len = req->rw.len; + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (!buf) return ERR_PTR(-ENOBUFS); - req->rw.addr = (unsigned long) buf; - req->rw.len = sqe_len; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, 
s->fast_iov, iter); + ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); if (ret) return ERR_PTR(ret); return NULL; @@ -3795,11 +3806,11 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ret = io_iov_buffer_select(req, iovec, issue_flags); if (ret) return ERR_PTR(ret); - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); return NULL; } - ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); if (unlikely(ret < 0)) return ERR_PTR(ret); @@ -3827,10 +3838,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ -static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { - struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; ssize_t ret = 0; loff_t *ppos; @@ -3854,11 +3865,11 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); - iovec.iov_len = req->rw.len; + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; } - if (rw == READ) { + if (ddir == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, ppos); } else { @@ -3875,9 +3886,9 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.addr += nr; - req->rw.len -= nr; - if (!req->rw.len) + rw->addr += nr; + rw->len -= nr; + if (!rw->len) break; } if (nr != iovec.iov_len) @@ -3890,24 +3901,24 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - memcpy(&rw->s.iter, iter, sizeof(*iter)); - rw->free_iovec = iovec; - rw->bytes_done = 0; + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iov_iter_is_bvec(iter)) return; if (!iovec) { unsigned iov_off = 0; - rw->s.iter.iov = rw->s.fast_iov; + io->s.iter.iov = io->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->s.iter.iov += iov_off; + io->s.iter.iov += iov_off; } - if (rw->s.fast_iov != fast_iov) - memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; @@ -3989,6 +4000,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); struct wait_page_key *key = arg; wpq = container_of(wait, struct wait_page_queue, wait); @@ -3996,7 +4008,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; - req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; + rw->kiocb.ki_flags &= ~IOCB_WAITQ; 
list_del_init(&wait->entry); io_req_task_queue(req); return 1; @@ -4016,9 +4028,10 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, */ static bool io_rw_should_retry(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; - struct wait_page_queue *wait = &rw->wpq; - struct kiocb *kiocb = &req->rw.kiocb; + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; /* never retry for NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) @@ -4045,12 +4058,14 @@ static bool io_rw_should_retry(struct io_kiocb *req) return true; } -static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) { - if (likely(req->file->f_op->read_iter)) - return call_read_iter(req->file, &req->rw.kiocb, iter); - else if (req->file->f_op->read) - return loop_rw_iter(READ, req, iter); + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); else return -EINVAL; } @@ -4063,7 +4078,8 @@ static bool need_read_all(struct io_kiocb *req) static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; @@ -4075,7 +4091,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags); if (unlikely(ret)) return ret; @@ -4107,11 +4123,12 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *rw; + struct io_async_rw *io; ssize_t ret, ret2; loff_t *ppos; @@ -4120,8 +4137,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - rw = req->async_data; - s = &rw->s; + io = req->async_data; + s = &io->s; /* * Safe and required to re-import if we're using provided @@ -4168,7 +4185,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -4202,8 +4219,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret2; iovec = NULL; - rw = req->async_data; - s = &rw->s; + io = req->async_data; + s = &io->s; /* * Now use our persistent iterator and state, if we aren't already. * We've restored and mapped the iter to match. 
@@ -4218,7 +4235,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_advance(&s->iter, ret); if (!iov_iter_count(&s->iter)) break; - rw->bytes_done += ret; + io->bytes_done += ret; iov_iter_save_state(&s->iter, &s->iter_state); /* if we can retry, do so with the callbacks armed */ @@ -4233,7 +4250,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. */ @@ -4251,9 +4268,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) static int io_write(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; @@ -4263,9 +4281,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - s = &rw->s; + s = &io->s; iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } @@ -4315,7 +4333,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (likely(req->file->f_op->write_iter)) ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &s->iter); else ret2 = -EINVAL; From 8d4388d1166faa5e16fd100157c9cb3c9e982397 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:13:46 -0600 Subject: [PATCH 006/172] io_uring: convert poll path to use io_cmd_type Remove struct io_poll_iocb from io_kiocb, and convert the poll path to use the io_cmd_type approach instead. While at it, rename io_poll_iocb to io_poll which is consistent with the other request type private structures. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 ++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db2880ccb22778..7459a535060c33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -556,7 +556,7 @@ struct io_uring_task { * First field must be the file pointer in all the * iocb unions! 
See also 'struct kiocb' in */ -struct io_poll_iocb { +struct io_poll { struct file *file; struct wait_queue_head *head; __poll_t events; @@ -919,8 +919,8 @@ enum { }; struct async_poll { - struct io_poll_iocb poll; - struct io_poll_iocb *double_poll; + struct io_poll poll; + struct io_poll *double_poll; }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; @@ -6579,7 +6578,7 @@ static void io_poll_mark_cancelled(struct io_kiocb *req) atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } -static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) +static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) @@ -6587,10 +6586,10 @@ static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) return req->apoll->double_poll; } -static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) - return &req->poll; + return io_kiocb_to_cmd(req); return &req->apoll->poll; } @@ -6603,7 +6602,7 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { poll->head = NULL; @@ -6614,7 +6613,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, init_waitqueue_func_entry(&poll->wait, wake_func); } -static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); @@ -6740,7 +6739,9 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) return; if (!ret) { - req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; req_set_fail(req); @@ -6816,8 +6817,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) { @@ -6863,20 +6863,20 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, - struct io_poll_iocb **poll_ptr) + struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll_iocb + * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. 
*/ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *first = poll; + struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) @@ -6918,13 +6918,14 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); - __io_queue_proc(&pt->req->poll, pt, head, - (struct io_poll_iocb **) &pt->req->async_data); + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); } static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, + struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) { struct io_ring_ctx *ctx = req->ctx; @@ -7207,7 +7208,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) @@ -7225,13 +7226,13 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; - ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (!ret && ipt.error) req_set_fail(req); ret = ret ?: ipt.error; @@ -7260,9 +7261,11 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) if (req->poll_update.update_events || req->poll_update.update_user_data) { /* only mask one event flags, keep behavior flags */ if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= req->poll_update.events & 0xffff; + poll->events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) preq->cqe.user_data = req->poll_update.new_user_data; From c24b154967b6d335189eac25be59913538e2cdb4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:16:40 -0600 Subject: [PATCH 007/172] io_uring: convert poll_update path to use io_cmd_type Remove struct io_poll_update from io_kiocb, and convert the poll path to use the io_cmd_type approach instead. 
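
The overlay is only sound because of the invariant stated when io_cmd_data
was introduced: every per-opcode struct keeps the file pointer as its first
member and must not exceed io_cmd_data in size. A sketch of the compile-time
guards that invariant implies (hypothetical, not added by this patch):

    /* private struct must fit in the generic 64-byte payload and
     * keep the file pointer at offset 0 so 'req->file' still works */
    BUILD_BUG_ON(sizeof(struct io_poll_update) > sizeof(struct io_cmd_data));
    BUILD_BUG_ON(offsetof(struct io_poll_update, file) != 0);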
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7459a535060c33..9114d4a42f2bca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -7178,7 +7177,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, static int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_update *upd = &req->poll_update; + struct io_poll_update *upd = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) @@ -7243,7 +7242,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; @@ -7258,17 +7258,17 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } spin_unlock(&ctx->completion_lock); - if (req->poll_update.update_events || req->poll_update.update_user_data) { + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { + if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq); poll->events &= ~0xffff; - poll->events |= req->poll_update.events & 0xffff; + poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } - if (req->poll_update.update_user_data) - preq->cqe.user_data = req->poll_update.new_user_data; + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ From bd8587e4997a88635a37fac546223346e512a30b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:24:42 -0600 Subject: [PATCH 008/172] io_uring: remove recvmsg knowledge from io_arm_poll_handler() There's a special case for recvmsg with MSG_ERRQUEUE set. This is problematic as it means the core needs to know about this special request type. For now, just add a generic flag for it. 
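
The decoupling can be shown in isolation. Here is a small self-contained C sketch of the pattern this patch applies, with invented names throughout (the bit value of REQ_F_CLEAR_POLLIN and the _DEMO constants are stand-ins, not the real definitions): prep-time code, which is allowed to be opcode-specific, translates its special case into a generic request flag, and the shared poll-arming code then tests only that flag.

#include <stdio.h>

enum { REQ_F_CLEAR_POLLIN = 1u << 0 };    /* stand-in bit value */

#define MSG_ERRQUEUE_DEMO 0x2000          /* stand-in for MSG_ERRQUEUE */
#define EPOLLIN_DEMO      0x001           /* stand-in for EPOLLIN */

struct request {
    unsigned flags;
    unsigned msg_flags;
};

/* prep: the opcode-specific knowledge lives here... */
static void recvmsg_prep(struct request *req)
{
    if (req->msg_flags & MSG_ERRQUEUE_DEMO)
        req->flags |= REQ_F_CLEAR_POLLIN;
}

/* ...so the generic arming code never needs to ask "is this recvmsg?" */
static unsigned arm_poll_mask(const struct request *req, unsigned mask)
{
    if (req->flags & REQ_F_CLEAR_POLLIN)
        mask &= ~EPOLLIN_DEMO;
    return mask;
}

int main(void)
{
    struct request req = { .msg_flags = MSG_ERRQUEUE_DEMO };

    recvmsg_prep(&req);
    printf("mask=%#x\n", arm_poll_mask(&req, EPOLLIN_DEMO));
    return 0;
}
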
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9114d4a42f2bca..2c061424797d7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -846,6 +846,7 @@ enum { REQ_F_PARTIAL_IO_BIT, REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -916,6 +917,8 @@ enum { REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), /* ->extra1 and ->extra2 are initialised */ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), }; struct async_poll { @@ -6145,6 +6148,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -7023,8 +7028,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; From 8ff86d85b74d547a4e5f09acc884dd4cc173d087 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:27:38 -0600 Subject: [PATCH 009/172] io_uring: convert net related opcodes to use io_cmd_type This converts accept, connect, send/recv, sendmsg/recvmsg, shutdown, and socket to use io_cmd_type. 
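
All of these conversions follow the same prep/issue shape, which a compact userspace sketch can capture (struct sqe is heavily trimmed, and req_cmd(), op_shutdown and the field names are illustrative stand-ins, not the kernel definitions): prep validates the SQE and decodes it into the opcode's view of the payload area, and issue later reads only that decoded view.

#include <stdio.h>
#include <string.h>

struct sqe { int fd; unsigned len; unsigned rw_flags; };    /* trimmed */

struct request {
    unsigned char cmd[64];    /* generic payload, cf. io_cmd_data */
};

struct op_shutdown { int how; };    /* per-opcode view, cf. io_shutdown */

#define req_cmd(req) ((void *)(req)->cmd)

/* prep: reject SQE fields this opcode doesn't use, then decode */
static int shutdown_prep(struct request *req, const struct sqe *sqe)
{
    struct op_shutdown *sd = req_cmd(req);

    if (sqe->rw_flags)
        return -1;
    sd->how = (int)sqe->len;
    return 0;
}

/* issue: consume only the decoded view, never the raw SQE */
static int shutdown_issue(struct request *req)
{
    struct op_shutdown *sd = req_cmd(req);

    printf("shutdown(how=%d)\n", sd->how);
    return 0;
}

int main(void)
{
    struct request req;
    struct sqe sqe = { .fd = 3, .len = 2 };    /* 2 ~ SHUT_RDWR */

    memset(&req, 0, sizeof(req));
    if (shutdown_prep(&req, &sqe) == 0)
        shutdown_issue(&req);
    return 0;
}
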
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2c061424797d7c..6798859c601d65 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,13 +979,10 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_accept accept; struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; - struct io_connect connect; - struct io_sr_msg sr_msg; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -995,7 +992,6 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; - struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; struct io_mkdir mkdir; @@ -1003,7 +999,6 @@ struct io_kiocb { struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; - struct io_socket sock; struct io_uring_cmd uring_cmd; }; @@ -5824,16 +5819,19 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->shutdown.how = READ_ONCE(sqe->len); + shutdown->how = READ_ONCE(sqe->len); return 0; } static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); struct socket *sock; int ret; @@ -5844,7 +5842,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = __sys_shutdown_sock(sock, req->shutdown.how); + ret = __sys_shutdown_sock(sock, shutdown->how); io_req_complete(req, ret); return 0; } @@ -5881,10 +5879,12 @@ static int io_setup_async_msg(struct io_kiocb *req, static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, - req->sr_msg.msg_flags, &iomsg->free_iov); + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); } static int io_sendmsg_prep_async(struct io_kiocb *req) @@ -5899,7 +5899,7 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -5923,8 +5923,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5981,7 +5981,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_send(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct iovec iov; struct socket *sock; @@ -6039,7 +6039,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) static int __io_recvmsg_copy_hdr(struct 
io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct iovec __user *uiov; size_t iov_len; int ret; @@ -6072,7 +6072,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct compat_iovec __user *uiov; compat_uptr_t ptr; compat_size_t len; @@ -6135,7 +6135,7 @@ static int io_recvmsg_prep_async(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -6161,8 +6161,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned int cflags; unsigned flags; @@ -6238,7 +6238,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct socket *sock; struct iovec iov; @@ -6314,7 +6314,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); unsigned flags; if (sqe->len || sqe->buf_index) @@ -6348,7 +6348,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; bool fixed = !!accept->file_slot; @@ -6413,7 +6413,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; @@ -6434,7 +6434,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_socket(struct io_kiocb *req, unsigned int issue_flags) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); bool fixed = !!sock->file_slot; struct file *file; int ret, fd; @@ -6468,14 +6468,14 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) static int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -6487,6 +6487,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_connect(struct io_kiocb *req, unsigned int issue_flags) { + struct io_connect *connect = io_kiocb_to_cmd(req); struct io_async_connect __io, *io; unsigned file_flags; int ret; @@ -6495,8 +6496,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { io = req->async_data; } else { - ret = move_addr_to_kernel(req->connect.addr, - req->connect.addr_len, + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, &__io.address); if (ret) goto out; @@ -6506,7 +6507,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->address, - req->connect.addr_len, file_flags); + connect->addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { if (req_has_async_data(req)) return -EAGAIN; From e4a71006eace14a6699d66a514b6310cd8b57f7d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:30:45 -0600 Subject: [PATCH 010/172] io_uring: convert the sync and fallocate paths to use io_cmd_type They all share the same struct io_sync, convert them to use the io_cmd_type approach instead. 
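
A detail worth calling out: nothing requires a 1:1 mapping between opcodes and payload views. Several opcodes can share one view as long as each request only ever interprets the area one way, which is why fsync, sync_file_range and fallocate can all ride on struct io_sync. A rough stand-alone sketch of that sharing, with op_sync, req_cmd() and the opcode names invented for illustration:

#include <stdio.h>

/* One view shared by three "opcodes"; each uses a subset of the
 * fields, mirroring how struct io_sync is shared above. */
struct op_sync {
    long long off;
    long long len;
    int flags;    /* fsync / sync_file_range */
    int mode;     /* fallocate only */
};

struct request {
    unsigned opcode;
    unsigned char cmd[64];
};

enum { OP_FSYNC, OP_SFR, OP_FALLOCATE };

#define req_cmd(req) ((void *)(req)->cmd)

static void issue(struct request *req)
{
    struct op_sync *sync = req_cmd(req);

    switch (req->opcode) {
    case OP_FSYNC:
        printf("fsync [%lld, %lld)\n", sync->off, sync->off + sync->len);
        break;
    case OP_SFR:
        printf("sync_file_range flags=%#x\n", sync->flags);
        break;
    case OP_FALLOCATE:
        printf("fallocate mode=%d len=%lld\n", sync->mode, sync->len);
        break;
    }
}

int main(void)
{
    struct request req = { .opcode = OP_FALLOCATE };
    struct op_sync *sync = req_cmd(&req);

    sync->off = 0;
    sync->len = 4096;
    sync->mode = 0;
    issue(&req);
    return 0;
}
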
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6798859c601d65..b59edae29956f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; @@ -5085,30 +5084,32 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); return 0; } static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { - loff_t end = req->sync.off + req->sync.len; + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = sync->off + sync->len; int ret; /* fsync always requires a blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fsync_range(req->file, req->sync.off, - end > 0 ? end : LLONG_MAX, - req->sync.flags & IORING_FSYNC_DATASYNC); + ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); io_req_complete(req, ret); return 0; } @@ -5116,24 +5117,26 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->addr); - req->sync.mode = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); return 0; } static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* fallocate always requiring blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); @@ -5792,25 +5795,27 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); - req->sync.flags = READ_ONCE(sqe->sync_range_flags); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); return 0; } static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* sync_file_range always requires a blocking context */ if (issue_flags & 
IO_URING_F_NONBLOCK) return -EAGAIN; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, - req->sync.flags); + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_complete(req, ret); return 0; } From f38987f09a062c17e9c5119e23b41abc6326cd4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:33:01 -0600 Subject: [PATCH 011/172] io_uring: convert cancel path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b59edae29956f2..ef7b5430bb14bb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; struct io_open open; @@ -7699,19 +7698,21 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; - req->cancel.addr = READ_ONCE(sqe->addr); - req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~CANCEL_FLAGS) + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { - if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - req->cancel.fd = READ_ONCE(sqe->fd); + cancel->fd = READ_ONCE(sqe->fd); } return 0; @@ -7753,20 +7754,21 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .ctx = req->ctx, - .data = req->cancel.addr, - .flags = req->cancel.flags, + .data = cancel->addr, + .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cancel.fd, + req->file = io_file_get_fixed(req, cancel->fd, issue_flags); else - req->file = io_file_get_normal(req, req->cancel.fd); + req->file = io_file_get_normal(req, cancel->fd); if (!req->file) { ret = -EBADF; goto done; From a43714ace50d85b4cb19af46152f624affeef2d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:45:22 -0600 Subject: [PATCH 012/172] io_uring: convert timeout path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 117 +++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ef7b5430bb14bb..d0daa65f7138cc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_timeout timeout; - struct io_timeout_rem timeout_rem; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -1652,7 +1650,9 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) static inline bool io_is_timeout_noseq(struct io_kiocb *req) { - return !req->timeout.off; + 
struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; } static __cold void io_fallback_req_func(struct work_struct *work) @@ -1898,11 +1898,13 @@ static void io_kill_timeout(struct io_kiocb *req, int status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + if (status) req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); } } @@ -1925,10 +1927,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; if (io_is_timeout_noseq(req)) @@ -1941,7 +1944,7 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) * these subtractions won't have wrapped, so we can check if * target is in [last_seq, current_seq] by comparing the two. */ - events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; events_got = seq - ctx->cq_last_tm_flush; if (events_got < events_needed) break; @@ -2546,11 +2549,12 @@ static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); io_remove_next_linked(req); - link->timeout.head = NULL; + timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&link->timeout.list); + list_del(&timeout->list); return link; } } @@ -7302,11 +7306,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -7324,29 +7329,32 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { + struct io_timeout *timeout; struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != req->cqe.user_data) + cd->data != tmp->cqe.user_data) continue; if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) + if (cd->seq == tmp->work.cancel_seq) continue; - req->work.cancel_seq = cd->seq; + tmp->work.cancel_seq = cd->seq; } - found = true; + req = tmp; break; } - if (!found) + if (!req) return ERR_PTR(-ENOENT); io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return 
ERR_PTR(-EALREADY); - list_del_init(&req->timeout.list); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); return req; } @@ -7386,15 +7394,18 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->cqe.user_data; - if (found) + if (user_data == tmp->cqe.user_data) { + req = tmp; break; + } } - if (!found) + if (!req) return -ENOENT; io = req->async_data; @@ -7412,14 +7423,15 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, { struct io_cancel_data cd = { .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; if (IS_ERR(req)) return PTR_ERR(req); - req->timeout.off = 0; /* noseq */ + timeout->off = 0; /* noseq */ data = req->async_data; - list_add_tail(&req->timeout.list, &ctx->timeout_list); + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); @@ -7429,7 +7441,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; @@ -7469,11 +7481,11 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) */ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; int ret; - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { struct io_cancel_data cd = { .data = tr->addr, }; spin_lock(&ctx->completion_lock); @@ -7500,6 +7512,7 @@ static int __io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); @@ -7516,8 +7529,8 @@ static int __io_timeout_prep(struct io_kiocb *req, if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); - req->timeout.off = off; + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -7536,7 +7549,7 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); + INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -7547,7 +7560,7 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; - req->timeout.head = link->last; + timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; @@ 
-7567,10 +7580,11 @@ static int io_link_timeout_prep(struct io_kiocb *req, static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data = req->async_data; struct list_head *entry; - u32 tail, off = req->timeout.off; + u32 tail, off = timeout->off; spin_lock_irq(&ctx->timeout_lock); @@ -7585,7 +7599,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - req->timeout.target_seq = tail + off; + timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. * This is safe because ->completion_lock is held, and submissions @@ -7598,17 +7612,17 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, - timeout.list); + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nxt->timeout.target_seq - tail) + if (off >= nextt->target_seq - tail) break; } add: - list_add(&req->timeout.list, entry); + list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); @@ -8198,7 +8212,8 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { - struct io_kiocb *prev = req->timeout.prev; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; if (prev) { @@ -8222,12 +8237,13 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = req->timeout.head; - req->timeout.head = NULL; + prev = timeout->head; + timeout->head = NULL; /* * We don't expect the list to be empty, that will only happen if we @@ -8238,8 +8254,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - list_del(&req->timeout.list); - req->timeout.prev = prev; + list_del(&timeout->list); + timeout->prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; @@ -8249,6 +8265,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -8256,13 +8273,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer */ - if (req->timeout.head) { + if (timeout->head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - list_add_tail(&req->timeout.list, &ctx->ltimeout_list); + 
list_add_tail(&timeout->list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -10930,12 +10947,14 @@ static __cold void io_ring_exit_work(struct work_struct *work) static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; int canceled = 0; spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; From dd752582e398d22fe6798f065953f4266bf4f5db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:49:25 -0600 Subject: [PATCH 013/172] io_uring: convert open/close path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 72 +++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0daa65f7138cc..65eb41a60d741d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_open open; - struct io_close close; struct io_rsrc_update rsrc_update; struct io_fadvise fadvise; struct io_madvise madvise; @@ -5148,6 +5146,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); const char __user *fname; int ret; @@ -5157,38 +5156,40 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EBADF; /* open.how should be already initialised */ - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; - req->open.dfd = READ_ONCE(sqe->fd); + open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; return ret; } - req->open.file_slot = READ_ONCE(sqe->file_index); - if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + open->file_slot = READ_ONCE(sqe->file_index); + if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; - req->open.nofile = rlimit(RLIMIT_NOFILE); + open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); - req->open.how = build_open_how(flags, mode); + open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_how __user *how; size_t len; int ret; @@ -5198,8 +5199,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; - ret = 
copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, - len); + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; @@ -5261,35 +5261,36 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_flags op; struct file *file; bool resolve_nonblock, nonblock_set; - bool fixed = !!req->open.file_slot; + bool fixed = !!open->file_slot; int ret; - ret = build_open_flags(&req->open.how, &op); + ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always -EAGAIN */ - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) return -EAGAIN; op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } if (!fixed) { - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } - file = do_filp_open(req->open.dfd, req->open.filename, &op); + file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like @@ -5315,9 +5316,9 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, - req->open.file_slot); + open->file_slot); err: - putname(req->open.filename); + putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); @@ -5737,14 +5738,16 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_close *close = io_kiocb_to_cmd(req); + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->close.fd = READ_ONCE(sqe->fd); - req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) return -EINVAL; return 0; @@ -5753,12 +5756,12 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; - struct io_close *close = &req->close; + struct io_close *close = io_kiocb_to_cmd(req); struct fdtable *fdt; struct file *file; int ret = -EBADF; - if (req->close.file_slot) { + if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } @@ -7982,10 +7985,13 @@ static void io_clean_op(struct io_kiocb *req) break; } case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - if (req->open.filename) - putname(req->open.filename); + case IORING_OP_OPENAT2: { + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); break; + } case IORING_OP_RENAMEAT: putname(req->rename.oldpath); putname(req->rename.newpath); @@ -9831,7 +9837,9 @@ static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, static inline int io_close_fixed(struct io_kiocb *req, unsigned 
int issue_flags) { - return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); } static int __io_sqe_files_update(struct io_ring_ctx *ctx, From 37d4842f11c5a8d4eeb8e85bf2120a167d174456 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:51:05 -0600 Subject: [PATCH 014/172] io_uring: convert madvise/fadvise to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65eb41a60d741d..60d462d7c8479b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_fadvise fadvise; - struct io_madvise madvise; struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; @@ -5629,12 +5627,14 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; - req->madvise.addr = READ_ONCE(sqe->addr); - req->madvise.len = READ_ONCE(sqe->len); - req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + ma->addr = READ_ONCE(sqe->addr); + ma->len = READ_ONCE(sqe->len); + ma->advice = READ_ONCE(sqe->fadvise_advice); return 0; #else return -EOPNOTSUPP; @@ -5644,7 +5644,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = &req->madvise; + struct io_madvise *ma = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5660,18 +5660,20 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_fadvise *fa = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; - req->fadvise.offset = READ_ONCE(sqe->off); - req->fadvise.len = READ_ONCE(sqe->len); - req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + fa->offset = READ_ONCE(sqe->off); + fa->len = READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); return 0; } static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { - struct io_fadvise *fa = &req->fadvise; + struct io_fadvise *fa = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) { From bb040a21fd0579ec7575a0ede7185d2504c021b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:59:28 -0600 Subject: [PATCH 015/172] io_uring: convert file system request types to use io_cmd_type This converts statx, rename, unlink, mkdir, symlink, and hardlink to use io_cmd_type. 
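
These opcodes are also the interesting case for io_clean_op(), since they stash getname() results in their payload view and must release them on the same path. A self-contained sketch of that ownership pattern, with strdup()/free() standing in for getname()/putname() and all other names invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct op_rename { char *oldpath; char *newpath; };

struct request {
    unsigned char cmd[64];
    unsigned needs_cleanup;    /* cf. REQ_F_NEED_CLEANUP */
};

#define req_cmd(req) ((void *)(req)->cmd)

static int rename_prep(struct request *req, const char *o, const char *n)
{
    struct op_rename *ren = req_cmd(req);

    ren->oldpath = strdup(o);
    ren->newpath = strdup(n);
    req->needs_cleanup = 1;
    return (ren->oldpath && ren->newpath) ? 0 : -1;
}

/* Mirrors the io_clean_op() hunks above: cleanup also reads through
 * the per-opcode view, so io_kiocb needs no named members for it. */
static void clean_op(struct request *req)
{
    struct op_rename *ren = req_cmd(req);

    if (!req->needs_cleanup)
        return;
    free(ren->oldpath);    /* free(NULL) is a no-op */
    free(ren->newpath);
    req->needs_cleanup = 0;
}

int main(void)
{
    struct request req = { 0 };

    if (rename_prep(&req, "/tmp/a", "/tmp/b") == 0) {
        struct op_rename *ren = req_cmd(&req);

        printf("rename %s -> %s\n", ren->oldpath, ren->newpath);
    }
    clean_op(&req);
    return 0;
}
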
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 102 +++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 60d462d7c8479b..38c34da088b5ac 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -983,12 +983,6 @@ struct io_kiocb { struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; - struct io_statx statx; - struct io_rename rename; - struct io_unlink unlink; - struct io_mkdir mkdir; - struct io_symlink symlink; - struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; @@ -4367,7 +4361,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) static int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->buf_index || sqe->splice_fd_in) @@ -4397,7 +4391,7 @@ static int io_renameat_prep(struct io_kiocb *req, static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4654,7 +4648,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) @@ -4679,7 +4673,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4698,7 +4692,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4720,7 +4714,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4736,7 +4730,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4764,7 +4758,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4780,7 +4774,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 
@@ -4810,7 +4804,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5696,6 +5690,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_statx *sx = io_kiocb_to_cmd(req); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) @@ -5703,20 +5698,20 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->statx.dfd = READ_ONCE(sqe->fd); - req->statx.mask = READ_ONCE(sqe->len); + sx->dfd = READ_ONCE(sqe->fd); + sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->statx.flags = READ_ONCE(sqe->statx_flags); + sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + sx->flags = READ_ONCE(sqe->statx_flags); - req->statx.filename = getname_flags(path, - getname_statx_lookup_flags(req->statx.flags), - NULL); + sx->filename = getname_flags(path, + getname_statx_lookup_flags(sx->flags), + NULL); - if (IS_ERR(req->statx.filename)) { - int ret = PTR_ERR(req->statx.filename); + if (IS_ERR(sx->filename)) { + int ret = PTR_ERR(sx->filename); - req->statx.filename = NULL; + sx->filename = NULL; return ret; } @@ -5726,14 +5721,13 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_statx(struct io_kiocb *req, unsigned int issue_flags) { - struct io_statx *ctx = &req->statx; + struct io_statx *sx = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, - ctx->buffer); + ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_complete(req, ret); return 0; } @@ -7994,28 +7988,46 @@ static void io_clean_op(struct io_kiocb *req) putname(open->filename); break; } - case IORING_OP_RENAMEAT: - putname(req->rename.oldpath); - putname(req->rename.newpath); + case IORING_OP_RENAMEAT: { + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); break; - case IORING_OP_UNLINKAT: - putname(req->unlink.filename); + } + case IORING_OP_UNLINKAT: { + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); break; - case IORING_OP_MKDIRAT: - putname(req->mkdir.filename); + } + case IORING_OP_MKDIRAT: { + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); break; - case IORING_OP_SYMLINKAT: - putname(req->symlink.oldpath); - putname(req->symlink.newpath); + } + case IORING_OP_SYMLINKAT: { + struct io_symlink *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); break; - case IORING_OP_LINKAT: - putname(req->hardlink.oldpath); - putname(req->hardlink.newpath); + } + case IORING_OP_LINKAT: { + struct io_hardlink *hl = io_kiocb_to_cmd(req); + + putname(hl->oldpath); + putname(hl->newpath); break; - case IORING_OP_STATX: - if (req->statx.filename) - putname(req->statx.filename); + } + case IORING_OP_STATX: { + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); break; + } case IORING_OP_SETXATTR: case IORING_OP_FSETXATTR: case IORING_OP_GETXATTR: From 3e93a3571a17022d8bde08249e468349da6b7305 Mon Sep 17 00:00:00 2001 From: Jens 
Axboe Date: Tue, 24 May 2022 10:01:09 -0600 Subject: [PATCH 016/172] io_uring: convert epoll to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 38c34da088b5ac..bcdc6ed7f46bb7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; @@ -5577,18 +5576,20 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) + struct io_epoll *epoll = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; - req->epoll.epfd = READ_ONCE(sqe->fd); - req->epoll.op = READ_ONCE(sqe->len); - req->epoll.fd = READ_ONCE(sqe->off); + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); - if (ep_op_has_event(req->epoll.op)) { + if (ep_op_has_event(epoll->op)) { struct epoll_event __user *ev; ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) return -EFAULT; } @@ -5601,7 +5602,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_EPOLL) - struct io_epoll *ie = &req->epoll; + struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; From 2511d3030c5eaea37baee030cb36e70426ad4e6a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:01:47 -0600 Subject: [PATCH 017/172] io_uring: convert splice to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcdc6ed7f46bb7..d0251d0744494d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; struct io_xattr xattr; @@ -4918,7 +4917,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; sp->len = READ_ONCE(sqe->len); @@ -4939,7 +4938,7 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; struct file *in; @@ -4971,7 +4970,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); sp->off_in = READ_ONCE(sqe->splice_off_in); sp->off_out = READ_ONCE(sqe->off); @@ -4980,7 +4979,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = 
&req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; From c1ee5595015502dedd2d602ecc65bff9907fb14b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:03:49 -0600 Subject: [PATCH 018/172] io_uring: convert msg and nop to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0251d0744494d..41253ef58ada7e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_provide_buf pbuf; - struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -5030,19 +5028,21 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_msg *msg = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; - req->msg.user_data = READ_ONCE(sqe->off); - req->msg.len = READ_ONCE(sqe->len); + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); return 0; } static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { + struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - struct io_msg *msg = &req->msg; bool filled; int ret; @@ -5324,7 +5324,7 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || @@ -5381,7 +5381,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; @@ -5409,7 +5409,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->splice_fd_in) @@ -5528,7 +5528,7 @@ static __cold int io_init_bl_list(struct io_ring_ctx *ctx) static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; From ea5af87d29cfe7323f9a401539c526bdd43416a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:05:49 -0600 Subject: [PATCH 019/172] io_uring: convert rsrc_update to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 41253ef58ada7e..be8da26f70d6d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rsrc_update rsrc_update; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -7800,23 +7799,26 @@ 
static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->rsrc_update.offset = READ_ONCE(sqe->off); - req->rsrc_update.nr_args = READ_ONCE(sqe->len); - if (!req->rsrc_update.nr_args) + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) return -EINVAL; - req->rsrc_update.arg = READ_ONCE(sqe->addr); + up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { - __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; @@ -7824,7 +7826,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, if (!req->ctx->file_data) return -ENXIO; - for (done = 0; done < req->rsrc_update.nr_args; done++) { + for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; @@ -7853,23 +7855,24 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up; + struct io_uring_rsrc_update2 up2; int ret; - up.offset = req->rsrc_update.offset; - up.data = req->rsrc_update.arg; - up.nr = 0; - up.tags = 0; - up.resv = 0; - up.resv2 = 0; + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; - if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); + &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } From ceb452e1b4ba4ab207dfe119c47bf61d4519dc2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:06:46 -0600 Subject: [PATCH 020/172] io_uring: convert xattr to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index be8da26f70d6d6..0bb3c63f38696c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -4402,7 +4401,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) static inline void __io_xattr_finish(struct io_kiocb *req) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); if (ix->filename) putname(ix->filename); @@ -4422,7 +4421,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) static int __io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4465,7 +4464,7 @@ static int io_fgetxattr_prep(struct io_kiocb *req, static int io_getxattr_prep(struct io_kiocb *req, 
const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4486,7 +4485,7 @@ static int io_getxattr_prep(struct io_kiocb *req, static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4502,7 +4501,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; @@ -4531,7 +4530,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) static int __io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4562,7 +4561,7 @@ static int __io_setxattr_prep(struct io_kiocb *req, static int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4590,7 +4589,7 @@ static int io_fsetxattr_prep(struct io_kiocb *req, static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, struct path *path) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; ret = mnt_want_write(path->mnt); @@ -4617,7 +4616,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; From 9a3a11f977f9972b812cc8666d1ffdb93699bd92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:09:32 -0600 Subject: [PATCH 021/172] io_uring: convert iouring_cmd to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0bb3c63f38696c..21246d2e6221ff 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -4814,15 +4813,17 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { - req->uring_cmd.task_work_cb(&req->uring_cmd); + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); } void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - req->uring_cmd.task_work_cb = task_work_cb; + ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; io_req_task_work_add(req); } @@ -4842,7 +4843,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, */ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); if (ret < 0) req_set_fail(req); @@ 
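/*
 * A hedged aside on cmd_to_io_kiocb(), used just above: it can be a
 * plain cast rather than a container_of() only because the cmd union is
 * the first member of struct io_kiocb, so the per-opcode data and the
 * request share an address. A sketch of the round trip (the demo
 * function is illustrative, not part of the series):
 */
static void io_demo_round_trip(struct io_kiocb *req)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);

	/* holds by construction, since &req->cmd == (void *)req */
	WARN_ON_ONCE(cmd_to_io_kiocb(ioucmd) != req);
}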
-4855,18 +4856,19 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done); static int io_uring_cmd_prep_async(struct io_kiocb *req) { + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); size_t cmd_size; cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + memcpy(req->async_data, ioucmd->cmd, cmd_size); return 0; } static int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); if (sqe->rw_flags || sqe->__pad1) return -EINVAL; @@ -4877,7 +4879,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req, static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; From 890968dc03361b92957e183fe8f692371e4ef18b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:19:47 -0600 Subject: [PATCH 022/172] io_uring: unify struct io_symlink and io_hardlink They are really just a subset of each other, just use the one type. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 21246d2e6221ff..808acb854b660d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -761,14 +761,7 @@ struct io_mkdir { struct filename *filename; }; -struct io_symlink { - struct file *file; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; -}; - -struct io_hardlink { +struct io_link { struct file *file; int old_dfd; int new_dfd; @@ -4723,7 +4716,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4751,7 +4744,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4767,7 +4760,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4797,7 +4790,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -8011,15 +8004,9 @@ static void io_clean_op(struct io_kiocb *req) putname(md->filename); break; } - case IORING_OP_SYMLINKAT: { - struct io_symlink *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); - break; - } + case IORING_OP_SYMLINKAT: case IORING_OP_LINKAT: { - struct io_hardlink *hl = io_kiocb_to_cmd(req); + struct io_link *hl = io_kiocb_to_cmd(req); putname(hl->oldpath); putname(hl->newpath); From 
4d4c9cff4f702d10f65473a6b4994ce1a13e64ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:26:28 -0600 Subject: [PATCH 023/172] io_uring: define a request type cleanup handler This can move request type specific cleanup into a private handler, removing the need for the core io_uring parts to know what types they are dealing with. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 155 ++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 69 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 808acb854b660d..75d8c31a59d50e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1093,6 +1093,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3433,6 +3434,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) { switch (ret) { @@ -4391,7 +4399,15 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static inline void __io_xattr_finish(struct io_kiocb *req) +static void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +static inline void io_xattr_cleanup(struct io_kiocb *req) { struct io_xattr *ix = io_kiocb_to_cmd(req); @@ -4406,7 +4422,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) { req->flags &= ~REQ_F_NEED_CLEANUP; - __io_xattr_finish(req); + io_xattr_cleanup(req); io_req_complete(req, ret); } @@ -4675,6 +4691,13 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); +} + static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4713,6 +4736,13 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4804,6 +4834,14 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} + static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); @@ -5314,6 +5352,14 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) return io_openat2(req, issue_flags); } +static void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5725,6 +5771,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_statx_cleanup(struct io_kiocb *req) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); +} + 
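/*
 * A hedged aside on where these handlers land: once every
 * resource-owning opcode supplies a cleanup callback like the ones
 * added above, the core's io_clean_op() (see the hunk further below)
 * reduces to
 *
 *	if (def->cleanup)
 *		def->cleanup(req);
 *
 * and a new opcode only has to fill in its io_op_defs[] slot, e.g.
 * (IORING_OP_DEMO and the io_demo_* handlers are hypothetical):
 *
 *	[IORING_OP_DEMO] = {
 *		.prep		= io_demo_prep,
 *		.issue		= io_demo,
 *		.cleanup	= io_demo_cleanup,
 *	},
 */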
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req); @@ -5897,6 +5951,13 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) return ret; } +static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -7958,74 +8019,10 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); - break; - } - case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: { - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); - break; - } - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: { - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); - break; - } - case IORING_OP_RENAMEAT: { - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); - break; - } - case IORING_OP_UNLINKAT: { - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); - break; - } - case IORING_OP_MKDIRAT: { - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); - break; - } - case IORING_OP_SYMLINKAT: - case IORING_OP_LINKAT: { - struct io_link *hl = io_kiocb_to_cmd(req); - - putname(hl->oldpath); - putname(hl->newpath); - break; - } - case IORING_OP_STATX: { - struct io_statx *sx = io_kiocb_to_cmd(req); + const struct io_op_def *def = &io_op_defs[req->opcode]; - if (sx->filename) - putname(sx->filename); - break; - } - case IORING_OP_SETXATTR: - case IORING_OP_FSETXATTR: - case IORING_OP_GETXATTR: - case IORING_OP_FGETXATTR: - __io_xattr_finish(req); - break; - } + if (def->cleanup) + def->cleanup(req); } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); @@ -12838,6 +12835,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -12852,6 +12850,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12911,6 +12910,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = io_sendmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -12922,6 +12924,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12972,6 +12977,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT] = { .prep = io_openat_prep, .issue = io_openat, + .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, @@ -12987,6 +12993,7 @@ static const struct io_op_def io_op_defs[] = { .audit_skip = 1, .prep = io_statx_prep, .issue = io_statx, + .cleanup = io_statx_cleanup, }, 
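/*
 * A brief aside on the #if defined(CONFIG_NET) guards around the
 * SENDMSG and RECVMSG .cleanup assignments above: presumably
 * io_sendmsg_recvmsg_cleanup() is only compiled when networking support
 * is built in, so those table entries must be guarded the same way,
 * while handlers that always exist (io_statx_cleanup() and friends)
 * need no guard.
 */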
[IORING_OP_READ] = { .needs_file = 1, @@ -13046,6 +13053,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, .issue = io_openat2, + .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -13089,22 +13097,27 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, .issue = io_renameat, + .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { .prep = io_unlinkat_prep, .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { .prep = io_mkdirat_prep, .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { .prep = io_symlinkat_prep, .issue = io_symlinkat, + .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { .prep = io_linkat_prep, .issue = io_linkat, + .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, @@ -13116,19 +13129,23 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .prep = io_fsetxattr_prep, .issue = io_fsetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { .prep = io_setxattr_prep, .issue = io_setxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, .prep = io_fgetxattr_prep, .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { .prep = io_getxattr_prep, .issue = io_getxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, From e27f928ee1cb068ac6c18ea244351a4c90a61139 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:56:14 -0600 Subject: [PATCH 024/172] io_uring: add io_uring_types.h This adds definitions of structs that both the core and the various opcode handlers need to know about. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 491 +------------------------------------ io_uring/io_uring_types.h | 496 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 499 insertions(+), 488 deletions(-) create mode 100644 io_uring/io_uring_types.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75d8c31a59d50e..ff7886c35490ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,8 @@ #include "../fs/internal.h" #include "io-wq.h" +#include "io_uring_types.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -122,89 +124,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. - */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. 
get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -252,12 +171,6 @@ struct io_rsrc_put { }; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_rsrc_node { struct percpu_ref refs; struct list_head node; @@ -310,14 +223,6 @@ struct io_buffer { __u16 bgid; }; -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - enum { IO_SQ_THREAD_SHOULD_STOP = 0, IO_SQ_THREAD_SHOULD_PARK, @@ -347,186 +252,7 @@ struct io_sq_data { #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. 
- */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. 
- */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - - /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - u32 iowq_limits[2]; - bool iowq_limits_set; - }; -}; +#define BGID_ARRAY 64 /* * Arbitrary limit, can be raised if need be @@ -808,232 +534,21 @@ struct io_xattr { struct filename *filename; }; -enum { - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, - - /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_BIT = 8, - REQ_F_INFLIGHT_BIT, - REQ_F_CUR_POS_BIT, - REQ_F_NOWAIT_BIT, - REQ_F_LINK_TIMEOUT_BIT, - REQ_F_NEED_CLEANUP_BIT, - REQ_F_POLLED_BIT, - REQ_F_BUFFER_SELECTED_BIT, - REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, - REQ_F_REISSUE_BIT, - REQ_F_CREDS_BIT, - REQ_F_REFCOUNT_BIT, - REQ_F_ARM_LTIMEOUT_BIT, - REQ_F_ASYNC_DATA_BIT, - REQ_F_SKIP_LINK_CQES_BIT, - REQ_F_SINGLE_POLL_BIT, - REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, - REQ_F_APOLL_MULTISHOT_BIT, - REQ_F_CLEAR_POLLIN_BIT, - /* keep async read/write and isreg together and in order */ - REQ_F_SUPPORT_NOWAIT_BIT, - REQ_F_ISREG_BIT, - - /* not a real bit, just to check we're not overflowing the space */ - __REQ_F_LAST_BIT, -}; - -enum { - /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), - /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), - /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), - /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), - /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), - /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), - - /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), - /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), - /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), - /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* already went through poll handler */ - 
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), - /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), - /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), - /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), - /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), - /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), - /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), - /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), - /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), - /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), - /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); - -struct io_task_work { - union { - struct io_wq_work_node node; - struct llist_node fallback_node; - }; - io_req_tw_func_t func; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; -struct io_cqe { - __u64 user_data; - __s32 res; - /* fd initially, then cflags for completion */ - union { - __u32 flags; - int fd; - }; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; -/* - * Each request type overlays its private data structure on top of this one. - * They must not exceed this one in size. - */ -struct io_cmd_data { - struct file *file; - /* each command gets 56 bytes of data */ - __u8 data[56]; -}; - -#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) - -struct io_kiocb { - union { - /* - * NOTE! Each of the io_kiocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. - */ - struct file *file; - struct io_cmd_data cmd; - }; - - u8 opcode; - /* polled IO has completed */ - u8 iopoll_completed; - /* - * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. - */ - u16 buf_index; - unsigned int flags; - - struct io_cqe cqe; - - struct io_ring_ctx *ctx; - struct task_struct *task; - - struct io_rsrc_node *rsrc_node; - - union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; - - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. 
- */ - struct io_buffer_list *buf_list; - }; - - union { - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; - /* cache ->apoll->events */ - __poll_t apoll_events; - }; - atomic_t refs; - atomic_t poll_refs; - struct io_task_work io_task_work; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; - /* internal polling, see IORING_FEAT_FAST_POLL */ - struct async_poll *apoll; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ - struct io_kiocb *link; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; - struct io_wq_work work; -}; - struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h new file mode 100644 index 00000000000000..1a0f592ff6fc35 --- /dev/null +++ b/io_uring/io_uring_types.h @@ -0,0 +1,496 @@ +#ifndef IO_URING_TYPES_H +#define IO_URING_TYPES_H + +#include +#include + +#include "io-wq.h" + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +/* + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ +struct io_rings { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. + */ + struct io_uring sq, cq; + /* + * Bitmasks to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ + u32 sq_dropped; + /* + * Runtime SQ flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ + atomic_t sq_flags; + /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. + */ + u32 cq_flags; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending than there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ + u32 cq_overflow; + /* + * Ring buffer of completion events. 
+ * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; +}; + +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + +struct io_submit_state { + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; + struct io_submit_link link; + + bool plug_started; + bool need_plug; + bool flush_cqes; + unsigned short submit_nr; + struct blk_plug plug; +}; + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +struct io_ring_ctx { + /* const or read-mostly hot data */ + struct { + struct percpu_ref refs; + + struct io_rings *rings; + unsigned int flags; + enum task_work_notify_mode notify_method; + unsigned int compat: 1; + unsigned int drain_next: 1; + unsigned int restricted: 1; + unsigned int off_timeout_used: 1; + unsigned int drain_active: 1; + unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; + } ____cacheline_aligned_in_smp; + + /* submission data */ + struct { + struct mutex uring_lock; + + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. 
+ */ + u32 *sq_array; + struct io_uring_sqe *sq_sqes; + unsigned cached_sq_head; + unsigned sq_entries; + struct list_head defer_list; + + /* + * Fixed resources fast path, should be accessed only under + * uring_lock, and updated through io_uring_register(2) + */ + struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; + atomic_t cancel_seq; + struct io_file_table file_table; + unsigned nr_user_files; + unsigned nr_user_bufs; + struct io_mapped_ubuf **user_bufs; + + struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + + struct list_head timeout_list; + struct list_head ltimeout_list; + struct list_head cq_overflow_list; + struct list_head apoll_cache; + struct xarray personalities; + u32 pers_next; + unsigned sq_thread_idle; + } ____cacheline_aligned_in_smp; + + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned long check_cq; + + struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + + unsigned cached_cq_tail; + unsigned cq_entries; + struct io_ev_fd __rcu *io_ev_fd; + struct wait_queue_head cq_wait; + unsigned cq_extra; + atomic_t cq_timeouts; + unsigned cq_last_tm_flush; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + + spinlock_t timeout_lock; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. 
+ */ + struct io_wq_work_list iopoll_list; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; + bool poll_multi_queue; + + struct list_head io_buffers_comp; + } ____cacheline_aligned_in_smp; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct { + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + }; + + /* Keep this last, we don't need it for the fast path */ + struct { + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; + }; +}; + +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, + + /* first byte is taken by user flags, shift it to not overlap */ + REQ_F_FAIL_BIT = 8, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_NEED_CLEANUP_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, + REQ_F_COMPLETE_INLINE_BIT, + REQ_F_REISSUE_BIT, + REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, + REQ_F_CQE32_INIT_BIT, + REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_SUPPORT_NOWAIT_BIT, + REQ_F_ISREG_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + + /* fail rest of links */ + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + /* on inflight list, should be cancelled and waited on exit reliably */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* has or had linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* needs cleanup */ + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer 
selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + /* completion is deferred through io_comp_state */ + REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), + /* caller should reissue async */ + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* has creds assigned */ + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + /* ->extra1 and ->extra2 are initialised */ + REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), +}; + +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); + +struct io_task_work { + union { + struct io_wq_work_node node; + struct llist_node fallback_node; + }; + io_req_tw_func_t func; +}; + +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +/* + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. + */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +struct io_kiocb { + union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct. + */ + struct file *file; + struct io_cmd_data cmd; + }; + + u8 opcode; + /* polled IO has completed */ + u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ + u16 buf_index; + unsigned int flags; + + struct io_cqe cqe; + + struct io_ring_ctx *ctx; + struct task_struct *task; + + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; + + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + __poll_t apoll_events; + }; + atomic_t refs; + atomic_t poll_refs; + struct io_task_work io_task_work; + /* for polled requests, i.e. 
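/*
 * A hedged aside on the two helper macros above: io_kiocb_to_cmd()
 * returns void * so that each handler can assign the result straight to
 * its own command type, e.g.
 *
 *	struct io_xattr *ix = io_kiocb_to_cmd(req);
 *
 * with no cast at the call site. Type safety rests entirely on the
 * opcode dispatch pairing a request with the handler whose struct was
 * overlaid on the 56-byte cmd area.
 */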
IORING_OP_POLL_ADD and async armed poll */ + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; + /* internal polling, see IORING_FEAT_FAST_POLL */ + struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ + const struct cred *creds; + struct io_wq_work work; +}; + +#endif From de23077eda61f549dbdadc4b6aa95f6e7b251552 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 12:45:38 -0600 Subject: [PATCH 025/172] io_uring: set completion results upfront Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++------------ io_uring/io_uring.h | 13 +++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-) create mode 100644 io_uring/io_uring.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ff7886c35490ab..92e51bbb769c96 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io-wq.h" #include "io_uring_types.h" +#include "io_uring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -1876,21 +1877,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) io_cqring_ev_posted(ctx); } -static inline void io_req_complete_state(struct io_kiocb *req, s32 res, - u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, s32 res, u32 cflags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_complete_state(req, res, cflags); - else + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_set_res(req, res, cflags); + req->flags |= REQ_F_COMPLETE_INLINE; + } else { io_req_complete_post(req, res, cflags); + } } static inline void io_req_complete(struct io_kiocb *req, s32 res) @@ -2749,7 +2744,8 @@ static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) int res = req->cqe.res; if (*locked) { - io_req_complete_state(req, res, io_put_kbuf(req, 0)); + io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { io_req_complete_post(req, res, @@ -4394,6 +4390,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) if (ret < 0) req_set_fail(req); + io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); io_req_complete(req, ret); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h new file mode 100644 index 00000000000000..522e652197572e --- /dev/null +++ b/io_uring/io_uring.h @@ -0,0 +1,13 @@ +#ifndef IOU_CORE_H +#define IOU_CORE_H + +#include +#include "io_uring_types.h" + +static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; +} + +#endif From 97b388d70b53fd7d286ac1b81e5a88bd6af98209 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 15:21:00 -0600 Subject: [PATCH 026/172] io_uring: handle completions in the core Normally request handlers complete requests themselves, if they don't return an error. For the latter case, the core will complete it for them. This is unhandy for pushing opcode handlers further out, as we don't want a bunch of inline completion code and we don't want to make the completion path slower than it is now. 
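For illustration, a hedged sketch of the resulting handler contract (io_demo and do_demo_work are hypothetical; the IOU_* return codes are the ones this patch introduces): a handler records its result with io_req_set_res(req, res, cflags) and returns IOU_OK so the core posts the CQE, or returns IOU_ISSUE_SKIP_COMPLETE when the completion happens elsewhere.

	static int io_demo(struct io_kiocb *req, unsigned int issue_flags)
	{
		int ret = do_demo_work(req);	/* hypothetical helper */

		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;	/* completed later */

		io_req_set_res(req, ret, 0);
		return IOU_OK;	/* core posts the completion */
	}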
Let the core handle any completion, unless the handler explicitly asks us not to. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 271 ++++++++++++++++++++++---------------------- io_uring/io_uring.h | 5 + 2 files changed, 142 insertions(+), 134 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92e51bbb769c96..7b8f1c9b7b48ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -625,7 +625,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1126,7 +1125,7 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->cqe.res = res; + io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) @@ -1855,50 +1854,37 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE_SKIP)) { - req->cqe.res = res; - req->cqe.flags = cflags; + if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); - } __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) +static void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_req_complete_post(req, res, cflags); + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - s32 res, u32 cflags) +static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) { - io_req_set_res(req, res, cflags); + if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; - } else { - io_req_complete_post(req, res, cflags); - } -} - -static inline void io_req_complete(struct io_kiocb *req, s32 res) -{ - if (res < 0) - req_set_fail(req); - __io_req_complete(req, 0, res, 0); + else + io_req_complete_post(req); } static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_complete_post(req); } /* @@ -2071,7 +2057,8 @@ static void io_fail_links(struct io_kiocb *req) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - __io_req_complete_post(link, res, 0); + io_req_set_res(link, res, 0); + __io_req_complete_post(link); link = nxt; } } @@ -2185,11 +2172,12 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (unlikely(!*uring_locked)) spin_lock(&(*ctx)->completion_lock); } - if (likely(*uring_locked)) + if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); - else - __io_req_complete_post(req, req->cqe.res, - io_put_kbuf_comp(req)); + } else { + req->cqe.flags = io_put_kbuf_comp(req); + __io_req_complete_post(req); + } node = next; } while (node); @@ -2317,13 +2305,12 @@ 
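/*
 * A short aside on the handle_prev_tw_list() hunk above, which shows
 * the payoff of recording results upfront (patch 025): because res is
 * already stored in req->cqe at completion time, this deferred
 * task-work path only fills in the buffer flags before posting,
 *
 *	req->cqe.flags = io_put_kbuf_comp(req);
 *	__io_req_complete_post(req);
 *
 * and no result value has to be threaded through the call chain.
 */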
static void io_req_task_prio_work_add(struct io_kiocb *req) static void io_req_tw_post(struct io_kiocb *req, bool *locked) { - io_req_complete_post(req, req->cqe.res, req->cqe.flags); + io_req_complete_post(req); } static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { - req->cqe.res = res; - req->cqe.flags = cflags; + io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; io_req_task_work_add(req); } @@ -2347,7 +2334,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->cqe.res = ret; + io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } @@ -2741,15 +2728,13 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->cqe.res; - if (*locked) { - io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->cqe.flags |= io_put_kbuf(req, 0); req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, - io_put_kbuf(req, IO_URING_F_UNLOCKED)); + req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + io_req_complete_post(req); } } @@ -2758,8 +2743,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->cqe.res, - io_put_kbuf(req, issue_flags)); + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2769,7 +2754,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->cqe.res = res; + io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; io_req_task_prio_work_add(req); } @@ -3745,7 +3730,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) */ ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) - return 0; + return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. 
*/ kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&s->iter, &s->iter_state); @@ -3756,7 +3741,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -3850,6 +3835,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto copy_iov; done: kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); @@ -3906,8 +3892,8 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_renameat_cleanup(struct io_kiocb *req) @@ -3934,7 +3920,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) req->flags &= ~REQ_F_NEED_CLEANUP; io_xattr_cleanup(req); - io_req_complete(req, ret); + io_req_set_res(req, ret, 0); } static int __io_getxattr_prep(struct io_kiocb *req, @@ -4015,7 +4001,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) &ix->ctx); io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4043,7 +4029,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -4129,8 +4115,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = __io_setxattr(req, issue_flags, &req->file->f_path); io_xattr_finish(req, ret); - - return 0; + return IOU_OK; } static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4155,7 +4140,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int io_unlinkat_prep(struct io_kiocb *req, @@ -4198,8 +4183,8 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_unlinkat_cleanup(struct io_kiocb *req) @@ -4243,8 +4228,8 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_mkdirat_cleanup(struct io_kiocb *req) @@ -4294,8 +4279,8 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_linkat_prep(struct io_kiocb *req, @@ -4341,8 +4326,8 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_link_cleanup(struct io_kiocb *req) @@ -4393,7 +4378,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 
0); - io_req_complete(req, ret); + __io_req_complete(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); @@ -4450,9 +4435,12 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - if (ret != -EIOCBQUEUED) + if (ret != -EIOCBQUEUED) { io_uring_cmd_done(ioucmd, ret, 0); - return 0; + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_splice_prep(struct io_kiocb *req, @@ -4505,8 +4493,8 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4550,8 +4538,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4564,8 +4552,8 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - __io_req_complete(req, issue_flags, 0, 0); - return 0; + io_req_set_res(req, 0, 0); + return IOU_OK; } static int io_msg_ring_prep(struct io_kiocb *req, @@ -4609,11 +4597,11 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); /* put file to avoid an attempt to IOPOLL the req */ io_put_file(req->file); req->file = NULL; - return 0; + return IOU_OK; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4644,8 +4632,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_fallocate_prep(struct io_kiocb *req, @@ -4673,8 +4661,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4855,8 +4843,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -4951,9 +4939,10 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_provide_buffers_prep(struct io_kiocb *req, @@ -5117,9 +5106,10 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -5162,8 +5152,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5196,8 +5186,8 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5235,8 +5225,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5279,8 +5269,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_statx_cleanup(struct io_kiocb *req) @@ -5350,8 +5340,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) err: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5377,8 +5367,8 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - 
io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #if defined(CONFIG_NET) @@ -5409,8 +5399,8 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static bool io_net_retry(struct socket *sock, int flags) @@ -5548,8 +5538,8 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_send(struct io_kiocb *req, unsigned int issue_flags) @@ -5605,8 +5595,8 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_recvmsg_copy_hdr(struct io_kiocb *req, @@ -5805,8 +5795,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -5881,8 +5871,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5948,7 +5938,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) */ if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) - ret = 0; + ret = IOU_ISSUE_SKIP_COMPLETE; return ret; } if (ret == -ERESTARTSYS) @@ -5963,8 +5953,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) } if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } if (ret >= 0) { bool filled; @@ -6034,8 +6024,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) ret = io_fixed_fd_install(req, issue_flags, file, sock->file_slot); } - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_connect_prep_async(struct io_kiocb *req) @@ -6096,8 +6086,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #else /* !CONFIG_NET */ #define IO_NETOP_FN(op) \ @@ -6328,7 +6318,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->cqe.res, 0); + req->cqe.flags = 0; + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -6357,7 +6348,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t __maybe_unused events) { - req->cqe.res = mask; + io_req_set_res(req, 
mask, 0); /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -6810,12 +6801,16 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (!ret && ipt.error) + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { req_set_fail(req); - ret = ret ?: ipt.error; - if (ret) - __io_req_complete(req, issue_flags, ret, 0); - return 0; + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) @@ -6850,20 +6845,22 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ - if (!ret2) + if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); - preq->cqe.res = -ECANCELED; + io_req_set_res(preq, -ECANCELED, 0); locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: - if (ret < 0) + if (ret < 0) { req_set_fail(req); + return ret; + } /* complete update request, we're done with it */ - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -6884,7 +6881,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->cqe.res = -ETIME; + io_req_set_res(req, -ETIME, 0); req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; @@ -7069,8 +7066,8 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_timeout_prep(struct io_kiocb *req, @@ -7191,7 +7188,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static bool io_cancel_cb(struct io_wq_work *work, void *data) @@ -7359,8 +7356,8 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_files_update_prep(struct io_kiocb *req, @@ -7445,8 +7442,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_req_prep_async(struct io_kiocb *req) @@ -7590,8 +7587,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret) + + if (ret == IOU_OK) + __io_req_complete(req, issue_flags); + else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; + /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) io_iopoll_req_issued(req, issue_flags); @@ -7668,7 +7669,7 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret) + if (ret < 0) 
io_req_task_queue_fail(req, ret); } @@ -7745,10 +7746,12 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req, &cd); } - io_req_complete_post(req, ret ?: -ETIME, 0); + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); io_put_req(prev); } else { - io_req_complete_post(req, -ETIME, 0); + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 522e652197572e..73943dbe884e5a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,11 @@ #include <linux/errno.h> #include "io_uring_types.h" +enum { + IOU_OK = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, +}; + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; From 5e2a18d93fec514fc74f58a6061b74a79764af69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:46:43 -0600 Subject: [PATCH 027/172] io_uring: move xattr related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 248 +----------------------------------------- io_uring/xattr.c | 259 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/xattr.h | 15 +++ 4 files changed, 277 insertions(+), 247 deletions(-) create mode 100644 io_uring/xattr.c create mode 100644 io_uring/xattr.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 3680425df9478b..479b6957b85ff3 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7b8f1c9b7b48ea..1af97e4a78b7f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> -#include <linux/xattr.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -93,6 +92,8 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "xattr.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -529,12 +530,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_xattr { - struct file *file; - struct xattr_ctx ctx; - struct filename *filename; -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; @@ -3904,245 +3899,6 @@ static void io_renameat_cleanup(struct io_kiocb *req) putname(ren->newpath); } -static inline void io_xattr_cleanup(struct io_kiocb *req) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - - if (ix->filename) - putname(ix->filename); - - kfree(ix->ctx.kname); - kvfree(ix->ctx.kvalue); -} - -static void io_xattr_finish(struct io_kiocb *req, int ret) -{ - req->flags &= ~REQ_F_NEED_CLEANUP; - - io_xattr_cleanup(req); - io_req_set_res(req, ret, 0); -} - -static int __io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - ix->ctx.kvalue = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - if (ix->ctx.flags) - return -EINVAL; - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret
== sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_fgetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_getxattr_prep(req, sqe); -} - -static int io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_getxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_user_ns(path.mnt), - path.dentry, - &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int __io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.kvalue = NULL; - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = setxattr_copy(name, &ix->ctx); - if (ret) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_setxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fsetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_setxattr_prep(req, sqe); -} - -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - struct path *path) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - -static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = __io_setxattr(req, issue_flags, 
&req->file->f_path); - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/xattr.c b/io_uring/xattr.c new file mode 100644 index 00000000000000..79adf4efba0184 --- /dev/null +++ b/io_uring/xattr.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/xattr.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "xattr.h" + +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + +void io_xattr_cleanup(struct io_kiocb *req) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_xattr_cleanup(req); + io_req_set_res(req, ret, 0); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags =
LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} diff --git a/io_uring/xattr.h b/io_uring/xattr.h new file mode 100644 index 00000000000000..9b459d2ae90c5a --- /dev/null +++ b/io_uring/xattr.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +void io_xattr_cleanup(struct io_kiocb *req); + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fgetxattr(struct 
io_kiocb *req, unsigned int issue_flags); + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags); From e28683bdfc2f2cb0dab042f9079cda89dbf589d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:56:42 -0600 Subject: [PATCH 028/172] io_uring: move nop into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 15 +-------------- io_uring/nop.c | 26 ++++++++++++++++++++++++++ io_uring/nop.h | 4 ++++ 4 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 io_uring/nop.c create mode 100644 io_uring/nop.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 479b6957b85ff3..32c02a38f83b42 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1af97e4a78b7f0..3e74925bd4b454 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "io_uring.h" #include "xattr.h" +#include "nop.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -4298,20 +4299,6 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return 0; -} - -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ -static int io_nop(struct io_kiocb *req, unsigned int issue_flags) -{ - io_req_set_res(req, 0, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/nop.c b/io_uring/nop.c new file mode 100644 index 00000000000000..d363d8ce70a3b7 --- /dev/null +++ b/io_uring/nop.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "nop.h" + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return 0; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +int io_nop(struct io_kiocb *req, unsigned int issue_flags) +{ + io_req_set_res(req, 0, 0); + return IOU_OK; +} diff --git a/io_uring/nop.h b/io_uring/nop.h new file mode 100644 index 00000000000000..97f1535c9dec42 --- /dev/null +++ b/io_uring/nop.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_nop(struct io_kiocb *req, unsigned int issue_flags); From 11aeb71406ddd0ef526ad1df48b54aae628aad3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:13:00 -0600 Subject: [PATCH 029/172] io_uring: split out filesystem related operations This splits out renameat, unlinkat, mkdirat, symlinkat, and linkat.
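For context, the handlers moved here back IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, and IORING_OP_LINKAT, which userspace normally drives through liburing. A minimal sketch of exercising the renameat opcode follows (editorial illustration, not part of this patch; assumes liburing is installed, and the file names rename_demo.c, old.txt, and new.txt are placeholders):

    /* build: cc rename_demo.c -o rename_demo -luring */
    #include <liburing.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        /* queue an async rename of old.txt to new.txt, relative to cwd */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_renameat(sqe, AT_FDCWD, "old.txt",
                               AT_FDCWD, "new.txt", 0);
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
            /* cqe->res is 0 on success, -errno on failure */
            printf("renameat: %d\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
    }

On the kernel side the moved code keeps the prep/issue split used throughout this series: io_renameat_prep() decodes the SQE and takes the path references, io_renameat() performs the blocking do_renameat2() call, and io_renameat_cleanup() drops the references afterwards.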
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/fs.c | 294 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/fs.h | 20 +++ io_uring/io_uring.c | 283 +----------------------------------------- 4 files changed, 316 insertions(+), 283 deletions(-) create mode 100644 io_uring/fs.c create mode 100644 io_uring/fs.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 32c02a38f83b42..50e68c9a4d1bb6 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/fs.c b/io_uring/fs.c new file mode 100644 index 00000000000000..aac1bc5255b07f --- /dev/null +++ b/io_uring/fs.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "fs.h" + +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + +struct io_link { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_renameat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ +
struct io_unlink *un = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); +} + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + const char __user *oldpath, *newpath; + + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + sl->new_dfd = READ_ONCE(sqe->fd); + oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + + sl->oldpath = getname(oldpath); + if (IS_ERR(sl->oldpath)) + return PTR_ERR(sl->oldpath); + + sl->newpath = getname(newpath); + if (IS_ERR(sl->newpath)) { + putname(sl->oldpath); + return PTR_ERR(sl->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + lnk->old_dfd = READ_ONCE(sqe->fd); + lnk->new_dfd = READ_ONCE(sqe->len); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + lnk->flags = READ_ONCE(sqe->hardlink_flags); + + lnk->oldpath = getname(oldf); + if (IS_ERR(lnk->oldpath)) + return PTR_ERR(lnk->oldpath); + + lnk->newpath = getname(newf); + if (IS_ERR(lnk->newpath)) { + putname(lnk->oldpath); + return PTR_ERR(lnk->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_linkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + 
return -EAGAIN; + + ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, + lnk->newpath, lnk->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} diff --git a/io_uring/fs.h b/io_uring/fs.h new file mode 100644 index 00000000000000..0bb5efe3d6bbdf --- /dev/null +++ b/io_uring/fs.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_renameat(struct io_kiocb *req, unsigned int issue_flags); +void io_renameat_cleanup(struct io_kiocb *req); + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags); +void io_unlinkat_cleanup(struct io_kiocb *req); + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags); +void io_mkdirat_cleanup(struct io_kiocb *req); + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags); + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_linkat(struct io_kiocb *req, unsigned int issue_flags); +void io_link_cleanup(struct io_kiocb *req); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3e74925bd4b454..5b1e67ff0faadf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -94,6 +94,7 @@ #include "xattr.h" #include "nop.h" +#include "fs.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -467,38 +468,6 @@ struct io_shutdown { int how; }; -struct io_rename { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_unlink { - struct file *file; - int dfd; - int flags; - struct filename *filename; -}; - -struct io_mkdir { - struct file *file; - int dfd; - umode_t mode; - struct filename *filename; -}; - -struct io_link { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - struct io_msg { struct file *file; u64 user_data; @@ -3845,256 +3814,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_renameat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ren->new_dfd = READ_ONCE(sqe->len); - ren->flags = READ_ONCE(sqe->rename_flags); - - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); - - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); - - req->flags &= 
~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_renameat_cleanup(struct io_kiocb *req) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); -} - -static int io_unlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - un->dfd = READ_ONCE(sqe->fd); - - un->flags = READ_ONCE(sqe->unlink_flags); - if (un->flags & ~AT_REMOVEDIR) - return -EINVAL; - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); - else - ret = do_unlinkat(un->dfd, un->filename); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_unlinkat_cleanup(struct io_kiocb *req) -{ - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); -} - -static int io_mkdirat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - mkd->dfd = READ_ONCE(sqe->fd); - mkd->mode = READ_ONCE(sqe->len); - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_mkdirat_cleanup(struct io_kiocb *req) -{ - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); -} - -static int io_symlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - const char __user *oldpath, *newpath; - - if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); - - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); - - req->flags &= ~REQ_F_NEED_CLEANUP; - 
io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_linkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - lnk->old_dfd = READ_ONCE(sqe->fd); - lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - lnk->flags = READ_ONCE(sqe->hardlink_flags); - - lnk->oldpath = getname(oldf); - if (IS_ERR(lnk->oldpath)) - return PTR_ERR(lnk->oldpath); - - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_link_cleanup(struct io_kiocb *req) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); -} - static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); From 531113bbd5bfc93e8de45440752af11c751e4aaf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:19:47 -0600 Subject: [PATCH 030/172] io_uring: split out splice related operations This splits out splice and tee support. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 133 ++------------------------------------------ io_uring/io_uring.h | 19 +++++++ io_uring/splice.c | 123 ++++++++++++++++++++++++++++++++++++++++ io_uring/splice.h | 7 +++ 5 files changed, 154 insertions(+), 130 deletions(-) create mode 100644 io_uring/splice.c create mode 100644 io_uring/splice.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 50e68c9a4d1bb6..c6aca2af6f4ab9 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5b1e67ff0faadf..43d4044d3bb958 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "xattr.h" #include "nop.h" #include "fs.h" +#include "splice.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -436,15 +437,6 @@ struct io_epoll { struct epoll_event event; }; -struct io_splice { - struct file *file_out; - loff_t off_out; - loff_t off_in; - u64 len; - int splice_fd_in; - unsigned int flags; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -596,9 +588,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); -static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1078,15 +1067,6 @@ static 
inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void req_set_fail(struct io_kiocb *req) -{ - req->flags |= REQ_F_FAIL; - if (req->flags & REQ_F_CQE_SKIP) { - req->flags &= ~REQ_F_CQE_SKIP; - req->flags |= REQ_F_SKIP_LINK_CQES; - } -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1941,12 +1921,6 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return container_of(node, struct io_kiocb, comp_list); } -static inline void io_put_file(struct file *file) -{ - if (file) - fput(file); -} - static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; @@ -3919,105 +3893,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_splice_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - - sp->len = READ_ONCE(sqe->len); - sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) - return -EINVAL; - sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); - return 0; -} - -static int io_tee_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) - return -EINVAL; - return __io_splice_prep(req, sqe); -} - -static int io_tee(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - if (sp->len) - ret = do_tee(in, out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); - return __io_splice_prep(req, sqe); -} - -static int io_splice(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - loff_t *poff_in, *poff_out; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; - poff_out = (sp->off_out == -1) ? 
NULL : &sp->off_out; - - if (sp->len) - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -7157,8 +7032,8 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned int issue_flags) +inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; @@ -7181,7 +7056,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, return file; } -static struct file *io_file_get_normal(struct io_kiocb *req, int fd) +struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 73943dbe884e5a..02c00122b97a31 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -9,10 +9,29 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +static inline void req_set_fail(struct io_kiocb *req) +{ + req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } +} + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; req->cqe.flags = cflags; } +static inline void io_put_file(struct file *file) +{ + if (file) + fput(file); +} + +struct file *io_file_get_normal(struct io_kiocb *req, int fd); +struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); + #endif diff --git a/io_uring/splice.c b/io_uring/splice.c new file mode 100644 index 00000000000000..0e19d63303452a --- /dev/null +++ b/io_uring/splice.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/splice.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "splice.h" + +struct io_splice { + struct file *file_out; + loff_t off_out; + loff_t off_in; + u64 len; + int splice_fd_in; + unsigned int flags; +}; + +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + return 0; +} + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +int io_tee(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+ io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + +int io_splice(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/splice.h b/io_uring/splice.h new file mode 100644 index 00000000000000..542f94168ad3a1 --- /dev/null +++ b/io_uring/splice.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_tee(struct io_kiocb *req, unsigned int issue_flags); + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_splice(struct io_kiocb *req, unsigned int issue_flags); From 0d58472740370108f8ece9076d79f736f53b5b77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:25:19 -0600 Subject: [PATCH 031/172] io_uring: split out fs related sync/fallocate functions This splits out sync_file_range, fsync, and fallocate. 
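These map to IORING_OP_SYNC_FILE_RANGE, IORING_OP_FSYNC, and IORING_OP_FALLOCATE. All three always need a blocking context, so the issue handlers return -EAGAIN when invoked with IO_URING_F_NONBLOCK and the request is retried from io-wq. A minimal userspace sketch of driving one of them through the ring follows (editorial illustration, not part of this patch; assumes liburing is installed, and fsync_demo.c and data.log are placeholder names):

    /* build: cc fsync_demo.c -o fsync_demo -luring */
    #include <liburing.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int fd;

        fd = open("data.log", O_WRONLY | O_CREAT, 0644);
        if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        /* queue the equivalent of fdatasync(fd) on the ring */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
            /* cqe->res is 0 on success, -errno on failure */
            printf("fsync: %d\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        close(fd);
        return 0;
    }
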
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 97 +------------------------------------- io_uring/sync.c | 111 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/sync.h | 10 ++++ 4 files changed, 124 insertions(+), 97 deletions(-) create mode 100644 io_uring/sync.c create mode 100644 io_uring/sync.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c6aca2af6f4ab9..7285c6aef8da0f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,6 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ + sync.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43d4044d3bb958..7bcf8da7dd4063 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -96,6 +96,7 @@ #include "nop.h" #include "fs.h" #include "splice.h" +#include "sync.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -336,14 +337,6 @@ struct io_socket { unsigned long nofile; }; -struct io_sync { - struct file *file; - loff_t len; - loff_t off; - int flags; - int mode; -}; - struct io_cancel { struct file *file; u64 addr; @@ -3941,67 +3934,6 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - loff_t end = sync->off + sync->len; - int ret; - - /* fsync always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, - sync->flags & IORING_FSYNC_DATASYNC); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_fallocate_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->addr); - sync->mode = READ_ONCE(sqe->len); - return 0; -} - -static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* fallocate always requiring blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); - if (ret >= 0) - fsnotify_modify(req->file); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req); @@ -4681,33 +4613,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - sync->flags = READ_ONCE(sqe->sync_range_flags); - return 0; -} - -static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/sync.c b/io_uring/sync.c new file mode 100644 index 00000000000000..9ee8ff865521f3 --- /dev/null +++ b/io_uring/sync.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/fsnotify.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "sync.h" + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; + int mode; +}; + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); + return 0; +} + +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* sync_file_range always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + return
0; +} + +int io_fsync(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = sync->off + sync->len; + int ret; + + /* fsync always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); + return 0; +} + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* fallocate always requiring blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); + if (ret >= 0) + fsnotify_modify(req->file); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/sync.h b/io_uring/sync.h new file mode 100644 index 00000000000000..e873c888da79b1 --- /dev/null +++ b/io_uring/sync.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags); + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsync(struct io_kiocb *req, unsigned int issue_flags); + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags); +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From f4c163dd7d4b1031772317cd3cd58dd6711ee51e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:28:33 -0600 Subject: [PATCH 032/172] io_uring: split out fadvise/madvise operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/advise.c | 100 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/advise.h | 7 ++++ io_uring/io_uring.c | 85 +------------------------------------ 4 files changed, 109 insertions(+), 85 deletions(-) create mode 100644 io_uring/advise.c create mode 100644 io_uring/advise.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 7285c6aef8da0f..4492aa24397efe 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o + sync.o advise.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/advise.c b/io_uring/advise.c new file mode 100644 index 00000000000000..8870fdf66ffbbd --- /dev/null +++ b/io_uring/advise.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/fadvise.h> +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "advise.h" + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + return -EINVAL; + + ma->addr = READ_ONCE(sqe->addr); + ma->len = READ_ONCE(sqe->len); +
ma->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int io_madvise(struct io_kiocb *req, unsigned int issue_flags) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); + io_req_set_res(req, ret, 0); + return IOU_OK; +#else + return -EOPNOTSUPP; +#endif +} + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + return -EINVAL; + + fa->offset = READ_ONCE(sqe->off); + fa->len = READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) { + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + break; + default: + return -EAGAIN; + } + } + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/advise.h b/io_uring/advise.h new file mode 100644 index 00000000000000..5ece2a045185ff --- /dev/null +++ b/io_uring/advise.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_madvise(struct io_kiocb *req, unsigned int issue_flags); + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bcf8da7dd4063..c2041fb10aa2c4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "fs.h" #include "splice.h" #include "sync.h" +#include "advise.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -408,20 +409,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_fadvise { - struct file *file; - u64 offset; - u32 len; - u32 advice; -}; - -struct io_madvise { - struct file *file; - u64 addr; - u32 len; - u32 advice; -}; - struct io_epoll { struct file *file; int epfd; @@ -4428,76 +4415,6 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) #endif } -static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - - ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); - ma->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_set_res(req, ret, 0); - return IOU_OK; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->addr || 
sqe->splice_fd_in) - return -EINVAL; - - fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); - fa->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -} - -static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } - - ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From 453b329be5eacfc48dd43035af82bc7f28ecfedf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:43:10 -0600 Subject: [PATCH 033/172] io_uring: separate out file table handling code Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/filetable.c | 57 ++++++++++++++++++++++++++ io_uring/filetable.h | 58 ++++++++++++++++++++++++++ io_uring/io_uring.c | 86 --------------------------------------- io_uring/io_uring_types.h | 7 +--- 5 files changed, 117 insertions(+), 93 deletions(-) create mode 100644 io_uring/filetable.c create mode 100644 io_uring/filetable.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 4492aa24397efe..5efc4fe565a172 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o + sync.o advise.o filetable.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/filetable.c b/io_uring/filetable.c new file mode 100644 index 00000000000000..560629a93c04b0 --- /dev/null +++ b/io_uring/filetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" + +int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) + return ret; + + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +{ + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; +} + +void io_free_file_tables(struct io_file_table *table) +{ + kvfree(table->files); + bitmap_free(table->bitmap); + table->files = NULL; + table->bitmap = NULL; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h new file mode 100644 index 00000000000000..fe1ec581958dda --- /dev/null +++ b/io_uring/filetable.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_FILE_TABLE_H +#define IOU_FILE_TABLE_H + +struct io_ring_ctx; + +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done.
If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); +void io_free_file_tables(struct io_file_table *table); +int io_file_bitmap_get(struct io_ring_ctx *ctx); + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + table->alloc_hint = bit + 1; +} + +static inline struct io_fixed_file * +io_fixed_file_slot(struct io_file_table *table, unsigned i) +{ + return &table->files[i]; +} + +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2041fb10aa2c4..4b4d6fd509d13f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -146,28 +146,6 @@ struct io_overflow_cqe { struct io_uring_cqe cqe; }; -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -3983,27 +3961,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int io_file_bitmap_get(struct io_ring_ctx *ctx) -{ - struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; - int ret; - - do { - ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) - return ret; - - if (!table->alloc_hint) - break; - - nr = table->alloc_hint; - table->alloc_hint = 0; - } while (1); - - return -ENFILE; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
 */
@@ -6832,12 +6789,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) -{ - return &table->files[i]; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -7934,43 +7885,6 @@ static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_pu return ret; } -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) - return false; - - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; -} - -static void io_free_file_tables(struct io_file_table *table) -{ - kvfree(table->files); - bitmap_free(table->bitmap); - table->files = NULL; - table->bitmap = NULL; -} - -static inline void io_file_bitmap_set(struct io_file_table *table, int bit) -{ - WARN_ON_ONCE(test_bit(bit, table->bitmap)); - __set_bit(bit, table->bitmap); - table->alloc_hint = bit + 1; -} - -static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) -{ - __clear_bit(bit, table->bitmap); - table->alloc_hint = bit; -} - static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { #if !defined(IO_URING_SCM_ALL) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1a0f592ff6fc35..dba72113c59d67 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -5,6 +5,7 @@ #include #include "io-wq.h" +#include "filetable.h" struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -122,12 +123,6 @@ struct io_ev_fd { struct rcu_head rcu; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_ring_ctx { /* const or read-mostly hot data */ struct { From cd40cae29ef815de6f7e72207b677c78f43f4688 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:54:43 -0600 Subject: [PATCH 034/172] io_uring: split out open/close operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 311 ++----------------------------------------- io_uring/io_uring.h | 32 +++++ io_uring/openclose.c | 283 +++++++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 14 ++ 5 files changed, 345 insertions(+), 298 deletions(-) create mode 100644 io_uring/openclose.c create mode 100644 io_uring/openclose.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 5efc4fe565a172..e60def39ca2c6a 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,6 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o + sync.o advise.o filetable.o \ + openclose.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4b4d6fd509d13f..a79186ba8c44ec 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -98,6 +98,7 @@ #include "splice.h" #include "sync.h" #include "advise.h" +#include "openclose.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -283,12 +284,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_close { - struct file *file; - int fd; - u32 file_slot; -}; - struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -371,15 +366,6 @@ struct io_sr_msg { unsigned int flags; }; -struct io_open { - 
struct file *file; - int dfd; - u32 file_slot; - struct filename *filename; - struct open_how how; - unsigned long nofile; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -555,9 +541,6 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset); -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -670,10 +653,15 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { struct io_ring_ctx *ctx = file->private_data; return ctx->ring_sock->sk; @@ -699,26 +687,6 @@ static inline bool io_file_need_scm(struct file *filp) } #endif -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - lockdep_assert_held(&ctx->uring_lock); -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -3899,74 +3867,12 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - const char __user *fname; - int ret; - - if (unlikely(sqe->buf_index)) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - /* open.how should be already initialised */ - if (!(open->how.flags & O_PATH) && force_o_largefile()) - open->how.flags |= O_LARGEFILE; - - open->dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - open->filename = getname(fname); - if (IS_ERR(open->filename)) { - ret = PTR_ERR(open->filename); - open->filename = NULL; - return ret; - } - - open->file_slot = READ_ONCE(sqe->file_index); - if (open->file_slot && (open->how.flags & O_CLOEXEC)) - return -EINVAL; - - open->nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - u64 mode = READ_ONCE(sqe->len); - u64 flags = READ_ONCE(sqe->open_flags); - - open->how = build_open_how(flags, mode); - return __io_openat_prep(req, sqe); -} - -static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_how __user *how; - size_t len; - int ret; - - how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) - return -EINVAL; - - ret = 
copy_struct_from_user(&open->how, sizeof(open->how), how, len); - if (ret) - return ret; - - return __io_openat_prep(req, sqe); -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. */ -static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; struct io_ring_ctx *ctx = req->ctx; @@ -3993,86 +3899,6 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_flags op; - struct file *file; - bool resolve_nonblock, nonblock_set; - bool fixed = !!open->file_slot; - int ret; - - ret = build_open_flags(&open->how, &op); - if (ret) - goto err; - nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = open->how.resolve & RESOLVE_CACHED; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; - op.lookup_flags |= LOOKUP_CACHED; - op.open_flag |= O_NONBLOCK; - } - - if (!fixed) { - ret = __get_unused_fd_flags(open->how.flags, open->nofile); - if (ret < 0) - goto err; - } - - file = do_filp_open(open->dfd, open->filename, &op); - if (IS_ERR(file)) { - /* - * We could hang on to this 'fd' on retrying, but seems like - * marginal gain for something that is now known to be a slower - * path. So just put it, and we'll get a new one when we retry. 
- */ - if (!fixed) - put_unused_fd(ret); - - ret = PTR_ERR(file); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; - goto err; - } - - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - - if (!fixed) - fd_install(ret, file); - else - ret = io_fixed_fd_install(req, issue_flags, file, - open->file_slot); -err: - putname(open->filename); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_openat(struct io_kiocb *req, unsigned int issue_flags) -{ - return io_openat2(req, issue_flags); -} - -static void io_open_cleanup(struct io_kiocb *req) -{ - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); -} - static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4424,69 +4250,6 @@ static void io_statx_cleanup(struct io_kiocb *req) putname(sx->filename); } -static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - close->fd = READ_ONCE(sqe->fd); - close->file_slot = READ_ONCE(sqe->file_index); - if (close->file_slot && close->fd) - return -EINVAL; - - return 0; -} - -static int io_close(struct io_kiocb *req, unsigned int issue_flags) -{ - struct files_struct *files = current->files; - struct io_close *close = io_kiocb_to_cmd(req); - struct fdtable *fdt; - struct file *file; - int ret = -EBADF; - - if (close->file_slot) { - ret = io_close_fixed(req, issue_flags); - goto err; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); - if (!file || file->f_op == &io_uring_fops) { - spin_unlock(&files->file_lock); - goto err; - } - - /* if the file has a flush method, be safe and punt to async */ - if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { - spin_unlock(&files->file_lock); - return -EAGAIN; - } - - file = __close_fd_get_file(close->fd); - spin_unlock(&files->file_lock); - if (!file) - goto err; - - /* No ->flush() or already async, safely close from here */ - ret = filp_close(file, current->files); -err: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -7744,8 +7507,8 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) return ref_node; } -static void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); @@ -7772,7 +7535,7 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, } } -static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; @@ -8319,8 +8082,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_queue_rsrc_removal(struct 
io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; @@ -8386,52 +8149,6 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: - io_ring_submit_unlock(ctx, issue_flags); - return ret; -} - -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - return __io_close_fixed(req, issue_flags, close->file_slot - 1); -} - static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 02c00122b97a31..ebb225e8501251 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -2,6 +2,7 @@ #define IOU_CORE_H #include <linux/errno.h> +#include <linux/lockdep.h> #include "io_uring_types.h" enum { @@ -30,8 +31,39 @@ static inline void io_put_file(struct file *file) fput(file); } +static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case.
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); +bool io_is_uring_fops(struct file *file); #endif diff --git a/io_uring/openclose.c b/io_uring/openclose.c new file mode 100644 index 00000000000000..fa35bd56a33086 --- /dev/null +++ b/io_uring/openclose.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/fsnotify.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" + +struct io_open { + struct file *file; + int dfd; + u32 file_slot; + struct filename *filename; + struct open_how how; + unsigned long nofile; +}; + +struct io_close { + struct file *file; + int fd; + u32 file_slot; +}; + +static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + const char __user *fname; + int ret; + + if (unlikely(sqe->buf_index)) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + /* open.how should be already initialised */ + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; + + open->dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; + return ret; + } + + open->file_slot = READ_ONCE(sqe->file_index); + if (open->file_slot && (open->how.flags & O_CLOEXEC)) + return -EINVAL; + + open->nofile = rlimit(RLIMIT_NOFILE); + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); + + open->how = build_open_how(flags, mode); + return __io_openat_prep(req, sqe); +} + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_how __user *how; + size_t len; + int ret; + + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); + if (ret) + return ret; + + return __io_openat_prep(req, sqe); +} + +int io_openat2(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_flags op; + struct file *file; + bool resolve_nonblock, nonblock_set; + bool fixed = !!open->file_slot; + int ret; + + ret = build_open_flags(&open->how, &op); + if (ret) + goto err; + nonblock_set = op.open_flag & O_NONBLOCK; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; + if (issue_flags & IO_URING_F_NONBLOCK) { + /* + * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN + */ + if (open->how.flags & (O_TRUNC | O_CREAT |
O_TMPFILE)) + return -EAGAIN; + op.lookup_flags |= LOOKUP_CACHED; + op.open_flag |= O_NONBLOCK; + } + + if (!fixed) { + ret = __get_unused_fd_flags(open->how.flags, open->nofile); + if (ret < 0) + goto err; + } + + file = do_filp_open(open->dfd, open->filename, &op); + if (IS_ERR(file)) { + /* + * We could hang on to this 'fd' on retrying, but seems like + * marginal gain for something that is now known to be a slower + * path. So just put it, and we'll get a new one when we retry. + */ + if (!fixed) + put_unused_fd(ret); + + ret = PTR_ERR(file); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if (ret == -EAGAIN && + (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) + return -EAGAIN; + goto err; + } + + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; + fsnotify_open(file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_fixed_fd_install(req, issue_flags, file, + open->file_slot); +err: + putname(open->filename); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_openat(struct io_kiocb *req, unsigned int issue_flags) +{ + return io_openat2(req, issue_flags); +} + +void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = -ENXIO; + if (unlikely(!ctx->file_data)) + goto out; + ret = -EINVAL; + if (offset >= ctx->nr_user_files) + goto out; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto out; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + ret = -EBADF; + if (!file_slot->file_ptr) + goto out; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + goto out; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); + io_rsrc_node_switch(ctx, ctx->file_data); + ret = 0; +out: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); +} + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) + return -EBADF; + + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) + return -EINVAL; + + return 0; +} + +int io_close(struct io_kiocb *req, unsigned int issue_flags) +{ + struct files_struct *files = current->files; + struct io_close *close = io_kiocb_to_cmd(req); + struct fdtable *fdt; + struct file *file; + int ret = -EBADF; + + if (close->file_slot) { + ret = io_close_fixed(req, issue_flags); + goto err; + } + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (close->fd >= fdt->max_fds) { + spin_unlock(&files->file_lock); + goto err; + } + file = rcu_dereference_protected(fdt->fd[close->fd], + 
lockdep_is_held(&files->file_lock)); + if (!file || io_is_uring_fops(file)) { + spin_unlock(&files->file_lock); + goto err; + } + + /* if the file has a flush method, be safe and punt to async */ + if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { + spin_unlock(&files->file_lock); + return -EAGAIN; + } + + file = __close_fd_get_file(close->fd); + spin_unlock(&files->file_lock); + if (!file) + goto err; + + /* No ->flush() or already async, safely close from here */ + ret = filp_close(file, current->files); +err: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h new file mode 100644 index 00000000000000..9f578f3fad870d --- /dev/null +++ b/io_uring/openclose.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat(struct io_kiocb *req, unsigned int issue_flags); +void io_open_cleanup(struct io_kiocb *req); + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat2(struct io_kiocb *req, unsigned int issue_flags); + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_close(struct io_kiocb *req, unsigned int issue_flags); From 99f15d8d61364299ae780cc739c74068a6d2538d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 05:59:19 -0600 Subject: [PATCH 035/172] io_uring: move uring_cmd handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 127 ++----------------------------------------- io_uring/io_uring.h | 9 +++ io_uring/uring_cmd.c | 115 +++++++++++++++++++++++++++++++++++++++ io_uring/uring_cmd.h | 13 +++++ 5 files changed, 142 insertions(+), 124 deletions(-) create mode 100644 io_uring/uring_cmd.c create mode 100644 io_uring/uring_cmd.h diff --git a/io_uring/Makefile b/io_uring/Makefile index e60def39ca2c6a..2e2cbeb272a890 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o + openclose.o uring_cmd.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a79186ba8c44ec..469d89f3cf0cd1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -99,6 +99,7 @@ #include "sync.h" #include "advise.h" #include "openclose.h" +#include "uring_cmd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -473,14 +474,6 @@ struct io_cancel_data { int seq; }; -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. 
- */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -988,11 +981,6 @@ static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, return matched; } -static inline bool req_has_async_data(struct io_kiocb *req) -{ - return req->flags & REQ_F_ASYNC_DATA; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1743,7 +1731,7 @@ static void io_req_complete_post(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; @@ -2151,7 +2139,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -static void io_req_task_work_add(struct io_kiocb *req) +void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -3268,7 +3256,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } -static inline bool io_alloc_async_data(struct io_kiocb *req) +bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); @@ -3714,111 +3702,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - ioucmd->task_work_cb(ioucmd); -} - -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) -{ - struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); - -static inline void io_req_set_cqe32_extra(struct io_kiocb *req, - u64 extra1, u64 extra2) -{ - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; -} - -/* - * Called by consumers of io_uring_cmd, if they originally returned - * -EIOCBQUEUED upon receiving the command. 
- */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) -{ - struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - - if (ret < 0) - req_set_fail(req); - - io_req_set_res(req, 0, ret); - if (req->ctx->flags & IORING_SETUP_CQE32) - io_req_set_cqe32_extra(req, res2, 0); - __io_req_complete(req, 0); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_async(struct io_kiocb *req) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - size_t cmd_size; - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, ioucmd->cmd, cmd_size); - return 0; -} - -static int io_uring_cmd_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - if (sqe->rw_flags || sqe->__pad1) - return -EINVAL; - ioucmd->cmd = sqe->cmd; - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; -} - -static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (!req->file->f_op->uring_cmd) - return -EOPNOTSUPP; - - if (ctx->flags & IORING_SETUP_SQE128) - issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) - issue_flags |= IO_URING_F_CQE32; - if (ctx->flags & IORING_SETUP_IOPOLL) - issue_flags |= IO_URING_F_IOPOLL; - - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - - ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) { - io_uring_cmd_done(ioucmd, ret, 0); - return IOU_OK; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11533,8 +11416,6 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); BUG_ON(!io_op_defs[i].issue); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ebb225e8501251..6a07e902120ab3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -25,6 +25,11 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void io_put_file(struct file *file) { if (file) @@ -53,6 +58,8 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -65,5 +72,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); +bool io_alloc_async_data(struct io_kiocb *req); +void io_req_task_work_add(struct io_kiocb *req); #endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c new file mode 100644 index 00000000000000..abf78918a0995f --- /dev/null +++ b/io_uring/uring_cmd.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/io_uring.h> +
+#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "uring_cmd.h" + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + ioucmd->task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +static inline void io_req_set_cqe32_extra(struct io_kiocb *req, + u64 extra1, u64 extra2) +{ + req->extra1 = extra1; + req->extra2 = extra2; + req->flags |= REQ_F_CQE32_INIT; +} + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, 0, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + io_req_set_cqe32_extra(req, res2, 0); + __io_req_complete(req, 0); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, ioucmd->cmd, cmd_size); + return 0; +} + +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + if (sqe->rw_flags || sqe->__pad1) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) { + io_uring_cmd_done(ioucmd, ret, 0); + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h new file mode 100644 index 00000000000000..7c6697d13cb2e4 --- /dev/null +++ b/io_uring/uring_cmd.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_uring_cmd_prep_async(struct io_kiocb *req); + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used.
+ */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) From 4cf90495281b43f5f597ef4a9abcc83a63973571 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:04:14 -0600 Subject: [PATCH 036/172] io_uring: add a dummy -EOPNOTSUPP prep handler Add it and use it for the epoll handling, if epoll isn't configured. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 469d89f3cf0cd1..15d0377b7d9d86 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4034,10 +4034,16 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +#if defined(CONFIG_EPOLL) static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_EPOLL) struct io_epoll *epoll = io_kiocb_to_cmd(req); if (sqe->buf_index || sqe->splice_fd_in) @@ -4056,14 +4062,10 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, } return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { -#if defined(CONFIG_EPOLL) struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -4076,10 +4078,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; -#else - return -EOPNOTSUPP; -#endif } +#endif static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11246,8 +11246,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, +#if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_SPLICE] = { .needs_file = 1, @@ -11418,7 +11422,8 @@ static int __init io_uring_init(void) for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); - BUG_ON(!io_op_defs[i].issue); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); } req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | From a9c210cebe13c36487a239ae7f4671a389fed127 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:09:18 -0600 Subject: [PATCH 037/172] io_uring: move epoll handler to its own file Would be nice to sort out Kconfig for this and don't even compile epoll.c if we don't have epoll configured. 
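One possible shape for that, sketched here purely as an assumption about what the Kconfig-aware Makefile could look like rather than anything this patch does, is to make epoll.o conditional on CONFIG_EPOLL:

  # hypothetical io_uring/Makefile fragment, not part of this series:
  # only build the epoll opcode handling when epoll itself is enabled
  ifeq ($(CONFIG_EPOLL),y)
  obj-$(CONFIG_IO_URING) += epoll.o
  endif

With something like that in place, the #if defined(CONFIG_EPOLL) guards inside epoll.c could go away, and io_op_defs[] would simply fall back to the dummy -EOPNOTSUPP prep handler added in the previous patch.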
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/epoll.c | 62 +++++++++++++++++++++++++++++++++++++++ io_uring/epoll.h | 6 +++++ io_uring/io_uring.c | 50 +----------------------------------- 4 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 io_uring/epoll.c create mode 100644 io_uring/epoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e2cbeb272a890..59e70f2d8c56d4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o + openclose.o uring_cmd.o epoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/epoll.c b/io_uring/epoll.c new file mode 100644 index 00000000000000..acbb32498127ad --- /dev/null +++ b/io_uring/epoll.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/io_uring.h> +#include <linux/eventpoll.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "epoll.h" + +#if defined(CONFIG_EPOLL) +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll *epoll = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(epoll->op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +} + +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll *ie = io_kiocb_to_cmd(req); + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/epoll.h b/io_uring/epoll.h new file mode 100644 index 00000000000000..870cce11ba982d --- /dev/null +++ b/io_uring/epoll.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +#if defined(CONFIG_EPOLL) +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 15d0377b7d9d86..78828e294e53b7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -100,6 +100,7 @@ #include "advise.h" #include "openclose.h" #include "uring_cmd.h" +#include "epoll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -374,14 +375,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_epoll { - struct file *file; - int epfd; - int op; - int fd; - struct epoll_event event; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -4040,47 +4033,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_EPOLL) -static int io_epoll_ctl_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_epoll *epoll = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - - epoll->epfd = READ_ONCE(sqe->fd); - epoll->op = READ_ONCE(sqe->len); - epoll->fd = READ_ONCE(sqe->off); - - if (ep_op_has_event(epoll->op)) { - struct
epoll_event __user *ev; - - ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&epoll->event, ev, sizeof(*ev))) - return -EFAULT; - } - - return 0; -} - -static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_epoll *ie = io_kiocb_to_cmd(req); - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#endif - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From e0da14def1ee0a9cd9c88893321e9a3d900f8e23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:12:18 -0600 Subject: [PATCH 038/172] io_uring: move statx handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 62 +------------------------------------ io_uring/statx.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/statx.h | 5 +++ 4 files changed, 82 insertions(+), 62 deletions(-) create mode 100644 io_uring/statx.c create mode 100644 io_uring/statx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 59e70f2d8c56d4..de953c022c6ebd 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o + openclose.o uring_cmd.o epoll.o \ + statx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 78828e294e53b7..eb01d1aadeb486 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -101,6 +101,7 @@ #include "openclose.h" #include "uring_cmd.h" #include "epoll.h" +#include "statx.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -384,15 +385,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_statx { - struct file *file; - int dfd; - unsigned int mask; - unsigned int flags; - struct filename *filename; - struct statx __user *buffer; -}; - struct io_shutdown { struct file *file; int how; @@ -4033,58 +4025,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - const char __user *path; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - sx->dfd = READ_ONCE(sqe->fd); - sx->mask = READ_ONCE(sqe->len); - path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sx->flags = READ_ONCE(sqe->statx_flags); - - sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); - - if (IS_ERR(sx->filename)) { - int ret = PTR_ERR(sx->filename); - - sx->filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_statx(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_statx_cleanup(struct io_kiocb *req) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - - if (sx->filename) - putname(sx->filename); -} 
-
 #if defined(CONFIG_NET)
 static int io_shutdown_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
diff --git a/io_uring/statx.c b/io_uring/statx.c
new file mode 100644
index 00000000000000..83b15687e9c5ee
--- /dev/null
+++ b/io_uring/statx.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+
+#include
+
+#include "../fs/internal.h"
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "statx.h"
+
+struct io_statx {
+	struct file			*file;
+	int				dfd;
+	unsigned int			mask;
+	unsigned int			flags;
+	struct filename			*filename;
+	struct statx __user		*buffer;
+};
+
+int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+	const char __user *path;
+
+	if (sqe->buf_index || sqe->splice_fd_in)
+		return -EINVAL;
+	if (req->flags & REQ_F_FIXED_FILE)
+		return -EBADF;
+
+	sx->dfd = READ_ONCE(sqe->fd);
+	sx->mask = READ_ONCE(sqe->len);
+	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	sx->flags = READ_ONCE(sqe->statx_flags);
+
+	sx->filename = getname_flags(path,
+				     getname_statx_lookup_flags(sx->flags),
+				     NULL);
+
+	if (IS_ERR(sx->filename)) {
+		int ret = PTR_ERR(sx->filename);
+
+		sx->filename = NULL;
+		return ret;
+	}
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+int io_statx(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+	int ret;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
+void io_statx_cleanup(struct io_kiocb *req)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+
+	if (sx->filename)
+		putname(sx->filename);
+}
diff --git a/io_uring/statx.h b/io_uring/statx.h
new file mode 100644
index 00000000000000..9a17f4d45a7d5f
--- /dev/null
+++ b/io_uring/statx.h
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_statx(struct io_kiocb *req, unsigned int issue_flags);
+void io_statx_cleanup(struct io_kiocb *req);

From f9ead18c10589a351f395ac5aa107360f2f6ce53 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 25 May 2022 06:25:13 -0600
Subject: [PATCH 039/172] io_uring: split network-related opcodes into their
 own file

While at it, convert the handlers to just use io_eopnotsupp_prep() if
CONFIG_NET isn't set.
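[Editor's illustration, not part of the commit: the user-visible effect of
the conversion. On a CONFIG_NET=y kernel the opcodes behave as before; on
a kernel without CONFIG_NET the stub prep completes the SQE with
-EOPNOTSUPP, the same result the old IO_NETOP_* stubs produced. A minimal
userspace check using liburing (assuming liburing >= 2.0 for
io_uring_prep_shutdown(); error handling elided):

	#include <liburing.h>
	#include <sys/socket.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		int fds[2];

		socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
		io_uring_queue_init(8, &ring, 0);

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_shutdown(sqe, fds[0], SHUT_WR);
		io_uring_submit(&ring);

		io_uring_wait_cqe(&ring, &cqe);
		/* cqe->res: 0 on success, -EOPNOTSUPP without CONFIG_NET */
		printf("shutdown res = %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
		io_uring_queue_exit(&ring);
		return 0;
	}
]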
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 873 ++------------------------------------------ io_uring/io_uring.h | 22 ++ io_uring/net.c | 779 +++++++++++++++++++++++++++++++++++++++ io_uring/net.h | 43 +++ 5 files changed, 884 insertions(+), 835 deletions(-) create mode 100644 io_uring/net.c create mode 100644 io_uring/net.h diff --git a/io_uring/Makefile b/io_uring/Makefile index de953c022c6ebd..c9ec1bbabfbd36 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o + statx.o net.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eb01d1aadeb486..cbc20985cd6f34 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -102,6 +102,7 @@ #include "uring_cmd.h" #include "epoll.h" #include "statx.h" +#include "net.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -131,8 +132,6 @@ #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_mapped_ubuf { @@ -295,25 +294,6 @@ struct io_timeout_data { u32 flags; }; -struct io_accept { - struct file *file; - struct sockaddr __user *addr; - int __user *addr_len; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_socket { - struct file *file; - int domain; - int type; - int protocol; - int flags; - u32 file_slot; - unsigned long nofile; -}; - struct io_cancel { struct file *file; u64 addr; @@ -350,25 +330,6 @@ struct io_rw { rwf_t flags; }; -struct io_connect { - struct file *file; - struct sockaddr __user *addr; - int addr_len; -}; - -struct io_sr_msg { - struct file *file; - union { - struct compat_msghdr __user *umsg_compat; - struct user_msghdr __user *umsg; - void __user *buf; - }; - int msg_flags; - size_t len; - size_t done_io; - unsigned int flags; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -385,30 +346,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_shutdown { - struct file *file; - int how; -}; - struct io_msg { struct file *file; u64 user_data; u32 len; }; -struct io_async_connect { - struct sockaddr_storage address; -}; - -struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -517,9 +460,6 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); @@ -808,8 +748,7 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { 
unsigned int cflags; @@ -1291,12 +1230,6 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1418,7 +1351,7 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * 1:1 relationship between how many times this function is called (and * hence the eventfd count) and number of CQEs posted to the CQ ring. */ -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -1639,8 +1572,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, } } -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { struct io_uring_cqe *cqe; @@ -2980,8 +2913,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return u64_to_user_ptr(buf->addr); } -static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -3073,13 +3006,6 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -4025,755 +3951,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_NET) -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - shutdown->how = READ_ONCE(sqe->len); - return 0; -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) -{ - struct io_async_msghdr *async_msg = req->async_data; - - if (async_msg) - return -EAGAIN; - if (io_alloc_async_data(req)) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - async_msg = req->async_data; - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - async_msg->msg.msg_name = &async_msg->addr; - /* if were using fast_iov, set it to the new one */ - if 
(!async_msg->free_iov) - async_msg->msg.msg_iter.iov = async_msg->fast_iov; - - return -EAGAIN; -} - -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, - &iomsg->free_iov); -} - -static int io_sendmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_sendmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} - -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if (unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct iovec iov; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags 
& MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; - int ret; - - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) - return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; - int ret; - - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); - if (ret) - return ret; - - uiov = compat_ptr(ptr); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) - return ret; - } - - return 0; -} -#endif - -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif - - return __io_recvmsg_copy_hdr(req, iomsg); -} - -static int io_recvmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_recvmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if (unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - if (sr->msg_flags & MSG_ERRQUEUE) - req->flags |= REQ_F_CLEAR_POLLIN; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = 
io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); - } - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { - req_set_fail(req); - } - - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct socket *sock; - struct iovec iov; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - } - - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - ret = sock_recvmsg(sock, &msg, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { -out_free: - req_set_fail(req); - } - - if (ret >= 0) - ret += sr->done_io; - else if 
(sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_accept *accept = io_kiocb_to_cmd(req); - unsigned flags; - - if (sqe->len || sqe->buf_index) - return -EINVAL; - - accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - accept->flags = READ_ONCE(sqe->accept_flags); - accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) - return -EINVAL; - - accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot) { - if (accept->flags & SOCK_CLOEXEC) - return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && - accept->file_slot != IORING_FILE_INDEX_ALLOC) - return -EINVAL; - } - if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) - accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) - req->flags |= REQ_F_APOLL_MULTISHOT; - return 0; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = io_kiocb_to_cmd(req); - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - bool fixed = !!accept->file_slot; - struct file *file; - int ret, fd; - -retry: - if (!fixed) { - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, - accept->flags); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if ((req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) - ret = IOU_ISSUE_SKIP_COMPLETE; - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - accept->file_slot); - } - - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - - return ret; -} - -static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_socket *sock = io_kiocb_to_cmd(req); - - if (sqe->addr || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - - sock->domain = READ_ONCE(sqe->fd); - sock->type = READ_ONCE(sqe->off); - sock->protocol = READ_ONCE(sqe->len); - sock->file_slot = READ_ONCE(sqe->file_index); - sock->nofile = rlimit(RLIMIT_NOFILE); - - sock->flags = sock->type & ~SOCK_TYPE_MASK; - if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) - return -EINVAL; - if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - return 0; -} - -static int io_socket(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_socket *sock = 
io_kiocb_to_cmd(req); - bool fixed = !!sock->file_slot; - struct file *file; - int ret, fd; - - if (!fixed) { - fd = __get_unused_fd_flags(sock->flags, sock->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = __sys_socket_file(sock->domain, sock->type, sock->protocol); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - sock->file_slot); - } - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_connect *conn = io_kiocb_to_cmd(req); - - if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - conn->addr_len = READ_ONCE(sqe->addr2); - return 0; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_connect *connect = io_kiocb_to_cmd(req); - struct io_async_connect __io, *io; - unsigned file_flags; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - - file_flags = force_nonblock ? O_NONBLOCK : 0; - - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); - if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; -out: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#else /* !CONFIG_NET */ -#define IO_NETOP_FN(op) \ -static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ -{ \ - return -EOPNOTSUPP; \ -} - -#define IO_NETOP_PREP(op) \ -IO_NETOP_FN(op) \ -static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ -{ \ - return -EOPNOTSUPP; \ -} \ - -#define IO_NETOP_PREP_ASYNC(op) \ -IO_NETOP_PREP(op) \ -static int io_##op##_prep_async(struct io_kiocb *req) \ -{ \ - return -EOPNOTSUPP; \ -} - -IO_NETOP_PREP_ASYNC(sendmsg); -IO_NETOP_PREP_ASYNC(recvmsg); -IO_NETOP_PREP_ASYNC(connect); -IO_NETOP_PREP(accept); -IO_NETOP_PREP(socket); -IO_NETOP_PREP(shutdown); -IO_NETOP_FN(send); -IO_NETOP_FN(recv); -#endif /* CONFIG_NET */ - struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; @@ -7874,8 +7051,8 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, return 0; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) __must_hold(&req->ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; @@ -10986,12 +10163,14 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, 
.ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = io_sendmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECVMSG] = { @@ -11000,12 +10179,14 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { @@ -11026,8 +10207,12 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ +#if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -11044,10 +10229,14 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -11117,8 +10306,12 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_RECV] = { .needs_file = 1, @@ -11127,8 +10320,12 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, @@ -11175,8 +10372,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, +#if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, @@ -11233,8 +10434,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SOCKET] = { .audit_skip = 1, +#if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_URING_CMD] = { .needs_file = 1, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6a07e902120ab3..4b46385720c585 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -58,13 +58,35 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + /* order cqe stores with ring update */ + smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); +} + void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags); +void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + struct file 
*io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, diff --git a/io_uring/net.c b/io_uring/net.c new file mode 100644 index 00000000000000..2434548d0c1fb3 --- /dev/null +++ b/io_uring/net.c @@ -0,0 +1,779 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "net.h" + +#if defined(CONFIG_NET) +struct io_shutdown { + struct file *file; + int how; +}; + +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + +struct io_sr_msg { + struct file *file; + union { + struct compat_msghdr __user *umsg_compat; + struct user_msghdr __user *umsg; + void __user *buf; + }; + int msg_flags; + size_t len; + size_t done_io; + unsigned int flags; +}; + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + shutdown->how = READ_ONCE(sqe->len); + return 0; +} + +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + struct socket *sock; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, shutdown->how); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + struct io_async_msghdr *async_msg = req->async_data; + + if (async_msg) + return -EAGAIN; + if (io_alloc_async_data(req)) { + kfree(kmsg->free_iov); + return -ENOMEM; + } + async_msg = req->async_data; + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(async_msg, kmsg, sizeof(*kmsg)); + async_msg->msg.msg_name = &async_msg->addr; + /* if were using fast_iov, set it to the new one */ + if (!async_msg->free_iov) + async_msg->msg.msg_iter.iov = async_msg->fast_iov; + + return -EAGAIN; +} + +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + iomsg->msg.msg_name = &iomsg->addr; + iomsg->free_iov = iomsg->fast_iov; + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); +} + +int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= 
REQ_F_NEED_CLEANUP; + return ret; +} + +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_send(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + 
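+/*
+ * (Editor's annotation, not part of the original patch.)
+ *
+ * Retry bookkeeping used by io_send() above and by io_sendmsg(): when a
+ * short transfer is resumable according to io_net_retry() (MSG_WAITALL
+ * on a stream or seqpacket socket), the bytes already sent are banked
+ * in sr->done_io, REQ_F_PARTIAL_IO is set, and -EAGAIN requeues the
+ * request so the next attempt resumes where this one stopped. At final
+ * completion, done_io is added back in (or returned outright if the
+ * last attempt failed), so userspace always sees the cumulative count.
+ */
+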
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + return -EINVAL; + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->free_iov, &iomsg->msg.msg_iter, + false); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->free_iov, + &iomsg->msg.msg_iter, true); + if (ret < 0) + return ret; + } + + return 0; +} +#endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->msg.msg_name = &iomsg->addr; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, iomsg); +#endif + + return __io_recvmsg_copy_hdr(req, iomsg); +} + +int io_recvmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + 
void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); + } + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } + + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_recv(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct socket *sock; + struct iovec iov; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + } + + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; + msg.msg_controllen = 0; + msg.msg_iocb = NULL; + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + ret = sock_recvmsg(sock, &msg, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +out_free: + req_set_fail(req); + } + + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_accept *accept = io_kiocb_to_cmd(req); + unsigned flags; + + if (sqe->len || sqe->buf_index) + return -EINVAL; + + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); + flags = 
READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; + return 0; +} + +int io_accept(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_accept *accept = io_kiocb_to_cmd(req); + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; + +retry: + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = IOU_ISSUE_SKIP_COMPLETE; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + + if (sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + sock->file_slot); 
+ } + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = io_kiocb_to_cmd(req); + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_connect *conn = io_kiocb_to_cmd(req); + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + return 0; +} + +int io_connect(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_connect *connect = io_kiocb_to_cmd(req); + struct io_async_connect __io, *io; + unsigned file_flags; + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (req_has_async_data(req)) { + io = req->async_data; + } else { + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, + &__io.address); + if (ret) + goto out; + io = &__io; + } + + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->address, + connect->addr_len, file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + if (req_has_async_data(req)) + return -EAGAIN; + if (io_alloc_async_data(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(req->async_data, &__io, sizeof(__io)); + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; +out: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/net.h b/io_uring/net.h new file mode 100644 index 00000000000000..81d71d1647704d --- /dev/null +++ b/io_uring/net.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#if defined(CONFIG_NET) +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + /* points to an allocated iov, if NULL we use fast_iov instead */ + struct iovec *free_iov; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); + +int io_sendmsg_prep_async(struct io_kiocb *req); +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); +int io_send(struct io_kiocb *req, unsigned int issue_flags); + +int io_recvmsg_prep_async(struct io_kiocb *req); +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); +int io_recv(struct io_kiocb *req, unsigned int issue_flags); + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_accept(struct io_kiocb *req, unsigned int issue_flags); + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_socket(struct io_kiocb *req, unsigned int issue_flags); + +int io_connect_prep_async(struct io_kiocb *req); +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_connect(struct io_kiocb *req, unsigned int issue_flags); +#endif From 36404b09aa609e00f8f0108356830c22b99b3cbf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:42:08 -0600 Subject: [PATCH 040/172] io_uring: move msg_ring into its own file 
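[Editor's illustration, not part of the commit: IORING_OP_MSG_RING posts a
raw CQE into another ring's completion queue, typically to wake or signal
the sibling thread that owns that ring. A sketch via liburing (assuming
liburing >= 2.2 for io_uring_prep_msg_ring(); target_fd, the other ring's
file descriptor, is illustrative):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* receiver observes an ordinary CQE: user_data == 0xcafe, res == 0 */
	io_uring_prep_msg_ring(sqe, target_fd, 0, 0xcafe, 0);
	io_uring_submit(&ring);
]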
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 55 +------------------------------------- io_uring/msg_ring.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/msg_ring.h | 4 +++ 4 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 io_uring/msg_ring.c create mode 100644 io_uring/msg_ring.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c9ec1bbabfbd36..d7cf992c841a38 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o + statx.o net.o msg_ring.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cbc20985cd6f34..a0173ab5178c95 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -103,6 +103,7 @@ #include "epoll.h" #include "statx.h" #include "net.h" +#include "msg_ring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -346,12 +347,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_msg { - struct file *file; - u64 user_data; - u32 len; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -3613,54 +3608,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_msg_ring_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) - return -EINVAL; - - msg->user_data = READ_ONCE(sqe->off); - msg->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; - bool filled; - int ret; - - ret = -EBADFD; - if (req->file->f_op != &io_uring_fops) - goto done; - - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); - ret = 0; - } - -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - /* put file to avoid an attempt to IOPOLL the req */ - io_put_file(req->file); - req->file = NULL; - return IOU_OK; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c new file mode 100644 index 00000000000000..3b89f9a0a0b459 --- /dev/null +++ b/io_uring/msg_ring.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "msg_ring.h" + +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) + return -EINVAL; + + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); + return 0; +} + +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *target_ctx; + bool filled; + int ret; + + ret = -EBADFD; + if (!io_is_uring_fops(req->file)) + goto done; + + ret = -EOVERFLOW; + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; + return IOU_OK; +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h new file mode 100644 index 00000000000000..fb9601f202d07d --- /dev/null +++ b/io_uring/msg_ring.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); From e418bbc97bffda868934acfdf8a1173ab044be69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:56:52 -0600 Subject: [PATCH 041/172] io_uring: move our reference counting into a header Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 +--------------------------------------- io_uring/refs.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 io_uring/refs.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0173ab5178c95..eea5282b1ca21b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "refs.h" #include "xattr.h" #include "nop.h" @@ -611,54 +612,12 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. 
- */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_REFCOUNT))) - return true; - - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) __io_submit_flush_completions(ctx); } -static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) -{ - if (!(req->flags & REQ_F_REFCOUNT)) { - req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, nr); - } -} - -static inline void io_req_set_refcount(struct io_kiocb *req) -{ - __io_req_set_refcount(req, 1); -} - #define IO_RSRC_REF_BATCH 100 static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) diff --git a/io_uring/refs.h b/io_uring/refs.h new file mode 100644 index 00000000000000..334c5ead4c43d9 --- /dev/null +++ b/io_uring/refs.h @@ -0,0 +1,48 @@ +#ifndef IOU_REQ_REF_H +#define IOU_REQ_REF_H + +#include +#include "io_uring_types.h" + +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, nr); + } +} + +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} +#endif From 59915143e89fb8dc7b5bd9dcaf628d8181fd54ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:57:27 -0600 Subject: [PATCH 042/172] io_uring: move timeout opcodes and handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 667 +------------------------------------- io_uring/io_uring.h | 13 +- io_uring/io_uring_types.h | 10 + io_uring/timeout.c | 634 ++++++++++++++++++++++++++++++++++++ io_uring/timeout.h | 35 ++ 6 files changed, 701 insertions(+), 660 deletions(-) create mode 100644 io_uring/timeout.c create mode 100644 io_uring/timeout.h diff --git a/io_uring/Makefile b/io_uring/Makefile index d7cf992c841a38..6ae4e45a15dbfc 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o + statx.o 
net.o msg_ring.o timeout.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eea5282b1ca21b..3fc59a22d54e89 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -105,6 +105,7 @@ #include "statx.h" #include "net.h" #include "msg_ring.h" +#include "timeout.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -288,14 +289,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_timeout_data { - struct io_kiocb *req; - struct hrtimer timer; - struct timespec64 ts; - enum hrtimer_mode mode; - u32 flags; -}; - struct io_cancel { struct file *file; u64 addr; @@ -303,27 +296,6 @@ struct io_cancel { s32 fd; }; -struct io_timeout { - struct file *file; - u32 off; - u32 target_seq; - struct list_head list; - /* head of the link, used by linked timeouts only */ - struct io_kiocb *head; - /* for linked completions */ - struct io_kiocb *prev; -}; - -struct io_timeout_rem { - struct file *file; - u64 addr; - - /* timeout update */ - struct timespec64 ts; - u32 flags; - bool ltimeout; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -388,16 +360,6 @@ struct io_defer_entry { u32 seq; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -436,7 +398,6 @@ static const struct io_op_def io_op_defs[]; #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, @@ -444,7 +405,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); -static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); @@ -456,9 +416,7 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; @@ -609,9 +567,6 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) } } -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) @@ -803,24 +758,6 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) io_ring_submit_unlock(ctx, issue_flags); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *req; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - static bool io_match_linked(struct io_kiocb *head) { struct 
io_kiocb *req; @@ -877,13 +814,6 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } -static inline bool io_is_timeout_noseq(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - return !timeout->off; -} - static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -1120,24 +1050,6 @@ static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_timeout_data *io = req->async_data; - - if (hrtimer_try_to_cancel(&io->timer) != -1) { - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - if (status) - req_set_fail(req); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); - } -} - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { @@ -1152,38 +1064,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) -{ - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_timeout *timeout, *tmp; - - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - u32 events_needed, events_got; - - if (io_is_timeout_noseq(req)) - break; - - /* - * Since seq can easily wrap around over time, subtract - * the last seq at which timeouts were flushed before comparing. - * Assuming not more than 2^31-1 events have happened since, - * these subtractions won't have wrapped, so we can check if - * target is in [last_seq, current_seq] by comparing the two. 
- */ - events_needed = timeout->target_seq - ctx->cq_last_tm_flush; - events_got = seq - ctx->cq_last_tm_flush; - if (events_got < events_needed) - break; - - io_kill_timeout(req, 0); - } - ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1585,14 +1465,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req) +void __io_req_complete_post(struct io_kiocb *req) { if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1717,7 +1597,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void io_free_req(struct io_kiocb *req) +__cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1731,96 +1611,6 @@ static __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } -static inline void io_remove_next_linked(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - req->link = nxt->link; - nxt->link = NULL; -} - -static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - struct io_timeout_data *io = link->async_data; - struct io_timeout *timeout = io_kiocb_to_cmd(link); - - io_remove_next_linked(req); - timeout->head = NULL; - if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&timeout->list); - return link; - } - } - return NULL; -} - -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; - while (link) { - long res = -ECANCELED; - - if (link->flags & REQ_F_FAIL) - res = link->cqe.res; - - nxt = link->link; - link->link = NULL; - - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); - - if (ignore_cqes) - link->flags |= REQ_F_CQE_SKIP; - else - link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; - } -} - -static bool io_disarm_next(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *link = NULL; - bool posted = false; - - if (req->flags & REQ_F_ARM_LTIMEOUT) { - link = req->link; - req->flags &= ~REQ_F_ARM_LTIMEOUT; - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); - posted = true; - } - } else if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); - if (link) { - posted = true; - io_req_tw_post_queue(link, -ECANCELED, 0); - } - } - if (unlikely((req->flags & REQ_F_FAIL) && - !(req->flags & REQ_F_HARDLINK))) { - posted |= (req->link != NULL); - io_fail_links(req); - } - return posted; -} - static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2033,7 +1823,7 @@ static void io_req_tw_post(struct io_kiocb *req, bool *locked) io_req_complete_post(req); } -static 
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; @@ -2057,7 +1847,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, -EFAULT); } -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; @@ -2076,7 +1866,7 @@ static void io_req_task_queue_reissue(struct io_kiocb *req) io_req_task_work_add(req); } -static void io_queue_next(struct io_kiocb *req) +void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2177,14 +1967,6 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) return nxt; } -static inline void io_put_req(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - io_queue_next(req); - io_free_req(req); - } -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -2451,7 +2233,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) +inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { req->cqe.flags |= io_put_kbuf(req, 0); @@ -4600,334 +4382,6 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *req = data->req; - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) - req_set_fail(req); - - io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout *timeout; - struct io_timeout_data *io; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, &ctx->timeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; - } - req = tmp; - break; - } - if (!req) - return ERR_PTR(-ENOENT); - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return ERR_PTR(-EALREADY); - timeout = io_kiocb_to_cmd(req); - list_del_init(&timeout->list); - return req; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); - - if (IS_ERR(req)) - return PTR_ERR(req); - io_req_task_queue_fail(req, -ECANCELED); - return 0; -} - -static clockid_t 
io_timeout_get_clock(struct io_timeout_data *data) -{ - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } -} - -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_timeout *timeout; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, &ctx->ltimeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (user_data == tmp->cqe.user_data) { - req = tmp; - break; - } - } - if (!req) - return -ENOENT; - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_cancel_data cd = { .data = user_data, }; - struct io_kiocb *req = io_timeout_extract(ctx, &cd); - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - - if (IS_ERR(req)) - return PTR_ERR(req); - - timeout->off = 0; /* noseq */ - data = req->async_data; - list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->buf_index || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - tr->ltimeout = false; - tr->addr = READ_ONCE(sqe->addr); - tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { - if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) - tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; - } else if (tr->flags) { - /* timeout removal doesn't support flags */ - return -EINVAL; - } - - return 0; -} - -static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) -{ - return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS - : HRTIMER_MODE_REL; -} - -/* - * Remove or update an existing timeout command - */ -static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - int ret; - - if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; - - spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, &cd); - spin_unlock(&ctx->completion_lock); - } else { - enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - - spin_lock_irq(&ctx->timeout_lock); - if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - bool is_timeout_link) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - unsigned flags; - u32 off = READ_ONCE(sqe->off); - - if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) - return -EINVAL; - if (off && is_timeout_link) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) - return -EINVAL; - /* more than one clock specified is invalid, obviously */ - if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; - - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -ENOMEM; - - data = req->async_data; - data->req = req; - data->flags = flags; - - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); - - if (is_timeout_link) { - struct io_submit_link *link = &req->ctx->submit_state.link; - - if (!link->head) - return -EINVAL; - if (link->last->opcode == IORING_OP_LINK_TIMEOUT) - return -EINVAL; - timeout->head = link->last; - link->last->flags |= REQ_F_ARM_LTIMEOUT; - } - return 0; -} - -static int io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, false); -} - -static int io_link_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, true); -} - -static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = req->async_data; - struct list_head *entry; - u32 tail, off = timeout->off; - - spin_lock_irq(&ctx->timeout_lock); - - /* - * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. If it isn't set, then this is - * a pure timeout request, sequence isn't used. - */ - if (io_is_timeout_noseq(req)) { - entry = ctx->timeout_list.prev; - goto add; - } - - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - timeout->target_seq = tail + off; - - /* Update the last seq here in case io_flush_timeouts() hasn't. 
- * This is safe because ->completion_lock is held, and submissions - * and completions are never mixed in the same ->completion_lock section. - */ - ctx->cq_last_tm_flush = tail; - - /* - * Insertion sort, ensuring the first entry in the list is always - * the one we need first. - */ - list_for_each_prev(entry, &ctx->timeout_list) { - struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); - struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); - - if (io_is_timeout_noseq(nxt)) - continue; - /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nextt->target_seq - tail) - break; - } -add: - list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); - return IOU_ISSUE_SKIP_COMPLETE; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4979,7 +4433,7 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5462,84 +4916,6 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; - - if (prev) { - if (!(req->task->flags & PF_EXITING)) { - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = prev->cqe.user_data, - }; - - ret = io_try_cancel(req, &cd); - } - io_req_set_res(req, ret ?: -ETIME, 0); - io_req_complete_post(req); - io_put_req(prev); - } else { - io_req_set_res(req, -ETIME, 0); - io_req_complete_post(req); - } -} - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *prev, *req = data->req; - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = timeout->head; - timeout->head = NULL; - - /* - * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. 
- */ - if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) - prev = NULL; - } - list_del(&timeout->list); - timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - /* - * If the back reference is NULL, then our linked request finished - * before we got a chance to setup the timer - */ - if (timeout->head) { - struct io_timeout_data *data = req->async_data; - - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); - list_add_tail(&timeout->list, &ctx->ltimeout_list); - } - spin_unlock_irq(&ctx->timeout_lock); - /* drop submission reference */ - io_put_req(req); -} - static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { @@ -8116,31 +7492,6 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } -/* Returns true if we found and killed one or more timeouts */ -static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct io_timeout *timeout, *tmp; - int canceled = 0; - - spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); - return canceled != 0; -} - static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4b46385720c585..e285e12ccbdbb1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,7 +65,8 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) } void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); - +void io_req_complete_post(struct io_kiocb *req); +void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); @@ -96,5 +97,15 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_req_task_queue_fail(struct io_kiocb *req, int ret); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); + +void io_free_req(struct io_kiocb *req); +void io_queue_next(struct io_kiocb *req); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) #endif diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index dba72113c59d67..349524907b6bfc 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -488,4 +488,14 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; 
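+	/* ->seq tags requests already visited by an ALL/ANY cancel pass */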
+}; + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c new file mode 100644 index 00000000000000..5e42bfcd683e2a --- /dev/null +++ b/io_uring/timeout.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "timeout.h" + +struct io_timeout { + struct file *file; + u32 off; + u32 target_seq; + struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; +}; + +struct io_timeout_rem { + struct file *file; + u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; + bool ltimeout; +}; + +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; +} + +static inline void io_put_req(struct io_kiocb *req) +{ + if (req_ref_put_and_test(req)) { + io_queue_next(req); + io_free_req(req); + } +} + +static void io_kill_timeout(struct io_kiocb *req, int status) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = req->async_data; + + if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + if (status) + req_set_fail(req); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + list_del_init(&timeout->list); + io_req_tw_post_queue(req, status, 0); + } +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) +{ + u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_timeout *timeout, *tmp; + + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + u32 events_needed, events_got; + + if (io_is_timeout_noseq(req)) + break; + + /* + * Since seq can easily wrap around over time, subtract + * the last seq at which timeouts were flushed before comparing. + * Assuming not more than 2^31-1 events have happened since, + * these subtractions won't have wrapped, so we can check if + * target is in [last_seq, current_seq] by comparing the two. 
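+		 *
+		 * Worked u32 example: with cq_last_tm_flush == 0xfffffff0,
+		 * target_seq == 4 and seq == 8, events_needed is 20 and
+		 * events_got is 24, so the timeout still fires even though
+		 * both values wrapped past zero.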
+ */ + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; + events_got = seq - ctx->cq_last_tm_flush; + if (events_got < events_needed) + break; + + io_kill_timeout(req, 0); + } + ctx->cq_last_tm_flush = seq; + spin_unlock_irq(&ctx->timeout_lock); +} + +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + req->link = NULL; + while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->cqe.res; + + nxt = link->link; + link->link = NULL; + + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, + req->opcode, link); + + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else + link->flags &= ~REQ_F_CQE_SKIP; + io_req_set_res(link, res, 0); + __io_req_complete_post(link); + link = nxt; + } +} + +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + +bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = NULL; + bool posted = false; + + if (req->flags & REQ_F_ARM_LTIMEOUT) { + link = req->link; + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_req_tw_post_queue(link, -ECANCELED, 0); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + link = io_disarm_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } + } + if (unlikely((req->flags & REQ_F_FAIL) && + !(req->flags & REQ_F_HARDLINK))) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; +} + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); + + io_remove_next_linked(req); + timeout->head = NULL; + if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&timeout->list); + return link; + } + + return NULL; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + list_del_init(&timeout->list); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); + + io_req_set_res(req, -ETIME, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout *timeout; + struct io_timeout_data *io; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != tmp->cqe.user_data) + continue; + if (cd->flags & 
(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == tmp->work.cancel_seq) + continue; + tmp->work.cancel_seq = cd->seq; + } + req = tmp; + break; + } + if (!req) + return ERR_PTR(-ENOENT); + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return ERR_PTR(-EALREADY); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); + return req; +} + +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); + + if (IS_ERR(req)) + return PTR_ERR(req); + io_req_task_queue_fail(req, -ECANCELED); + return 0; +} + +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; + int ret = -ENOENT; + + if (prev) { + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); + io_put_req(prev); + } else { + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); + } +} + +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + prev = timeout->head; + timeout->head = NULL; + + /* + * We don't expect the list to be empty, that will only happen if we + * race with the completion of the linked work. 
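+	 * If we do lose that race, prev is left NULL and
+	 * io_req_task_link_timeout() completes this request with -ETIME.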
+ */ + if (prev) { + io_remove_next_linked(prev); + if (!req_ref_inc_not_zero(prev)) + prev = NULL; + } + list_del(&timeout->list); + timeout->prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (user_data == tmp->cqe.user_data) { + req = tmp; + break; + } + } + if (!req) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + + if (IS_ERR(req)) + return PTR_ERR(req); + + timeout->off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&timeout->list, &ctx->timeout_list); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + tr->ltimeout = false; + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + return -EINVAL; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ + return -EINVAL; + } + + return 0; +} + +static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) +{ + return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS + : HRTIMER_MODE_REL; +} + +/* + * Remove or update an existing timeout command + */ +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + + spin_lock(&ctx->completion_lock); + ret = io_timeout_cancel(ctx, &cd); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static int __io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool is_timeout_link) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + unsigned flags; + u32 off = READ_ONCE(sqe->off); + + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) + return -EINVAL; + if (off && is_timeout_link) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; + if (unlikely(off && !req->ctx->off_timeout_used)) + req->ctx->off_timeout_used = true; + + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -ENOMEM; + + data = req->async_data; + data->req = req; + data->flags = flags; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + data->mode = io_translate_timeout_mode(flags); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + timeout->head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } + return 0; +} + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, false); +} + +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, true); +} + +int io_timeout(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data = req->async_data; + struct list_head *entry; + u32 tail, off = timeout->off; + + spin_lock_irq(&ctx->timeout_lock); + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. If it isn't set, then this is + * a pure timeout request, sequence isn't used. + */ + if (io_is_timeout_noseq(req)) { + entry = ctx->timeout_list.prev; + goto add; + } + + tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + timeout->target_seq = tail + off; + + /* Update the last seq here in case io_flush_timeouts() hasn't. 
+ * This is safe because ->completion_lock is held, and submissions + * and completions are never mixed in the same ->completion_lock section. + */ + ctx->cq_last_tm_flush = tail; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); + + if (io_is_timeout_noseq(nxt)) + continue; + /* nxt.seq is behind @tail, otherwise would've been completed */ + if (off >= nextt->target_seq - tail) + break; + } +add: + list_add(&timeout->list, entry); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return IOU_ISSUE_SKIP_COMPLETE; +} + +void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + /* + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer + */ + if (timeout->head) { + struct io_timeout_data *data = req->async_data; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); + list_add_tail(&timeout->list, &ctx->ltimeout_list); + } + spin_unlock_irq(&ctx->timeout_lock); + /* drop submission reference */ + io_put_req(req); +} + +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* Returns true if we found and killed one or more timeouts */ +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct io_timeout *timeout, *tmp; + int canceled = 0; + + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + + if (io_match_task(req, tsk, cancel_all)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + spin_unlock_irq(&ctx->timeout_lock); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} diff --git a/io_uring/timeout.h b/io_uring/timeout.h new file mode 100644 index 00000000000000..dd7cfb0d936671 --- /dev/null +++ b/io_uring/timeout.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_timeout_data { + struct io_kiocb *req; + struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 flags; +}; + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + +static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link = req->link; + + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) + return __io_disarm_linked_timeout(req, link); + + return NULL; +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx); +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all); +void io_queue_linked_timeout(struct io_kiocb *req); +bool 
io_disarm_next(struct io_kiocb *req); + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout(struct io_kiocb *req, unsigned int issue_flags); +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags); From 17437f311490d873a5157f65a84317d16270fd38 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 09:13:39 -0600 Subject: [PATCH 043/172] io_uring: move SQPOLL related handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 467 +------------------------------------------- io_uring/io_uring.h | 34 ++++ io_uring/sqpoll.c | 426 ++++++++++++++++++++++++++++++++++++++++ io_uring/sqpoll.h | 29 +++ 5 files changed, 497 insertions(+), 462 deletions(-) create mode 100644 io_uring/sqpoll.c create mode 100644 io_uring/sqpoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 6ae4e45a15dbfc..c59a9ca74262a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o + statx.o net.o msg_ring.o timeout.o \ + sqpoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3fc59a22d54e89..17c555aa03bc9c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "sqpoll.h" #include "xattr.h" #include "nop.h" @@ -109,7 +110,6 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) @@ -214,31 +214,6 @@ struct io_buffer { __u16 bgid; }; -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -402,7 +377,6 @@ static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1079,13 +1053,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline bool io_sqring_full(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -1974,28 +1941,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* make sure SQ 
entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; -} - -static inline bool io_run_task_work(void) -{ - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - - return false; -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = BLK_POLL_NOSLEEP; @@ -5297,7 +5243,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) +int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); @@ -5349,173 +5295,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -static inline bool io_sqd_events_pending(struct io_sq_data *sqd) -{ - return READ_ONCE(sqd->state); -} - -static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) -{ - unsigned int to_submit; - int ret = 0; - - to_submit = io_sqring_entries(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { - const struct cred *creds = NULL; - - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - - mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, true); - - /* - * Don't submit if refs are dying, good for io_uring_register(), - * but also it is relied upon by io_ring_exit_work() - */ - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && - !(ctx->flags & IORING_SETUP_R_DISABLED)) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - if (creds) - revert_creds(creds); - } - - return ret; -} - -static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - unsigned sq_thread_idle = 0; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); - sqd->sq_thread_idle = sq_thread_idle; -} - -static bool io_sqd_handle_event(struct io_sq_data *sqd) -{ - bool did_sig = false; - struct ksignal ksig; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - mutex_unlock(&sqd->lock); - if (signal_pending(current)) - did_sig = get_signal(&ksig); - cond_resched(); - mutex_lock(&sqd->lock); - } - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static int io_sq_thread(void *data) -{ - struct io_sq_data *sqd = data; - struct io_ring_ctx *ctx; - unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; - DEFINE_WAIT(wait); - - snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); - set_task_comm(current, buf); - - if (sqd->sq_cpu != -1) - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else - set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; - - audit_alloc_kernel(current); - - mutex_lock(&sqd->lock); - while (1) { - bool cap_entries, sqt_spin = false; - - if (io_sqd_events_pending(sqd) || signal_pending(current)) { - if 
(io_sqd_handle_event(sqd)) - break; - timeout = jiffies + sqd->sq_thread_idle; - } - - cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); - - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) - sqt_spin = true; - } - if (io_run_task_work()) - sqt_spin = true; - - if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); - if (sqt_spin) - timeout = jiffies + sqd->sq_thread_idle; - continue; - } - - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { - bool needs_sched = true; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - atomic_or(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - - /* - * Ensure the store of the wakeup flag is not - * reordered with the load of the SQ tail - */ - smp_mb__after_atomic(); - - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched) { - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); - } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_andnot(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - } - - finish_wait(&sqd->wait, &wait); - timeout = jiffies + sqd->sq_thread_idle; - } - - io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); - io_run_task_work(); - mutex_unlock(&sqd->lock); - - audit_free(current); - - complete(&sqd->exited); - do_exit(0); -} - struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -5934,131 +5713,6 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - /* - * Do the dance but not conditional clear_bit() because it'd race with - * other threads incrementing park_pending and setting the bit. 
- */ - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (atomic_dec_return(&sqd->park_pending)) - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - atomic_inc(&sqd->park_pending); - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); -} - -static void io_sq_thread_stop(struct io_sq_data *sqd) -{ - WARN_ON_ONCE(sqd->thread == current); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); -} - -static void io_put_sq_data(struct io_sq_data *sqd) -{ - if (refcount_dec_and_test(&sqd->refs)) { - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); - - io_sq_thread_stop(sqd); - kfree(sqd); - } -} - -static void io_sq_thread_finish(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx_attach; - struct io_sq_data *sqd; - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return ERR_PTR(-ENXIO); - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - ctx_attach = f.file->private_data; - sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); - return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); - return ERR_PTR(-EPERM); - } - - refcount_inc(&sqd->refs); - fdput(f); - return sqd; -} - -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, - bool *attached) -{ - struct io_sq_data *sqd; - - *attached = false; - if (p->flags & IORING_SETUP_ATTACH_WQ) { - sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) { - *attached = true; - return sqd; - } - /* fall through for EPERM case, setup new sqd/task */ - if (PTR_ERR(sqd) != -EPERM) - return sqd; - } - - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); - if (!sqd) - return ERR_PTR(-ENOMEM); - - atomic_set(&sqd->park_pending, 0); - refcount_set(&sqd->refs, 1); - INIT_LIST_HEAD(&sqd->ctx_list); - mutex_init(&sqd->lock); - init_waitqueue_head(&sqd->wait); - init_completion(&sqd->exited); - return sqd; -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -6495,8 +6149,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static __cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -6554,96 +6208,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - int ret; - - /* Retain compatibility with failing for an invalid attach attempt */ - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == - IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return -ENXIO; - if (f.file->f_op != 
&io_uring_fops) { - fdput(f); - return -EINVAL; - } - fdput(f); - } - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct task_struct *tsk; - struct io_sq_data *sqd; - bool attached; - - ret = security_uring_sqpoll(); - if (ret) - return ret; - - sqd = io_get_sq_data(p, &attached); - if (IS_ERR(sqd)) { - ret = PTR_ERR(sqd); - goto err; - } - - ctx->sq_creds = get_current_cred(); - ctx->sq_data = sqd; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - /* don't attach to a dying SQPOLL thread, would be racy */ - ret = (attached && !sqd->thread) ? -ENXIO : 0; - io_sq_thread_unpark(sqd); - - if (ret < 0) - goto err; - if (attached) - return 0; - - if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = p->sq_thread_cpu; - - ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; - } else { - sqd->sq_cpu = -1; - } - - sqd->task_pid = current->pid; - sqd->task_tgid = current->tgid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto err_sqpoll; - } - - sqd->thread = tsk; - ret = io_uring_alloc_task_context(tsk, ctx); - wake_up_new_task(tsk); - if (ret) - goto err; - } else if (p->flags & IORING_SETUP_SQ_AFF) { - /* Can't have SQ_AFF without SQPOLL */ - ret = -EINVAL; - goto err; - } - - return 0; -err_sqpoll: - complete(&ctx->sq_data->exited); -err: - io_sq_thread_finish(ctx); - return ret; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7755,8 +7319,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
 */
-static __cold void io_uring_cancel_generic(bool cancel_all,
-					   struct io_sq_data *sqd)
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx;
@@ -8034,24 +7597,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 
 #endif /* !CONFIG_MMU */
 
-static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
-{
-	DEFINE_WAIT(wait);
-
-	do {
-		if (!io_sqring_full(ctx))
-			break;
-		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
-
-		if (!io_sqring_full(ctx))
-			break;
-		schedule();
-	} while (!signal_pending(current));
-
-	finish_wait(&ctx->sqo_sq_wait, &wait);
-	return 0;
-}
-
 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
 {
 	if (flags & IORING_ENTER_EXT_ARG) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e285e12ccbdbb1..1da8e66507a350 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -64,6 +64,34 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 }
 
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+{
+	struct io_rings *r = ctx->rings;
+
+	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
+}
+
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+
+	/* make sure SQ entry isn't read before tail */
+	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
+static inline bool io_run_task_work(void)
+{
+	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
+		__set_current_state(TASK_RUNNING);
+		clear_notify_signal();
+		if (task_work_pending(current))
+			task_work_run();
+		return true;
+	}
+
+	return false;
+}
+
 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
 void io_req_complete_post(struct io_kiocb *req);
 void __io_req_complete_post(struct io_kiocb *req);
@@ -101,6 +129,12 @@ void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_complete(struct io_kiocb *req, bool *locked);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
+int io_uring_alloc_task_context(struct task_struct *task,
+				struct io_ring_ctx *ctx);
+
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
new file mode 100644
index 00000000000000..149d5c976f1467
--- /dev/null
+++ b/io_uring/sqpoll.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Contains the core associated with submission side polling of the SQ
+ * ring, offloading submissions from the application to a kernel thread.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <linux/security.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "sqpoll.h"
+
+#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+
+enum {
+	IO_SQ_THREAD_SHOULD_STOP = 0,
+	IO_SQ_THREAD_SHOULD_PARK,
+};
+
+void io_sq_thread_unpark(struct io_sq_data *sqd)
+	__releases(&sqd->lock)
+{
+	WARN_ON_ONCE(sqd->thread == current);
+
+	/*
+	 * Do the dance but not conditional clear_bit() because it'd race with
+	 * other threads incrementing park_pending and setting the bit.
+ */ + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_unlock(&sqd->lock); +} + +void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + WARN_ON_ONCE(sqd->thread == current); + + atomic_inc(&sqd->park_pending); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); +} + +void io_sq_thread_stop(struct io_sq_data *sqd) +{ + WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); + + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); + mutex_unlock(&sqd->lock); + wait_for_completion(&sqd->exited); +} + +void io_put_sq_data(struct io_sq_data *sqd) +{ + if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); + sqd->sq_thread_idle = sq_thread_idle; +} + +void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (!io_is_uring_fops(f.file)) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return ERR_PTR(-EINVAL); + } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) +{ + struct io_sq_data *sqd; + + *attached = false; + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) { + *attached = true; + return sqd; + } + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sqd->park_pending, 0); + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + init_completion(&sqd->exited); + return sqd; +} + +static inline bool io_sqd_events_pending(struct io_sq_data *sqd) +{ + return READ_ONCE(sqd->state); +} + +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) +{ + unsigned int to_submit; + int ret = 0; + + to_submit = io_sqring_entries(ctx); + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; + + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); + + mutex_lock(&ctx->uring_lock); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, 
true); + + /* + * Don't submit if refs are dying, good for io_uring_register(), + * but also it is relied upon by io_ring_exit_work() + */ + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + if (creds) + revert_creds(creds); + } + + return ret; +} + +static bool io_sqd_handle_event(struct io_sq_data *sqd) +{ + bool did_sig = false; + struct ksignal ksig; + + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + mutex_unlock(&sqd->lock); + if (signal_pending(current)) + did_sig = get_signal(&ksig); + cond_resched(); + mutex_lock(&sqd->lock); + } + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static int io_sq_thread(void *data) +{ + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; + DEFINE_WAIT(wait); + + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; + + audit_alloc_kernel(current); + + mutex_lock(&sqd->lock); + while (1) { + bool cap_entries, sqt_spin = false; + + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) + break; + timeout = jiffies + sqd->sq_thread_idle; + } + + cap_entries = !list_is_singular(&sqd->ctx_list); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + int ret = __io_sq_thread(ctx, cap_entries); + + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + sqt_spin = true; + } + if (io_run_task_work()) + sqt_spin = true; + + if (sqt_spin || !time_after(jiffies, timeout)) { + cond_resched(); + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + bool needs_sched = true; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !wq_list_empty(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + + /* + * Ensure the store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb__after_atomic(); + + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { + mutex_unlock(&sqd->lock); + schedule(); + mutex_lock(&sqd->lock); + } + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; + } + + io_uring_cancel_generic(true, sqd); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); + io_run_task_work(); + mutex_unlock(&sqd->lock); + + audit_free(current); + + complete(&sqd->exited); + do_exit(0); +} + +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); + return 0; +} + 
+__cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + int ret; + + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (!io_is_uring_fops(f.file)) { + fdput(f); + return -EINVAL; + } + fdput(f); + } + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct task_struct *tsk; + struct io_sq_data *sqd; + bool attached; + + ret = security_uring_sqpoll(); + if (ret) + return ret; + + sqd = io_get_sq_data(p, &attached); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_creds = get_current_cred(); + ctx->sq_data = sqd; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + ret = (attached && !sqd->thread) ? -ENXIO : 0; + io_sq_thread_unpark(sqd); + + if (ret < 0) + goto err; + if (attached) + return 0; + + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu = p->sq_thread_cpu; + + ret = -EINVAL; + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + goto err_sqpoll; + sqd->sq_cpu = cpu; + } else { + sqd->sq_cpu = -1; + } + + sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + goto err_sqpoll; + } + + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); + wake_up_new_task(tsk); + if (ret) + goto err; + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + + return 0; +err_sqpoll: + complete(&ctx->sq_data->exited); +err: + io_sq_thread_finish(ctx); + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h new file mode 100644 index 00000000000000..0c3fbcd1f583f6 --- /dev/null +++ b/io_uring/sqpoll.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_sq_data { + refcount_t refs; + atomic_t park_pending; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + + struct task_struct *thread; + struct wait_queue_head wait; + + unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + pid_t task_tgid; + + unsigned long state; + struct completion exited; +}; + +int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); +void io_sq_thread_finish(struct io_ring_ctx *ctx); +void io_sq_thread_stop(struct io_sq_data *sqd); +void io_sq_thread_park(struct io_sq_data *sqd); +void io_sq_thread_unpark(struct io_sq_data *sqd); +void io_put_sq_data(struct io_sq_data *sqd); +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); From e5550a1447bf8d82f32b58e2ba54792a6985d080 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:28:04 -0600 Subject: [PATCH 044/172] io_uring: use io_is_uring_fops() consistently Convert the last spots that check for io_uring_fops to use the provided helper instead. 
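
The conversion is mechanical; as an illustration (mirroring one of the
hunks below), a check such as

	if (unlikely(f.file->f_op != &io_uring_fops))
		goto out_fput;

becomes

	if (unlikely(!io_is_uring_fops(f.file)))
		goto out_fput;

which also clears the way for dropping the io_uring_fops forward
declaration later on.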
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 17c555aa03bc9c..687900b84ce08c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2309,7 +2309,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) if (S_ISREG(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - file->f_op != &io_uring_fops) + !io_is_uring_fops(file)) return true; return false; } @@ -4857,7 +4857,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ - if (file && file->f_op == &io_uring_fops) + if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } @@ -5949,7 +5949,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); goto fail; } @@ -5996,7 +5996,7 @@ int io_install_fixed_file(struct io_kiocb *req, struct file *file, struct io_fixed_file *file_slot; int ret; - if (file->f_op == &io_uring_fops) + if (io_is_uring_fops(file)) return -EBADF; if (!ctx->file_data) return -ENXIO; @@ -6096,7 +6096,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, * still no point in allowing a ring fd as it doesn't * support regular read/write anyway. */ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; @@ -7416,7 +7416,7 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, file = fget(fd); if (!file) { return -EBADF; - } else if (file->f_op != &io_uring_fops) { + } else if (!io_is_uring_fops(file)) { fput(file); return -EOPNOTSUPP; } @@ -7677,7 +7677,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) + if (unlikely(!io_is_uring_fops(f.file))) goto out_fput; ret = -ENXIO; @@ -8852,7 +8852,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return -EBADF; ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) + if (!io_is_uring_fops(f.file)) goto out_fput; ctx = f.file->private_data; From a4ad4f748ea96202451f88b6385d7283d0f2e513 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:40:19 -0600 Subject: [PATCH 045/172] io_uring: move fdinfo helpers to its own file This also means moving a bit more of the fixed file handling to the filetable side, which makes sense separately too. 
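
For example (taken from the call sites converted below), a fixed file
lookup that used to take the ring context:

	struct file *file = io_file_from_index(ctx, i);

now goes through the file table itself:

	struct file *file = io_file_from_index(&ctx->file_table, i);

with io_file_from_index() and io_fixed_file_set() becoming inline
helpers in filetable.h.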
Signed-off-by: Jens Axboe
---
 io_uring/Makefile         |   2 +-
 io_uring/fdinfo.c         | 191 ++++++++++++++++++++++++++++++++++
 io_uring/fdinfo.h         |   3 +
 io_uring/filetable.h      |  19 ++++
 io_uring/io_uring.c       | 210 +-------------------------------------
 io_uring/io_uring_types.h |  13 +++
 6 files changed, 230 insertions(+), 208 deletions(-)
 create mode 100644 io_uring/fdinfo.c
 create mode 100644 io_uring/fdinfo.h

diff --git a/io_uring/Makefile b/io_uring/Makefile
index c59a9ca74262a4..ed0bf42db4ae4c 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					sync.o advise.o filetable.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
-					sqpoll.o
+					sqpoll.o fdinfo.o
 obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
new file mode 100644
index 00000000000000..fcedde4b4b1e1f
--- /dev/null
+++ b/io_uring/fdinfo.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "sqpoll.h"
+#include "fdinfo.h"
+
+#ifdef CONFIG_PROC_FS
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
+		const struct cred *cred)
+{
+	struct user_namespace *uns = seq_user_ns(m);
+	struct group_info *gi;
+	kernel_cap_t cap;
+	unsigned __capi;
+	int g;
+
+	seq_printf(m, "%5d\n", id);
+	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
+	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
+	seq_puts(m, "\n\tGroups:\t");
+	gi = cred->group_info;
+	for (g = 0; g < gi->ngroups; g++) {
+		seq_put_decimal_ull(m, g ? " " : "",
+					from_kgid_munged(uns, gi->gid[g]));
+	}
+	seq_puts(m, "\n\tCapEff:\t");
+	cap = cred->cap_effective;
+	CAP_FOR_EACH_U32(__capi)
+		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+					  struct seq_file *m)
+{
+	struct io_sq_data *sq = NULL;
+	struct io_overflow_cqe *ocqe;
+	struct io_rings *r = ctx->rings;
+	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+	unsigned int sq_head = READ_ONCE(r->sq.head);
+	unsigned int sq_tail = READ_ONCE(r->sq.tail);
+	unsigned int cq_head = READ_ONCE(r->cq.head);
+	unsigned int cq_tail = READ_ONCE(r->cq.tail);
+	unsigned int cq_shift = 0;
+	unsigned int sq_entries, cq_entries;
+	bool has_lock;
+	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	unsigned int i;
+
+	if (is_cqe32)
+		cq_shift = 1;
+
+	/*
+	 * we may get imprecise sqe and cqe info if uring is actively running
+	 * since we get cached_sq_head and cached_cq_tail without uring_lock
+	 * and sq_tail and cq_head are changed by userspace. But it's ok since
+	 * we usually use these info when it is stuck.
+ */ + seq_printf(m, "SqMask:\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; + + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } + } + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } + + seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { + struct file *f = io_file_from_index(&ctx->file_table, i); + + if (f) + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + else + seq_printf(m, "%5u: \n", i); + } + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *buf = ctx->user_bufs[i]; + unsigned int len = buf->ubuf_end - buf->ubuf; + + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + } + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + + seq_printf(m, "Personalities:\n"); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); + } + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + task_work_pending(req->task)); + } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + + spin_unlock(&ctx->completion_lock); +} + +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct io_ring_ctx *ctx = f->private_data; + + if (percpu_ref_tryget(&ctx->refs)) { + __io_uring_show_fdinfo(ctx, m); + percpu_ref_put(&ctx->refs); + } +} +#endif diff --git a/io_uring/fdinfo.h b/io_uring/fdinfo.h new file mode 100644 index 00000000000000..6fde48c450e300 --- /dev/null +++ b/io_uring/fdinfo.h @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 + +void io_uring_show_fdinfo(struct seq_file *m, struct file *f); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fe1ec581958dda..6e1675f406b729 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -36,6 +36,8 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); int io_file_bitmap_get(struct io_ring_ctx *ctx); +unsigned int io_file_get_flags(struct file *file); + static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { __clear_bit(bit, table->bitmap); @@ -55,4 +57,21 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +static inline struct file *io_file_from_index(struct io_file_table *table, + int index) +{ + struct io_fixed_file *slot = io_fixed_file_slot(table, index); + + return (struct file *) (slot->file_ptr & FFS_MASK); +} + +static inline void io_fixed_file_set(struct io_fixed_file *file_slot, + struct file *file) +{ + unsigned long file_ptr = (unsigned long) file; + + file_ptr |= io_file_get_flags(file); + file_slot->file_ptr = file_ptr; +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 687900b84ce08c..95c2083a0da8b3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "io_uring.h" #include "refs.h" #include "sqpoll.h" +#include "fdinfo.h" #include "xattr.h" #include "nop.h" @@ -137,21 +138,8 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - 
struct io_ring_ctx; -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -2325,7 +2313,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static unsigned int io_file_get_flags(struct file *file) +unsigned int io_file_get_flags(struct file *file) { umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; @@ -4810,22 +4798,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) -{ - struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); -} - -static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) -{ - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; -} - inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { @@ -5667,7 +5639,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) int i; for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(ctx, i); + struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; @@ -7777,182 +7749,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return ret; } -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - unsigned __capi; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - CAP_FOR_EACH_U32(__capi) - seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); - seq_putc(m, '\n'); - return 0; -} - -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) -{ - struct io_sq_data *sq = NULL; - struct io_overflow_cqe *ocqe; - struct io_rings *r = ctx->rings; - unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; - unsigned int sq_head = READ_ONCE(r->sq.head); - unsigned int sq_tail = READ_ONCE(r->sq.tail); - unsigned int cq_head = READ_ONCE(r->cq.head); - unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; - unsigned int sq_entries, cq_entries; - bool has_lock; - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - unsigned int i; - - if (is_cqe32) - cq_shift = 1; - - /* - * we may get imprecise sqe and cqe info if uring is actively running - * since we get cached_sq_head and cached_cq_tail without uring_lock - * and sq_tail and cq_head are changed by userspace. But it's ok since - * we usually use these info when it is stuck. - */ - seq_printf(m, "SqMask:\t0x%x\n", sq_mask); - seq_printf(m, "SqHead:\t%u\n", sq_head); - seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); - seq_printf(m, "CqMask:\t0x%x\n", cq_mask); - seq_printf(m, "CqHead:\t%u\n", cq_head); - seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); - for (i = 0; i < sq_entries; i++) { - unsigned int entry = i + sq_head; - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); - struct io_uring_sqe *sqe; - - if (sq_idx > sq_mask) - continue; - sqe = &ctx->sq_sqes[sq_idx]; - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", - sq_idx, sqe->opcode, sqe->fd, sqe->flags, - sqe->user_data); - } - seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - - if (!is_cqe32) { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags); - } else { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " - "extra1:%llu, extra2:%llu\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); - } - } - - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; - } - - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(ctx, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: \n", i); - } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - - seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; - struct io_kiocb *req; - - hlist_for_each_entry(req, list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - } - - seq_puts(m, "CqOverflowList:\n"); - list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; - - seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", - cqe->user_data, cqe->res, cqe->flags); - - } - - spin_unlock(&ctx->completion_lock); -} - -static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} -#endif - static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 349524907b6bfc..147e1e597530a9 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -498,4 +498,17 @@ struct io_cancel_data { int seq; }; +struct io_overflow_cqe { + struct list_head list; + struct io_uring_cqe cqe; +}; + +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + #endif From c9f06aa7de153cdbe424e7e38be39f2272cf78bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:01:04 -0600 Subject: [PATCH 046/172] io_uring: move io_uring_task (tctx) helpers into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 368 +------------------------------------------- io_uring/io_uring.h | 4 + io_uring/tctx.c | 332 +++++++++++++++++++++++++++++++++++++++ io_uring/tctx.h | 55 +++++++ 5 files changed, 396 insertions(+), 365 deletions(-) create mode 100644 io_uring/tctx.c create mode 100644 io_uring/tctx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index ed0bf42db4ae4c..2db085cdedad84 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o + sqpoll.o fdinfo.o tctx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 95c2083a0da8b3..d7336e6c9f2349 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "tctx.h" #include "sqpoll.h" 
#include "fdinfo.h" @@ -208,30 +209,6 @@ struct io_buffer { #define BGID_ARRAY 64 -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - struct file **registered_rings; - bool task_running; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -311,12 +288,6 @@ enum { IO_CHECK_CQ_DROPPED_BIT, }; -struct io_tctx_node { - struct list_head ctx_node; - struct task_struct *task; - struct io_ring_ctx *ctx; -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -361,7 +332,6 @@ static const struct io_op_def io_op_defs[]; #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -1677,7 +1647,7 @@ static void handle_tw_list(struct io_wq_work_node *node, } while (node); } -static void tctx_task_work(struct callback_head *cb) +void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; struct io_ring_ctx *ctx = NULL; @@ -4725,7 +4695,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4733,7 +4703,7 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static void io_wq_submit_work(struct io_wq_work *work) +void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -6089,97 +6059,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? 
done : err; } -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - struct io_wq_hash *hash; - struct io_wq_data data; - unsigned int concurrency; - - mutex_lock(&ctx->uring_lock); - hash = ctx->hash_map; - if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) { - mutex_unlock(&ctx->uring_lock); - return ERR_PTR(-ENOMEM); - } - refcount_set(&hash->refs, 1); - init_waitqueue_head(&hash->wait); - ctx->hash_map = hash; - } - mutex_unlock(&ctx->uring_lock); - - data.hash = hash; - data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - return io_wq_create(concurrency, &data); -} - -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx; - int ret; - - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); - if (unlikely(!tctx)) - return -ENOMEM; - - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); - if (unlikely(ret)) { - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - tctx->io_wq = io_init_wq_offload(ctx, task); - if (IS_ERR(tctx->io_wq)) { - ret = PTR_ERR(tctx->io_wq); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - xa_init(&tctx->xa); - init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - init_task_work(&tctx->task_work, tctx_task_work); - return 0; -} - -void __io_uring_free(struct task_struct *tsk) -{ - struct io_uring_task *tctx = tsk->io_uring; - - WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); - - kfree(tctx->registered_rings); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7179,107 +7058,6 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - int ret; - - if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; - - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; - - ret = io_wq_max_workers(tctx->io_wq, limits); - if (ret) - return ret; - } - } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); - } - tctx->last = ctx; - return 0; -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. 
- */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (likely(tctx && tctx->last == ctx)) - return 0; - return __io_uring_add_tctx_node(ctx); -} - -/* - * Remove this io_uring_file -> task mapping. - */ -static __cold void io_uring_del_tctx_node(unsigned long index) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - - if (!tctx) - return; - node = xa_erase(&tctx->xa, index); - if (!node) - return; - - WARN_ON_ONCE(current != node->task); - WARN_ON_ONCE(list_empty(&node->ctx_node)); - - mutex_lock(&node->ctx->uring_lock); - list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); - - if (tctx->last == node->ctx) - tctx->last = NULL; - kfree(node); -} - -static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) -{ - struct io_wq *wq = tctx->io_wq; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - io_uring_del_tctx_node(index); - cond_resched(); - } - if (wq) { - /* - * Must be after io_uring_del_tctx_node() (removes nodes under - * uring_lock) to avoid race with io_uring_try_cancel_iowq(). - */ - io_wq_put_and_exit(wq); - tctx->io_wq = NULL; - } -} - static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) @@ -7361,144 +7139,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -void io_uring_unreg_ringfd(void) -{ - struct io_uring_task *tctx = current->io_uring; - int i; - - for (i = 0; i < IO_RINGFD_REG_MAX; i++) { - if (tctx->registered_rings[i]) { - fput(tctx->registered_rings[i]); - tctx->registered_rings[i] = NULL; - } - } -} - -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, - int start, int end) -{ - struct file *file; - int offset; - - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; - } - - return -EBUSY; -} - -/* - * Register a ring fd to avoid fdget/fdput for each io_uring_enter() - * invocation. User passes in an array of struct io_uring_rsrc_update - * with ->data set to the ring_fd, and ->offset given for the desired - * index. If no index is desired, application may set ->offset == -1U - * and we'll find an available index. Returns number of entries - * successfully processed, or < 0 on error if none were processed. 
- */ -static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_rsrc_update reg; - struct io_uring_task *tctx; - int ret, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - - mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); - mutex_lock(&ctx->uring_lock); - if (ret) - return ret; - - tctx = current->io_uring; - for (i = 0; i < nr_args; i++) { - int start, end; - - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - - if (reg.resv) { - ret = -EINVAL; - break; - } - - if (reg.offset == -1U) { - start = 0; - end = IO_RINGFD_REG_MAX; - } else { - if (reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - start = reg.offset; - end = start + 1; - } - - ret = io_ring_add_registered_fd(tctx, reg.data, start, end); - if (ret < 0) - break; - - reg.offset = ret; - if (copy_to_user(&arg[i], ®, sizeof(reg))) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - ret = -EFAULT; - break; - } - } - - return i ? i : ret; -} - -static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_task *tctx = current->io_uring; - struct io_uring_rsrc_update reg; - int ret = 0, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - if (!tctx) - return 0; - - for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - - reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[reg.offset]) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - } - } - - return i ? 
i : ret;
-}
-
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 1da8e66507a350..60678e88a9b92c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -128,6 +128,7 @@ void io_req_task_work_add(struct io_kiocb *req);
 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_complete(struct io_kiocb *req, bool *locked);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
+void tctx_task_work(struct callback_head *cb);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 int io_uring_alloc_task_context(struct task_struct *task,
@@ -136,6 +137,9 @@ int io_uring_alloc_task_context(struct task_struct *task,
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+void io_wq_submit_work(struct io_wq_work *work);
+
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
new file mode 100644
index 00000000000000..3f7e9feb6ca2bc
--- /dev/null
+++ b/io_uring/tctx.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "tctx.h"
+
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
+					struct task_struct *task)
+{
+	struct io_wq_hash *hash;
+	struct io_wq_data data;
+	unsigned int concurrency;
+
+	mutex_lock(&ctx->uring_lock);
+	hash = ctx->hash_map;
+	if (!hash) {
+		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+		if (!hash) {
+			mutex_unlock(&ctx->uring_lock);
+			return ERR_PTR(-ENOMEM);
+		}
+		refcount_set(&hash->refs, 1);
+		init_waitqueue_head(&hash->wait);
+		ctx->hash_map = hash;
+	}
+	mutex_unlock(&ctx->uring_lock);
+
+	data.hash = hash;
+	data.task = task;
+	data.free_work = io_wq_free_work;
+	data.do_work = io_wq_submit_work;
+
+	/* Do QD, or 4 * CPUS, whatever is smallest */
+	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+
+	return io_wq_create(concurrency, &data);
+}
+
+void __io_uring_free(struct task_struct *tsk)
+{
+	struct io_uring_task *tctx = tsk->io_uring;
+
+	WARN_ON_ONCE(!xa_empty(&tctx->xa));
+	WARN_ON_ONCE(tctx->io_wq);
+	WARN_ON_ONCE(tctx->cached_refs);
+
+	kfree(tctx->registered_rings);
+	percpu_counter_destroy(&tctx->inflight);
+	kfree(tctx);
+	tsk->io_uring = NULL;
+}
+
+__cold int io_uring_alloc_task_context(struct task_struct *task,
+				       struct io_ring_ctx *ctx)
+{
+	struct io_uring_task *tctx;
+	int ret;
+
+	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
+	if (unlikely(!tctx))
+		return -ENOMEM;
+
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
+	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
+	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
+		kfree(tctx);
+		return ret;
+	}
+
+	tctx->io_wq = io_init_wq_offload(ctx, task);
+	if (IS_ERR(tctx->io_wq)) {
+		ret = PTR_ERR(tctx->io_wq);
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
+		kfree(tctx);
+		return ret;
+	}
+
+	xa_init(&tctx->xa);
+	init_waitqueue_head(&tctx->wait);
+	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->inflight_tracked, 0);
+	task->io_uring = tctx;
+
spin_lock_init(&tctx->task_lock); + INIT_WQ_LIST(&tctx->task_list); + INIT_WQ_LIST(&tctx->prio_task_list); + init_task_work(&tctx->task_work, tctx_task_work); + return 0; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + int ret; + + if (unlikely(!tctx)) { + ret = io_uring_alloc_task_context(current, ctx); + if (unlikely(ret)) + return ret; + + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } + } + if (!xa_load(&tctx->xa, (unsigned long)ctx)) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); + } + tctx->last = ctx; + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +__cold void io_uring_del_tctx_node(unsigned long index) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + + if (!tctx) + return; + node = xa_erase(&tctx->xa, index); + if (!node) + return; + + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->ctx) + tctx->last = NULL; + kfree(node); +} + +__cold void io_uring_clean_tctx(struct io_uring_task *tctx) +{ + struct io_wq *wq = tctx->io_wq; + struct io_tctx_node *node; + unsigned long index; + + xa_for_each(&tctx->xa, index, node) { + io_uring_del_tctx_node(index); + cond_resched(); + } + if (wq) { + /* + * Must be after io_uring_del_tctx_node() (removes nodes under + * uring_lock) to avoid race with io_uring_try_cancel_iowq(). + */ + io_wq_put_and_exit(wq); + tctx->io_wq = NULL; + } +} + +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. 
+ */ +int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.resv) { + ret = -EINVAL; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? i : ret; +} + +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} diff --git a/io_uring/tctx.h b/io_uring/tctx.h new file mode 100644 index 00000000000000..f4964e40d07e0d --- /dev/null +++ b/io_uring/tctx.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + struct xarray xa; + struct wait_queue_head wait; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct percpu_counter inflight; + atomic_t inflight_tracked; + atomic_t in_idle; + + spinlock_t task_lock; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + struct file **registered_rings; + bool task_running; +}; + +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct io_ring_ctx *ctx; +}; + +int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); +void io_uring_del_tctx_node(unsigned long index); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +void io_uring_clean_tctx(struct io_uring_task *tctx); + +void io_uring_unreg_ringfd(void); +int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args); +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args); + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. 
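+ * The inline fast path below (io_uring_add_tctx_node()) only compares + * tctx->last, so back-to-back submits to the same ring skip the xarray lookup.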
+ */ +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (likely(tctx && tctx->last == ctx)) + return 0; + return __io_uring_add_tctx_node(ctx); +} From 92ac8beaea1f4931f932da3dcc850547621107fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:48:35 -0600 Subject: [PATCH 047/172] io_uring: include and forward-declaration sanitation Remove some dead headers we no longer need, and get rid of the io_ring_ctx and io_uring_fops forward declarations. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d7336e6c9f2349..c1229704adfdf5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -70,11 +70,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -86,7 +84,6 @@ #include -#include "../fs/internal.h" #include "io-wq.h" #include "io_uring_types.h" @@ -139,8 +136,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_ring_ctx; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -352,8 +347,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; - const char *io_uring_get_opcode(u8 opcode) { switch ((enum io_uring_op)opcode) { @@ -457,11 +450,6 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } -bool io_is_uring_fops(struct file *file) -{ - return file->f_op == &io_uring_fops; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -7402,6 +7390,11 @@ static const struct file_operations io_uring_fops = { #endif }; +bool io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { From cfd22e6b3319adc7c2a8a092e19ac16575dabc86 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:57:03 -0600 Subject: [PATCH 048/172] io_uring: add opcode name to io_op_defs This kills the last per-op switch. 
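To illustrate the pattern outside the kernel (a standalone sketch in plain C with made-up ops, not the real opcode table): a designated-initializer array ties each opcode's name to its handler, and a single bounds check replaces the switch's default case.

	#include <stdio.h>

	enum op { OP_NOP, OP_READ, OP_WRITE, OP_LAST };

	struct op_def {
		const char	*name;
		int		(*issue)(int arg);	/* per-op handler */
	};

	static int do_nop(int arg)   { (void)arg; return 0; }
	static int do_read(int arg)  { return arg; }
	static int do_write(int arg) { return arg * 2; }

	/* one entry per opcode; array index == opcode value */
	static const struct op_def op_defs[] = {
		[OP_NOP]   = { .name = "NOP",   .issue = do_nop },
		[OP_READ]  = { .name = "READ",  .issue = do_read },
		[OP_WRITE] = { .name = "WRITE", .issue = do_write },
	};

	/* name lookup becomes a bounds check plus an array access */
	static const char *op_name(unsigned int op)
	{
		return op < OP_LAST ? op_defs[op].name : "INVALID";
	}

	int main(void)
	{
		printf("%s -> %d\n", op_name(OP_READ), op_defs[OP_READ].issue(21));
		printf("%s\n", op_name(0xff));	/* out of range: INVALID */
		return 0;
	}

The WARN_ON_ONCE(!io_op_defs[i].name) added to io_uring_init() further down serves the same purpose at boot: it catches table entries that forgot to set .name.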
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 150 +++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c1229704adfdf5..3f006907c8c5f4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,6 +315,8 @@ struct io_op_def { /* size of async data needed, if any */ unsigned short async_size; + const char *name; + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); @@ -349,104 +351,8 @@ static struct kmem_cache *req_cachep; const char *io_uring_get_opcode(u8 opcode) { - switch ((enum io_uring_op)opcode) { - case IORING_OP_NOP: - return "NOP"; - case IORING_OP_READV: - return "READV"; - case IORING_OP_WRITEV: - return "WRITEV"; - case IORING_OP_FSYNC: - return "FSYNC"; - case IORING_OP_READ_FIXED: - return "READ_FIXED"; - case IORING_OP_WRITE_FIXED: - return "WRITE_FIXED"; - case IORING_OP_POLL_ADD: - return "POLL_ADD"; - case IORING_OP_POLL_REMOVE: - return "POLL_REMOVE"; - case IORING_OP_SYNC_FILE_RANGE: - return "SYNC_FILE_RANGE"; - case IORING_OP_SENDMSG: - return "SENDMSG"; - case IORING_OP_RECVMSG: - return "RECVMSG"; - case IORING_OP_TIMEOUT: - return "TIMEOUT"; - case IORING_OP_TIMEOUT_REMOVE: - return "TIMEOUT_REMOVE"; - case IORING_OP_ACCEPT: - return "ACCEPT"; - case IORING_OP_ASYNC_CANCEL: - return "ASYNC_CANCEL"; - case IORING_OP_LINK_TIMEOUT: - return "LINK_TIMEOUT"; - case IORING_OP_CONNECT: - return "CONNECT"; - case IORING_OP_FALLOCATE: - return "FALLOCATE"; - case IORING_OP_OPENAT: - return "OPENAT"; - case IORING_OP_CLOSE: - return "CLOSE"; - case IORING_OP_FILES_UPDATE: - return "FILES_UPDATE"; - case IORING_OP_STATX: - return "STATX"; - case IORING_OP_READ: - return "READ"; - case IORING_OP_WRITE: - return "WRITE"; - case IORING_OP_FADVISE: - return "FADVISE"; - case IORING_OP_MADVISE: - return "MADVISE"; - case IORING_OP_SEND: - return "SEND"; - case IORING_OP_RECV: - return "RECV"; - case IORING_OP_OPENAT2: - return "OPENAT2"; - case IORING_OP_EPOLL_CTL: - return "EPOLL_CTL"; - case IORING_OP_SPLICE: - return "SPLICE"; - case IORING_OP_PROVIDE_BUFFERS: - return "PROVIDE_BUFFERS"; - case IORING_OP_REMOVE_BUFFERS: - return "REMOVE_BUFFERS"; - case IORING_OP_TEE: - return "TEE"; - case IORING_OP_SHUTDOWN: - return "SHUTDOWN"; - case IORING_OP_RENAMEAT: - return "RENAMEAT"; - case IORING_OP_UNLINKAT: - return "UNLINKAT"; - case IORING_OP_MKDIRAT: - return "MKDIRAT"; - case IORING_OP_SYMLINKAT: - return "SYMLINKAT"; - case IORING_OP_LINKAT: - return "LINKAT"; - case IORING_OP_MSG_RING: - return "MSG_RING"; - case IORING_OP_FSETXATTR: - return "FSETXATTR"; - case IORING_OP_SETXATTR: - return "SETXATTR"; - case IORING_OP_FGETXATTR: - return "FGETXATTR"; - case IORING_OP_GETXATTR: - return "GETXATTR"; - case IORING_OP_SOCKET: - return "SOCKET"; - case IORING_OP_URING_CMD: - return "URING_CMD"; - case IORING_OP_LAST: - return "INVALID"; - } + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; return "INVALID"; } @@ -8307,6 +8213,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, + .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -8320,6 +8227,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READV", .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, @@ -8335,6 +8243,7 @@ static const struct io_op_def 
io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, @@ -8343,6 +8252,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, + .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -8355,6 +8265,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, }, @@ -8368,6 +8279,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, }, @@ -8375,17 +8287,20 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, + .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, + .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -8394,6 +8309,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, + .name = "SENDMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, @@ -8410,6 +8326,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, + .name = "RECVMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, @@ -8423,12 +8340,14 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, + .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -8438,6 +8357,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ + .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -8447,12 +8367,14 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, + .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -8460,6 +8382,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .name = "CONNECT", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, @@ -8471,26 +8394,31 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { + .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { + .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, + .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, 
+ .name = "STATX", .prep = io_statx_prep, .issue = io_statx, .cleanup = io_statx_cleanup, @@ -8505,6 +8433,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ", .prep = io_prep_rw, .issue = io_read, }, @@ -8518,16 +8447,19 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE", .prep = io_prep_rw, .issue = io_write, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, + .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { + .name = "MADVISE", .prep = io_madvise_prep, .issue = io_madvise, }, @@ -8537,6 +8469,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, + .name = "SEND", #if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, @@ -8551,6 +8484,7 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, + .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, @@ -8559,6 +8493,7 @@ static const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_OPENAT2] = { + .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, .cleanup = io_open_cleanup, @@ -8566,6 +8501,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -8578,18 +8514,21 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -8598,11 +8537,13 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, + .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -8611,26 +8552,31 @@ static const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", .prep = io_renameat_prep, .issue = io_renameat, .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { + .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, .cleanup = io_link_cleanup, @@ -8638,33 +8584,39 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, + .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, + .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { + .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, .cleanup = 
io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, + .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { + .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, + .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -8675,6 +8627,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, + .name = "URING_CMD", .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, @@ -8752,6 +8705,7 @@ static int __init io_uring_init(void) BUG_ON(!io_op_defs[i].prep); if (io_op_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); } req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | From 329061d3e2f9a0082a097e9558bd5497098586c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:31:09 -0600 Subject: [PATCH 049/172] io_uring: move poll handling into its own file Add an io_poll_issue() rather than export the general task_work locking and io_issue_sqe(), and put the io_op_defs definition and structure into a separate header file so that poll can use it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 842 +------------------------------------------- io_uring/io_uring.h | 32 ++ io_uring/opdef.h | 40 +++ io_uring/poll.c | 760 +++++++++++++++++++++++++++++++++++++++ io_uring/poll.h | 30 ++ 6 files changed, 879 insertions(+), 827 deletions(-) create mode 100644 io_uring/opdef.h create mode 100644 io_uring/poll.c create mode 100644 io_uring/poll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2db085cdedad84..eb1b07a9951639 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o + sqpoll.o fdinfo.o tctx.o poll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3f006907c8c5f4..b4ecfa7238555f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -88,6 +88,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" @@ -106,6 +107,7 @@ #include "net.h" #include "msg_ring.h" #include "timeout.h" +#include "poll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -208,22 +210,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions!
See also 'struct kiocb' in <linux/fs.h> */ -struct io_poll { - struct file *file; - struct wait_queue_head *head; - __poll_t events; - struct wait_queue_entry wait; -}; - -struct io_poll_update { - struct file *file; - u64 old_user_data; - u64 new_user_data; - __poll_t events; - bool update_events; - bool update_user_data; -}; - struct io_cancel { struct file *file; u64 addr; @@ -268,11 +254,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct async_poll { - struct io_poll poll; - struct io_poll *double_poll; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, @@ -289,42 +270,6 @@ struct io_defer_entry { u32 seq; }; -struct io_op_def { - /* needs req->file assigned */ - unsigned needs_file : 1; - /* should block plug */ - unsigned plug : 1; - /* hash wq insertion if file is a regular file */ - unsigned hash_reg_file : 1; - /* unbound wq insertion if file is a non-regular file */ - unsigned unbound_nonreg_file : 1; - /* set if opcode supports polled "wait" */ - unsigned pollin : 1; - unsigned pollout : 1; - unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; - /* skip auditing */ - unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* size of async data needed, if any */ - unsigned short async_size; - - const char *name; - - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); - int (*prep_async)(struct io_kiocb *); - void (*cleanup)(struct io_kiocb *); -}; - -static const struct io_op_def io_op_defs[]; - /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -529,32 +474,12 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return xa_load(&ctx->io_bl_xa, bgid); } -static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. - */ - if (req->opcode == IORING_OP_READV) - return; - /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. @@ -599,8 +524,8 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock.
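* (the function may take ctx->timeout_lock itself to check linked timeouts, * so holding that lock here would deadlock)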
*/ -static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) { bool matched; @@ -1310,7 +1235,7 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) io_req_complete_post(req); } -static void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); @@ -1656,7 +1581,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, req->cqe.res); } -static void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ @@ -3437,749 +3362,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int nr_entries; - int error; -}; - -#define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK GENMASK(30, 0) - -/* - * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can - * bump it and acquire ownership. It's disallowed to modify requests while not - * owning it, that prevents from races for enqueueing task_work's and b/w - * arming poll and wakeups. - */ -static inline bool io_poll_get_ownership(struct io_kiocb *req) -{ - return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); -} - -static void io_poll_mark_cancelled(struct io_kiocb *req) -{ - atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); -} - -static struct io_poll *io_poll_get_double(struct io_kiocb *req) -{ - /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ - if (req->opcode == IORING_OP_POLL_ADD) - return req->async_data; - return req->apoll->double_poll; -} - -static struct io_poll *io_poll_get_single(struct io_kiocb *req) -{ - if (req->opcode == IORING_OP_POLL_ADD) - return io_kiocb_to_cmd(req); - return &req->apoll->poll; -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - -static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - -static inline void io_poll_remove_entry(struct io_poll *poll) -{ - struct wait_queue_head *head = smp_load_acquire(&poll->head); - - if (head) { - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } -} - -static void io_poll_remove_entries(struct io_kiocb *req) -{ - /* - * Nothing to do if neither of those flags are set. Avoid dipping - * into the poll/apoll/double cachelines if we can. - */ - if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) - return; - - /* - * While we hold the waitqueue lock and the waitqueue is nonempty, - * wake_up_pollfree() will wait for us. However, taking the waitqueue - * lock in the first place can race with the waitqueue being freed. 
- * - * We solve this as eventpoll does: by taking advantage of the fact that - * all users of wake_up_pollfree() will RCU-delay the actual free. If - * we enter rcu_read_lock() and see that the pointer to the queue is - * non-NULL, we can then lock it without the memory being freed out from - * under us. - * - * Keep holding rcu_read_lock() as long as we hold the queue lock, in - * case the caller deletes the entry from the queue, leaving it empty. - * In that case, only RCU prevents the queue memory from being freed. - */ - rcu_read_lock(); - if (req->flags & REQ_F_SINGLE_POLL) - io_poll_remove_entry(io_poll_get_single(req)); - if (req->flags & REQ_F_DOUBLE_POLL) - io_poll_remove_entry(io_poll_get_double(req)); - rcu_read_unlock(); -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); -/* - * All poll tw should go through this. Checks for poll events, manages - * references, does rewait, etc. - * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. - */ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int v, ret; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - return -ECANCELED; - - do { - v = atomic_read(&req->poll_refs); - - /* tw handler should be the owner, and so have some references */ - if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; - if (v & IO_POLL_CANCEL_FLAG) - return -ECANCELED; - - if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; - } - - if ((unlikely(!req->cqe.res))) - continue; - if (req->apoll_events & EPOLLONESHOT) - return 0; - - /* multishot, just fill a CQE and proceed */ - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & - req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - - io_tw_lock(req->ctx, locked); - if (unlikely(req->task->flags & PF_EXITING)) - return -EFAULT; - ret = io_issue_sqe(req, - IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (ret) - return ret; - - /* - * Release all references, retry if someone tried to restart - * task_work while we were executing it. 
- */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - - return 1; -} - -static void io_poll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - if (!ret) { - struct io_poll *poll = io_kiocb_to_cmd(req); - - req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { - req->cqe.res = ret; - req_set_fail(req); - } - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); - - if (!ret) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, ret); -} - -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) -{ - io_req_set_res(req, mask, 0); - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; - - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); -} - -static void io_poll_cancel_req(struct io_kiocb *req) -{ - io_poll_mark_cancelled(req); - /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); -} - -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) - -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wqe_to_req(wait); - struct io_poll *poll = container_of(wait, struct io_poll, wait); - __poll_t mask = key_to_poll(key); - - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. 
- */ - smp_store_release(&poll->head, NULL); - return 1; - } - - /* for instances that support it check for an event match first */ - if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) - return 0; - - if (io_poll_get_ownership(req)) { - /* optional, saves extra locking for removal in tw handler */ - if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; - if (wqe_is_double(wait)) - req->flags &= ~REQ_F_DOUBLE_POLL; - else - req->flags &= ~REQ_F_SINGLE_POLL; - } - __io_poll_execute(req, mask, poll->events); - } - return 1; -} - -static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, - struct wait_queue_head *head, - struct io_poll **poll_ptr) -{ - struct io_kiocb *req = pt->req; - unsigned long wqe_private = (unsigned long) req; - - /* - * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll - * if this happens. - */ - if (unlikely(pt->nr_entries)) { - struct io_poll *first = poll; - - /* double add on the same waitqueue head, ignore */ - if (first->head == head) - return; - /* already have a 2nd entry, fail a third attempt */ - if (*poll_ptr) { - if ((*poll_ptr)->head == head) - return; - pt->error = -EINVAL; - return; - } - - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; - return; - } - /* mark as double wq entry */ - wqe_private |= 1; - req->flags |= REQ_F_DOUBLE_POLL; - io_init_poll_iocb(poll, first->events, first->wait.func); - *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; - } - - req->flags |= REQ_F_SINGLE_POLL; - pt->nr_entries++; - poll->head = head; - poll->wait.private = (void *) wqe_private; - - if (poll->events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(head, &poll->wait); - else - add_wait_queue(head, &poll->wait); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct io_poll *poll = io_kiocb_to_cmd(pt->req); - - __io_queue_proc(poll, pt, head, - (struct io_poll **) &pt->req->async_data); -} - -static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) -{ - struct io_ring_ctx *ctx = req->ctx; - int v; - - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); - poll->file = req->file; - - req->apoll_events = poll->events; - - ipt->pt._key = mask; - ipt->req = req; - ipt->error = 0; - ipt->nr_entries = 0; - - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). 
- */ - atomic_set(&req->poll_refs, 1); - mask = vfs_poll(req->file, &ipt->pt) & poll->events; - - if (mask && (poll->events & EPOLLONESHOT)) { - io_poll_remove_entries(req); - /* no one else has access to the req, forget about the ref */ - return mask; - } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - - spin_lock(&ctx->completion_lock); - io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); - - if (mask) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } - __io_poll_execute(req, mask, poll->events); - return 0; - } - - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); - return 0; -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -enum { - IO_APOLL_OK, - IO_APOLL_ABORTED, - IO_APOLL_READY -}; - -static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - struct async_poll *apoll; - struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; - int ret; - - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) - return IO_APOLL_ABORTED; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) - mask |= EPOLLONESHOT; - - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; - req->flags |= REQ_F_POLLED; - ipt.pt._qproc = io_async_queue_proc; - - io_kbuf_recycle(req, issue_flags); - - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; - - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); - return IO_APOLL_OK; -} - -/* - * Returns true if we found and killed one or more poll requests - */ -static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - int i; - - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { - hlist_del_init(&req->hash_node); - io_poll_cancel_req(req); - found = true; - } - } - } - spin_unlock(&ctx->completion_lock); - return found; -} - -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct hlist_head *list; - struct io_kiocb *req; - - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (cd->data != req->cqe.user_data) - continue; - if (poll_only && req->opcode != IORING_OP_POLL_ADD) - continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - return req; - } - return NULL; -} - -static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - int i; - - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - return req; - } - } - return NULL; -} - -static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) -{ - if (!io_poll_get_ownership(req)) - return false; - io_poll_remove_entries(req); - hash_del(&req->hash_node); - return true; -} - -static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); - else - req = io_poll_find(ctx, false, cd); - if (!req) - return -ENOENT; - io_poll_cancel_req(req); - return 0; -} - -static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, - unsigned int flags) -{ - u32 events; - - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - if (!(flags & IORING_POLL_ADD_MULTI)) - events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); -} - -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_poll_update *upd = io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | - IORING_POLL_ADD_MULTI)) - return -EINVAL; - /* meaningless without update */ - if (flags == IORING_POLL_ADD_MULTI) - return -EINVAL; - - upd->old_user_data = READ_ONCE(sqe->addr); - upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - upd->update_user_data = flags & 
IORING_POLL_UPDATE_USER_DATA; - - upd->new_user_data = READ_ONCE(sqe->off); - if (!upd->update_user_data && upd->new_user_data) - return -EINVAL; - if (upd->update_events) - upd->events = io_poll_parse_events(sqe, flags); - else if (sqe->poll32_events) - return -EINVAL; - - return 0; -} - -static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_poll *poll = io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->off || sqe->addr) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) - return -EINVAL; - if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) - return -EINVAL; - - io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); - return 0; -} - -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll *poll = io_kiocb_to_cmd(req); - struct io_poll_table ipt; - int ret; - - ipt.pt._qproc = io_poll_queue_proc; - - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll_update *poll_update = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *preq; - int ret2, ret = 0; - bool locked; - - spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; - goto out; - } - spin_unlock(&ctx->completion_lock); - - if (poll_update->update_events || poll_update->update_user_data) { - /* only mask one event flags, keep behavior flags */ - if (poll_update->update_events) { - struct io_poll *poll = io_kiocb_to_cmd(preq); - - poll->events &= ~0xffff; - poll->events |= poll_update->events & 0xffff; - poll->events |= IO_POLL_UNMASK; - } - if (poll_update->update_user_data) - preq->cqe.user_data = poll_update->new_user_data; - - ret2 = io_poll_add(preq, issue_flags); - /* successfully updated, don't complete poll request */ - if (!ret2 || ret2 == -EIOCBQUEUED) - goto out; - } - - req_set_fail(preq); - io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); -out: - if (ret < 0) { - req_set_fail(req); - return ret; - } - /* complete update request, we're done with it */ - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4589,6 +3771,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +int io_poll_issue(struct io_kiocb *req, bool *locked) +{ + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); +} + struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -8209,7 +7399,7 @@ static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) return -ECANCELED; } -static const struct io_op_def io_op_defs[] = { +const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, diff --git a/io_uring/io_uring.h 
b/io_uring/io_uring.h index 60678e88a9b92c..1ceac4ea62bf3b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline bool io_run_task_work(void) return false; } +void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); @@ -109,6 +110,32 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -128,12 +155,14 @@ void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); +void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); @@ -143,6 +172,9 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) diff --git a/io_uring/opdef.h b/io_uring/opdef.h new file mode 100644 index 00000000000000..4578adcdba8a41 --- /dev/null +++ b/io_uring/opdef.h @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_OP_DEF_H +#define IOU_OP_DEF_H + +struct io_op_def { + /* needs req->file assigned */ + unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + unsigned poll_exclusive : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* skip auditing */ + unsigned 
audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* size of async data needed, if any */ + unsigned short async_size; + + const char *name; + + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); +}; + +extern const struct io_op_def io_op_defs[]; +#endif diff --git a/io_uring/poll.c b/io_uring/poll.c new file mode 100644 index 00000000000000..c3e4fcb0a7ba7b --- /dev/null +++ b/io_uring/poll.c @@ -0,0 +1,760 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/hashtable.h> +#include <linux/io_uring.h> + +#include <trace/events/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "opdef.h" +#include "poll.h" + +struct io_poll_update { + struct file *file; + u64 old_user_data; + u64 new_user_data; + __poll_t events; + bool update_events; + bool update_user_data; +}; + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int nr_entries; + int error; +}; + +#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK GENMASK(30, 0) + +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning them; that prevents races when enqueueing task_work and between + * arming poll and wakeups. + */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + +static void io_poll_mark_cancelled(struct io_kiocb *req) +{ + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); +} + +static struct io_poll *io_poll_get_double(struct io_kiocb *req) +{ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return req->async_data; + return req->apoll->double_poll; +} + +static struct io_poll *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return io_kiocb_to_cmd(req); + return &req->apoll->poll; +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + +static inline void io_poll_remove_entry(struct io_poll *poll) +{ + struct wait_queue_head *head = smp_load_acquire(&poll->head); + + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } +} + +static void io_poll_remove_entries(struct io_kiocb *req) +{ + /* + * Nothing to do if neither of those flags are set. Avoid dipping + * into the poll/apoll/double cachelines if we can. + */ + if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) + return; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us.
However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + if (req->flags & REQ_F_SINGLE_POLL) + io_poll_remove_entry(io_poll_get_single(req)); + if (req->flags & REQ_F_DOUBLE_POLL) + io_poll_remove_entry(io_poll_get_double(req)); + rcu_read_unlock(); +} + +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action is required, which + * means either a spurious wakeup or a served multishot CQE. 0 when it's done + * with the request, then the mask is stored in req->cqe.res. + */ +static int io_poll_check_events(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int v, ret; + + /* req->task == current here, checking PF_EXITING is safe */ + if (unlikely(req->task->flags & PF_EXITING)) + return -ECANCELED; + + do { + v = atomic_read(&req->poll_refs); + + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; + + if (!req->cqe.res) { + struct poll_table_struct pt = { ._key = req->apoll_events }; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + } + + if (unlikely(!req->cqe.res)) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; + } + + ret = io_poll_issue(req, locked); + if (ret) + return ret; + + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it.
+ */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + + return 1; +} + +static void io_poll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + if (!ret) { + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); + } else { + req->cqe.res = ret; + req_set_fail(req); + } + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + req->cqe.flags = 0; + __io_req_complete_post(req); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask, + __poll_t __maybe_unused events) +{ + io_req_set_res(req, mask, 0); + /* + * This is useful for poll that is armed on behalf of another + * request, and where the wakeup path could be on a different + * CPU. We want to avoid pulling in req->apoll->events for that + * case. + */ + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res, events); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0, 0); +} + +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wqe_to_req(wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); + __poll_t mask = key_to_poll(key); + + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0, poll->events); + + /* + * If the waitqueue is being freed early but someone already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock.
+ */ + smp_store_release(&poll->head, NULL); + return 1; + } + + /* for instances that support it check for an event match first */ + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) + return 0; + + if (io_poll_get_ownership(req)) { + /* optional, saves extra locking for removal in tw handler */ + if (mask && poll->events & EPOLLONESHOT) { + list_del_init(&poll->wait.entry); + poll->head = NULL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; + } + __io_poll_execute(req, mask, poll->events); + } + return 1; +} + +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, + struct wait_queue_head *head, + struct io_poll **poll_ptr) +{ + struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; + + /* + * The file being polled uses multiple waitqueues for poll handling + * (e.g. one for read, one for write). Setup a separate io_poll + * if this happens. + */ + if (unlikely(pt->nr_entries)) { + struct io_poll *first = poll; + + /* double add on the same waitqueue head, ignore */ + if (first->head == head) + return; + /* already have a 2nd entry, fail a third attempt */ + if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; + pt->error = -EINVAL; + return; + } + + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + if (!poll) { + pt->error = -ENOMEM; + return; + } + /* mark as double wq entry */ + wqe_private |= 1; + req->flags |= REQ_F_DOUBLE_POLL; + io_init_poll_iocb(poll, first->events, first->wait.func); + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; + } + + req->flags |= REQ_F_SINGLE_POLL; + pt->nr_entries++; + poll->head = head; + poll->wait.private = (void *) wqe_private; + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); +} + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); + + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); +} + +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll *poll, + struct io_poll_table *ipt, __poll_t mask) +{ + struct io_ring_ctx *ctx = req->ctx; + int v; + + INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + io_init_poll_iocb(poll, mask, io_poll_wake); + poll->file = req->file; + + req->apoll_events = poll->events; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = 0; + ipt->nr_entries = 0; + + /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). 
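+	 *
+	 * (Wakeups firing while vfs_poll() runs below cannot take
+	 * ownership and merely bump poll_refs; the atomic_dec_return()
+	 * at the end of this function detects them and kicks task_work.)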
+ */ + atomic_set(&req->poll_refs, 1); + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + } + + spin_lock(&ctx->completion_lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) { + poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + ipt->error = 0; + } + __io_poll_execute(req, mask, poll->events); + return 0; + } + + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0, poll->events); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); +} + +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask = POLLPRI | POLLERR; + int ret; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + if (!file_can_poll(req->file)) + return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + kfree(apoll->double_poll); + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + } + apoll->double_poll = NULL; + req->apoll = apoll; + req->flags |= REQ_F_POLLED; + ipt.pt._qproc = io_async_queue_proc; + + io_kbuf_recycle(req, issue_flags); + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + if (ret || ipt.error) + return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; + + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, + mask, apoll->poll.events); + return IO_APOLL_OK; +} + +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + int i; + + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); + io_poll_cancel_req(req); + found = true; + } + } + } + spin_unlock(&ctx->completion_lock); + return found; +} + +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct hlist_head *list; + struct io_kiocb *req; + + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (cd->data != req->cqe.user_data) + continue; + if (poll_only && req->opcode != IORING_OP_POLL_ADD) + continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + return req; + } + return NULL; +} + +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + +static bool io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); + if (!req) + return -ENOENT; + io_poll_cancel_req(req); + return 0; +} + +static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, + unsigned int flags) +{ + u32 events; + + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + if (!(flags & IORING_POLL_ADD_MULTI)) + events |= EPOLLONESHOT; + return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); +} + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_update *upd = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | + IORING_POLL_ADD_MULTI)) + return -EINVAL; + /* meaningless without update */ + if (flags == IORING_POLL_ADD_MULTI) + return -EINVAL; + + upd->old_user_data = READ_ONCE(sqe->addr); + upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; + + 
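+	/* the replacement user_data rides in sqe->off and is only
+	 * meaningful with IORING_POLL_UPDATE_USER_DATA, enforced below
+	 */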
upd->new_user_data = READ_ONCE(sqe->off); + if (!upd->update_user_data && upd->new_user_data) + return -EINVAL; + if (upd->update_events) + upd->events = io_poll_parse_events(sqe, flags); + else if (sqe->poll32_events) + return -EINVAL; + + return 0; +} + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->off || sqe->addr) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~IORING_POLL_ADD_MULTI) + return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; + + io_req_set_refcount(req); + poll->events = io_poll_parse_events(sqe, flags); + return 0; +} + +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + struct io_poll_table ipt; + int ret; + + ipt.pt._qproc = io_poll_queue_proc; + + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { + req_set_fail(req); + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *preq; + int ret2, ret = 0; + bool locked; + + spin_lock(&ctx->completion_lock); + preq = io_poll_find(ctx, true, &cd); + if (!preq || !io_poll_disarm(preq)) { + spin_unlock(&ctx->completion_lock); + ret = preq ? -EALREADY : -ENOENT; + goto out; + } + spin_unlock(&ctx->completion_lock); + + if (poll_update->update_events || poll_update->update_user_data) { + /* only mask one event flags, keep behavior flags */ + if (poll_update->update_events) { + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= poll_update->events & 0xffff; + poll->events |= IO_POLL_UNMASK; + } + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; + + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2 || ret2 == -EIOCBQUEUED) + goto out; + } + + req_set_fail(preq); + io_req_set_res(preq, -ECANCELED, 0); + locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &locked); +out: + if (ret < 0) { + req_set_fail(req); + return ret; + } + /* complete update request, we're done with it */ + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/poll.h b/io_uring/poll.h new file mode 100644 index 00000000000000..cc75c1567a84ae --- /dev/null +++ b/io_uring/poll.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +enum { + IO_APOLL_OK, + IO_APOLL_ABORTED, + IO_APOLL_READY +}; + +struct io_poll { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + struct wait_queue_entry wait; +}; + +struct async_poll { + struct io_poll poll; + struct io_poll *double_poll; +}; + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); +bool io_poll_remove_all(struct io_ring_ctx 
*ctx, struct task_struct *tsk, + bool cancel_all); From 7aaff708a768144ec6459f0a58301be1a6b982fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:36:47 -0600 Subject: [PATCH 050/172] io_uring: move cancelation into its own file This also helps cleanup the io_uring.h cancel parts, as we can make things static in the cancel.c file, mostly. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/cancel.c | 194 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 6 ++ io_uring/io_uring.c | 177 +--------------------------------------- io_uring/io_uring.h | 1 - io_uring/timeout.c | 1 + 6 files changed, 204 insertions(+), 178 deletions(-) create mode 100644 io_uring/cancel.c create mode 100644 io_uring/cancel.h diff --git a/io_uring/Makefile b/io_uring/Makefile index eb1b07a9951639..cfd61e6b7759f7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o + sqpoll.o fdinfo.o tctx.o poll.o \ + cancel.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c new file mode 100644 index 00000000000000..83cceb52d82d64 --- /dev/null +++ b/io_uring/cancel.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "tctx.h" +#include "poll.h" +#include "timeout.h" +#include "cancel.h" + +struct io_cancel { + struct file *file; + u64 addr; + u32 flags; + s32 fd; +}; + +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; + + if (req->ctx != cd->ctx) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; +} + +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) +{ + enum io_wq_cancel cancel_ret; + int ret = 0; + bool all; + + if (!tctx || !tctx->io_wq) + return -ENOENT; + + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); + switch (cancel_ret) { + case IO_WQ_CANCEL_OK: + ret = 0; + break; + case IO_WQ_CANCEL_RUNNING: + ret = -EALREADY; + break; + case IO_WQ_CANCEL_NOTFOUND: + ret = -ENOENT; + break; + } + + return ret; +} + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + + ret = io_async_cancel_one(req->task->io_uring, cd); + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. 
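+ *
+ * (-EALREADY means io-wq found the work already running; the poll and
+ * timeout cancel paths below may still be able to complete the cancel.)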
+ */ + if (!ret) + return 0; + + spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, cd); + if (ret != -ENOENT) + goto out; + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); +out: + spin_unlock(&ctx->completion_lock); + return ret; +} + + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sqe->off || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) + return -EINVAL; + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->fd = READ_ONCE(sqe->fd); + } + + return 0; +} + +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) +{ + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; + struct io_tctx_node *node; + int ret, nr = 0; + + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, issue_flags); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } + } + io_ring_submit_unlock(ctx, issue_flags); + return all ? nr : ret; +} + +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = cancel->addr, + .flags = cancel->flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, cancel->fd, + issue_flags); + else + req->file = io_file_get_normal(req, cancel->fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + + ret = __io_async_cancel(&cd, req, issue_flags); +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h new file mode 100644 index 00000000000000..4f35d86963253f --- /dev/null +++ b/io_uring/cancel.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b4ecfa7238555f..fb4f3ffa58c886 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -108,6 +108,7 @@ #include "msg_ring.h" #include "timeout.h" #include "poll.h" +#include "cancel.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -210,13 +211,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions! 
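* (Helpers such as io_kiocb_to_cmd() presumably rely on this shared
* first-member layout when casting the per-opcode command structs.)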
See also 'struct kiocb' in */ -struct io_cancel { - struct file *file; - u64 addr; - u32 flags; - s32 fd; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -3362,175 +3356,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static bool io_cancel_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; -} - -static int io_async_cancel_one(struct io_uring_task *tctx, - struct io_cancel_data *cd) -{ - enum io_wq_cancel cancel_ret; - int ret = 0; - bool all; - - if (!tctx || !tctx->io_wq) - return -ENOENT; - - all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); - switch (cancel_ret) { - case IO_WQ_CANCEL_OK: - ret = 0; - break; - case IO_WQ_CANCEL_RUNNING: - ret = -EALREADY; - break; - case IO_WQ_CANCEL_NOTFOUND: - ret = -ENOENT; - break; - } - - return ret; -} - -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - - ret = io_async_cancel_one(req->task->io_uring, cd); - /* - * Fall-through even for -EALREADY, as we may have poll armed - * that need unarming. 
- */ - if (!ret) - return 0; - - spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, cd); - if (ret != -ENOENT) - goto out; - if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) - ret = io_timeout_cancel(ctx, cd); -out: - spin_unlock(&ctx->completion_lock); - return ret; -} - -#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) - -static int io_async_cancel_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - cancel->addr = READ_ONCE(sqe->addr); - cancel->flags = READ_ONCE(sqe->cancel_flags); - if (cancel->flags & ~CANCEL_FLAGS) - return -EINVAL; - if (cancel->flags & IORING_ASYNC_CANCEL_FD) { - if (cancel->flags & IORING_ASYNC_CANCEL_ANY) - return -EINVAL; - cancel->fd = READ_ONCE(sqe->fd); - } - - return 0; -} - -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, - unsigned int issue_flags) -{ - bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - struct io_ring_ctx *ctx = cd->ctx; - struct io_tctx_node *node; - int ret, nr = 0; - - do { - ret = io_try_cancel(req, cd); - if (ret == -ENOENT) - break; - if (!all) - return ret; - nr++; - } while (1); - - /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, issue_flags); - ret = -ENOENT; - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); - if (ret != -ENOENT) { - if (!all) - break; - nr++; - } - } - io_ring_submit_unlock(ctx, issue_flags); - return all ? nr : ret; -} - -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = cancel->addr, - .flags = cancel->flags, - .seq = atomic_inc_return(&req->ctx->cancel_seq), - }; - int ret; - - if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, cancel->fd, - issue_flags); - else - req->file = io_file_get_normal(req, cancel->fd); - if (!req->file) { - ret = -EBADF; - goto done; - } - cd.file = req->file; - } - - ret = __io_async_cancel(&cd, req, issue_flags); -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1ceac4ea62bf3b..a78e3c5ab109b8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -157,7 +157,6 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5e42bfcd683e2a..69cca42d6835b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -11,6 +11,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "cancel.h" #include "timeout.h" struct io_timeout { From 3b77495a97239faa27989f946d29b6be7dd091e9 Mon Sep 17 
00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:07:23 -0600 Subject: [PATCH 051/172] io_uring: split provided buffers handling into its own file Move both the opcodes related to it, and the internals code dealing with it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 602 +------------------------------------------- io_uring/io_uring.h | 36 +-- io_uring/kbuf.c | 524 ++++++++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 142 +++++++++++ io_uring/net.c | 1 + io_uring/poll.c | 1 + 7 files changed, 672 insertions(+), 636 deletions(-) create mode 100644 io_uring/kbuf.c create mode 100644 io_uring/kbuf.h diff --git a/io_uring/Makefile b/io_uring/Makefile index cfd61e6b7759f7..b85418b64e8241 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o + cancel.o kbuf.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb4f3ffa58c886..e395167999edfb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" +#include "kbuf.h" #include "xattr.h" #include "nop.h" @@ -171,42 +172,10 @@ struct io_rsrc_data { bool quiesce; }; -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. - */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -#define BGID_ARRAY 64 - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -226,15 +195,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -399,110 +359,6 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. 
If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2296,96 +2152,6 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, req->imu); } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u16 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) - return NULL; - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED 
|| !file_can_poll(req->file)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). - */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -3098,258 +2864,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; 
- p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. - */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 
0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -5218,8 +4732,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; @@ -5543,33 +5056,6 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; @@ -6953,89 +6439,6 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return ret; } -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; - struct page **pages; - int nr_pages; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = 
io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return -ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -7701,7 +7104,6 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a78e3c5ab109b8..172defdcfdbe40 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -99,42 +99,8 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); -unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); -static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. 
- */ - if (req->opcode == IORING_OP_READV) - return; - - __io_kbuf_recycle(req, issue_flags); -} +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c new file mode 100644 index 00000000000000..bc58890d932b2f --- /dev/null +++ b/io_uring/kbuf.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" + +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) + +#define BGID_ARRAY 64 + +struct io_provide_buf { + struct file *file; + __u64 addr; + __u32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + +static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) +{ + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; + + return xa_load(&ctx->io_bl_xa, bgid); +} + +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + struct io_buffer *buf; + + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; + } + + io_ring_submit_lock(ctx, issue_flags); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->buf_index = buf->bgid; + + io_ring_submit_unlock(ctx, issue_flags); +} + +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) +{ + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; + + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); + if (*len > kbuf->len) + *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u16 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) + return NULL; + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; + buf = page_address(bl->buf_pages[index]); + buf += off; + } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; + + if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { + /* + * If we came in unlocked, 
we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); +} + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; +} + +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + + /* the head kbuf is the list itself */ + while (!list_empty(&bl->buf_list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&nxt->list); + if (++i == nbufs) + return i; + cond_resched(); + } + i++; + + return i; +} + +void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + struct io_buffer_list *bl; + unsigned long index; + int i; + + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } + + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); + } + + while (!list_empty(&ctx->io_buffers_pages)) { + struct page *page; + + page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); + list_del_init(&page->lru); + __free_page(page); + } +} + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + ret = -ENOENT; + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } + if (ret < 
0) + req_set_fail(req); + + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned long size, tmp_check; + struct io_provide_buf *p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, + &size)) + return -EOVERFLOW; + if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) + return -EOVERFLOW; + + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. + */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. + */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer_list *bl) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) + break; + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_move_tail(&buf->list, &bl->buf_list); + buf->addr = addr; + buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); + buf->bid = bid; + buf->bgid = pbuf->bgid; + addr += pbuf->len; + bid++; + cond_resched(); + } + + return i ? 
0 : -ENOMEM; +} + +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } + + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; + } + + ret = io_add_buffers(ctx, p, bl); +err: + if (ret < 0) + req_set_fail(req); + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + /* cannot disambiguate full vs empty due to head/tail size */ + if (reg.ring_entries >= 65536) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(free_bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h new file mode 100644 index 00000000000000..9da3a933ef40e1 --- /dev/null +++ b/io_uring/kbuf.h @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_KBUF_H +#define IOU_KBUF_H + +#include + +struct io_buffer_list { + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. 
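+ *
+ * (A given bgid is one kind or the other, never both:
+ * io_register_pbuf_ring() fails with -EEXIST if classic buffers
+ * already exist, and provide/remove reject ring-mapped groups.)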
+ */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; + __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u16 head; + __u16 mask; +}; + +struct io_buffer { + struct list_head list; + __u64 addr; + __u32 len; + __u16 bid; + __u16 bgid; +}; + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +void io_destroy_buffers(struct io_ring_ctx *ctx); + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +{ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); +} + +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ + lockdep_assert_held(&req->ctx->completion_lock); + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + unsigned int cflags; + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
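
On the completion side, __io_put_kbuf() above encodes the consumed buffer into the CQE as IORING_CQE_F_BUFFER plus the buffer ID shifted by IORING_CQE_BUFFER_SHIFT. The matching userspace decode, sketched against the uapi constants (consume() is a placeholder, not a real API):

    #include <liburing.h>

    void consume(unsigned int bid, int len);    /* placeholder */

    static void handle_cqe(struct io_uring_cqe *cqe)
    {
        if (cqe->flags & IORING_CQE_F_BUFFER) {
            unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

            /* cqe->res bytes landed in buffer 'bid' of the group
             * the sqe selected via sqe->buf_group */
            consume(bid, cqe->res);
        }
    }
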
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; +} +#endif diff --git a/io_uring/net.c b/io_uring/net.c index 2434548d0c1fb3..fe1fe920b9291f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "kbuf.h" #include "net.h" #if defined(CONFIG_NET) diff --git a/io_uring/poll.c b/io_uring/poll.c index c3e4fcb0a7ba7b..b80f7fa261232d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -17,6 +17,7 @@ #include "io_uring.h" #include "refs.h" #include "opdef.h" +#include "kbuf.h" #include "poll.h" struct io_poll_update { From 73572984481907d92673255b494c0ff4f77c8ed4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:12:45 -0600 Subject: [PATCH 052/172] io_uring: move rsrc related data, core, and commands Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1644 +++--------------------------------------- io_uring/io_uring.h | 6 +- io_uring/openclose.c | 1 + io_uring/rsrc.c | 1320 +++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 155 ++++ 6 files changed, 1595 insertions(+), 1533 deletions(-) create mode 100644 io_uring/rsrc.c create mode 100644 io_uring/rsrc.h diff --git a/io_uring/Makefile b/io_uring/Makefile index b85418b64e8241..360a83039c2a4f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o + cancel.o kbuf.o rsrc.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e395167999edfb..0c47c919887f52 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include @@ -94,6 +93,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" +#include "rsrc.h" #include "xattr.h" #include "nop.h" @@ -114,17 +114,9 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -140,38 +132,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn 
*do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -188,13 +148,6 @@ struct io_rw { rwf_t flags; }; -struct io_rsrc_update { - struct file *file; - u64 arg; - u32 nr_args; - u32 offset; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -208,11 +161,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -enum { - IORING_RSRC_FILE = 0, - IORING_RSRC_BUFFER = 1, -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -233,12 +181,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_dismantle_req(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -268,22 +212,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -#if defined(CONFIG_UNIX) -static inline bool io_file_need_scm(struct file *filp) -{ -#if defined(IO_URING_SCM_ALL) - return true; -#else - return !!unix_get_socket(filp); -#endif -} -#else -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} -#endif - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -298,67 +226,6 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } -#define IO_RSRC_REF_BATCH 100 - -static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - percpu_ref_put_many(&node->refs, nr); -} - -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } -} - -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); -} - -static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - -static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } - } -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2870,92 +2737,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_files_update_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & 
(REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - up->offset = READ_ONCE(sqe->off); - up->nr_args = READ_ONCE(sqe->len); - if (!up->nr_args) - return -EINVAL; - up->arg = READ_ONCE(sqe->addr); - return 0; -} - -static int io_files_update_with_index_alloc(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - __s32 __user *fds = u64_to_user_ptr(up->arg); - unsigned int done; - struct file *file; - int ret, fd; - - if (!req->ctx->file_data) - return -ENXIO; - - for (done = 0; done < up->nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { - ret = -EFAULT; - break; - } - - file = fget(fd); - if (!file) { - ret = -EBADF; - break; - } - ret = io_fixed_fd_install(req, issue_flags, file, - IORING_FILE_INDEX_ALLOC); - if (ret < 0) - break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); - ret = -EFAULT; - break; - } - } - - if (done) - return done; - return ret; -} - -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up2; - int ret; - - up2.offset = up->offset; - up2.data = up->arg; - up2.nr = 0; - up2.tags = 0; - up2.resv = 0; - up2.resv2 = 0; - - if (up->offset == IORING_FILE_INDEX_ALLOC) { - ret = io_files_update_with_index_alloc(req, issue_flags); - } else { - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up2, up->nr_args); - io_ring_submit_unlock(ctx, issue_flags); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -3696,7 +3477,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return -1; } -static int io_run_task_work_sig(void) +int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; @@ -3798,1265 +3579,164 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void io_free_page_table(void **table, size_t size) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; + if (file_slot->file_ptr) { + struct file *old_file; - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; } - return table; -} -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static void io_mem_free(void *ptr) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; + struct page *page; - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + if (!ptr) + return; - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + page = virt_to_head_page(ptr); + if (put_page_testzero(page)) + free_compound_page(page); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static void *io_mem_alloc(size_t size) { - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - 
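
rings_size() computes the shared-memory layout defensively: the CQE array first (doubled under IORING_SETUP_CQE32), cache-line aligned, then the SQ index array, returning SIZE_MAX on any overflow. A toy re-derivation of that pattern in plain C, using compiler builtins instead of the kernel's check_*_overflow() helpers — a sketch, not the kernel function:

    #include <stddef.h>
    #include <stdint.h>

    static size_t ring_bytes(size_t hdr, size_t cqe_sz,
                             unsigned int cq, unsigned int sq)
    {
        size_t off, sq_sz;

        if (__builtin_mul_overflow((size_t)cq, cqe_sz, &off) ||
            __builtin_add_overflow(off, hdr, &off))
            return SIZE_MAX;
        off = (off + 63) & ~(size_t)63;     /* cache-line align */
        if (off == 0)                       /* alignment wrapped */
            return SIZE_MAX;
        if (__builtin_mul_overflow((size_t)sq, sizeof(uint32_t), &sq_sz) ||
            __builtin_add_overflow(off, sq_sz, &off))
            return SIZE_MAX;
        return off;
    }
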
INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; + return (void *) __get_free_pages(gfp, get_order(size)); } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + struct io_rings *rings; + size_t off, sq_array_size; - io_rsrc_refs_drop(ctx); + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); + if (sq_offset) + *sq_offset = off; - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + return off; } -static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) { - int ret; - - /* As we may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - data->quiesce = true; - do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { - break; - } - } + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; - ret = io_run_task_work_sig(); - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); - data->quiesce = false; + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; - return ret; + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; } -static u64 
*io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +static void io_eventfd_put(struct rcu_head *rcu) { - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - return &data->tags[table_idx][off]; + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); } -static void io_rsrc_data_free(struct io_rsrc_data *data) +static int io_eventfd_unregister(struct io_ring_ctx *ctx) { - size_t size = data->nr * sizeof(data->tags[0][0]); + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + call_rcu(&ev_fd->rcu, io_eventfd_put); + return 0; + } - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); + return -ENXIO; } -static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = -ENOMEM; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->do_put = do_put; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - - atomic_set(&data->refs, 1); - init_completion(&data->done); - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; -} - -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if !defined(IO_URING_SCM_ALL) - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); - } -#endif - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif - io_free_file_tables(&ctx->file_table); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; -} - -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. 
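
io_get_tag_slot() treats data->tags as a two-level table: with 4K pages and 8-byte tags, IO_RSRC_TAG_TABLE_SHIFT is PAGE_SHIFT - 3 = 9, so each page holds 512 tags. A worked example of the indexing for resource index 1000:

    #include <stdio.h>

    #define TAG_TABLE_SHIFT 9   /* PAGE_SHIFT - 3 with 4K pages */
    #define TAG_TABLE_MASK  ((1U << TAG_TABLE_SHIFT) - 1)

    int main(void)
    {
        unsigned int idx = 1000;

        printf("page %u, slot %u\n",
               idx >> TAG_TABLE_SHIFT,      /* 1 */
               idx & TAG_TABLE_MASK);       /* 1000 - 512 = 488 */
        return 0;
    }

so the tag lives at tags[1][488], and each 4K page of the table covers 512 consecutive indices.
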
- */ -static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - if (!io_file_need_scm(file)) { - fput(file); - return; - } - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(file); -#endif -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -static void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = 
llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args, u64 __user *tags) -{ - __s32 __user *fds = (__s32 __user *) arg; - struct file *file; - int fd, ret; - unsigned i; - - if (ctx->file_data) - return -EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - if (nr_args > rlimit(RLIMIT_NOFILE)) - return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; - - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; - goto fail; - } - /* allow sparse sets */ - if (!fds || fd == -1) { - ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto fail; - continue; - } - - file = fget(fd); - ret = -EBADF; - if (unlikely(!file)) - goto fail; - - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. - */ - if (io_is_uring_fops(file)) { - fput(file); - goto fail; - } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - - io_rsrc_node_switch(ctx, NULL); - return 0; -fail: - __io_sqe_files_unregister(ctx); - return ret; -} - -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) -{ - u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - - prsrc->tag = *tag_slot; - *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; -} - -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - 
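
io_sqe_files_register() allows sparse sets: a slot passed as -1 stays empty (unless it carries a tag) and can be populated later. A rough liburing sketch of registering such a table and issuing I/O against a fixed slot — the filename is illustrative and the ring is assumed to be set up:

    #include <liburing.h>
    #include <fcntl.h>

    static char buf[512];

    static void use_fixed_file(struct io_uring *ring)
    {
        int fds[4] = { open("data", O_RDONLY), -1, -1, -1 }; /* sparse tail */
        struct io_uring_sqe *sqe;

        io_uring_register_files(ring, fds, 4);

        /* I/O now targets slot 0 of the table, not a regular fd */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
        sqe->flags |= IOSQE_FIXED_FILE;
        io_uring_submit(ring);
    }
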
io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; - struct file *file; - int fd, i, err = 0; - unsigned int done; - bool needs_switch = false; - - if (!ctx->file_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || - copy_from_user(&fd, &fds[done], sizeof(fd))) { - err = -EFAULT; - break; - } - if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { - err = -EINVAL; - break; - } - if (fd == IORING_REGISTER_FILES_SKIP) - continue; - - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); - if (err) - break; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; - } - if (fd != -1) { - file = fget(fd); - if (!file) { - err = -EBADF; - break; - } - /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. - */ - if (io_is_uring_fops(file)) { - fput(file); - err = -EBADF; - break; - } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); - return done ? 
done : err; -} - -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) -{ - unsigned long page_limit, cur_pages, new_pages; - - /* Don't allow more pages than we can safely lock */ - page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - do { - cur_pages = atomic_long_read(&user->locked_vm); - new_pages = cur_pages + nr_pages; - if (new_pages > page_limit) - return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - - return 0; -} - -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); - - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); -} - -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - int ret; - - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); - if (ret) - return ret; - } - - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - - return 0; -} - -static void io_mem_free(void *ptr) -{ - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); -} - -static void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - - return (void *) __get_free_pages(gfp, get_order(size)); -} - -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; - - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } - -#ifdef CONFIG_SMP - off = ALIGN(off, SMP_CACHE_BYTES); - if (off == 0) - return SIZE_MAX; -#endif - - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) -{ - struct io_mapped_ubuf *imu = *slot; - unsigned int i; - - if (imu != ctx->dummy_ubuf) { - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } - *slot = NULL; -} - -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
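
__io_account_mem() enforces RLIMIT_MEMLOCK with a lock-free read/compare-exchange loop rather than a lock. The same pattern in portable C11 atomics (a sketch; the kernel uses atomic_long_cmpxchg()):

    #include <stdatomic.h>
    #include <stdbool.h>

    static bool account_pages(atomic_long *locked, long limit, long nr)
    {
        long cur = atomic_load(locked);

        do {
            if (cur + nr > limit)
                return false;   /* would exceed RLIMIT_MEMLOCK */
        } while (!atomic_compare_exchange_weak(locked, &cur, cur + nr));

        return true;
    }

atomic_compare_exchange_weak() reloads cur on failure, so each retry re-checks the limit against the fresh count, exactly like the kernel loop.
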
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; -} - -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - -/* - * Not super efficient, but this is just a registration time. And we do cache - * the last compound head, so generally we'll only do a full search if we don't - * match that one. - * - * We check if the given compound head page has already been accounted, to - * avoid double accounting it. This allows us to account the full size of the - * page, not just the constituent pages of a huge page. - */ -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct page *hpage) -{ - int i, j; - - /* check current page array */ - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) - continue; - if (compound_head(pages[i]) == hpage) - return true; - } - - /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) { - if (!PageCompound(imu->bvec[j].bv_page)) - continue; - if (compound_head(imu->bvec[j].bv_page) == hpage) - return true; - } - } - - return false; -} - -static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct io_mapped_ubuf *imu, - struct page **last_hpage) -{ - int i, ret; - - imu->acct_pages = 0; - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) { - imu->acct_pages++; - } else { - struct page *hpage; - - hpage = compound_head(pages[i]); - if (hpage == *last_hpage) - continue; - *last_hpage = hpage; - if (headpage_already_acct(ctx, pages, i, hpage)) - continue; - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; - } - } - - if (!imu->acct_pages) - return 0; - - ret = io_account_mem(ctx, imu->acct_pages); - if (ret) - imu->acct_pages = 0; - return ret; -} - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; - struct page **pages = NULL; - int i, pret, ret = -ENOMEM; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto done; - - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - - ret = 0; - mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - *npages = nr_pages; - } else { - ret = pret < 0 ? 
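
io_pin_pages() converts an arbitrary user range into a page count by rounding the end up and the start down, so a buffer that straddles a page boundary pins both pages. A worked example for a 32-byte buffer at 0x1ff0 with 4K pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned long ubuf = 0x1ff0, len = 32;  /* crosses 0x2000 */
        unsigned long end = (ubuf + len + 4096 - 1) >> 12;  /* 3 */
        unsigned long start = ubuf >> 12;                   /* 1 */

        printf("%lu pages pinned\n", end - start);          /* 2 */
        return 0;
    }

FOLL_LONGTERM marks the pin as indefinite, which is also why file-backed VMAs (other than shmem and hugetlbfs) are rejected just below.
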
pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - kvfree(vmas); - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; -} - -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) -{ - struct io_mapped_ubuf *imu = NULL; - struct page **pages = NULL; - unsigned long off; - size_t size; - int ret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - *pimu = NULL; - ret = -ENOMEM; - - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, - &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - pages = NULL; - goto done; - } - - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); - goto done; - } - - off = (unsigned long) iov->iov_base & ~PAGE_MASK; - size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; - imu->nr_bvecs = nr_pages; - *pimu = imu; - ret = 0; -done: - if (ret) - kvfree(imu); - kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; -} - -static int io_buffer_validate(struct iovec *iov) -{ - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? 
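
The bvec table that io_sqe_buffer_register() builds from those pinned pages is what lets READ_FIXED/WRITE_FIXED skip per-I/O pinning entirely. The userspace half, sketched with liburing ('ring' and 'fd' are assumed to exist):

    #include <liburing.h>

    static char buf[65536];

    static void read_with_fixed_buffer(struct io_uring *ring, int fd)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct io_uring_sqe *sqe;

        io_uring_register_buffers(ring, &iov, 1);

        sqe = io_uring_get_sqe(ring);
        /* the final argument indexes the registered buffer table */
        io_uring_prep_read_fixed(sqe, fd, buf, sizeof(buf), 0, 0);
        io_uring_submit(ring);
    }

The I/O must stay inside the registered range, which is why imu->ubuf and imu->ubuf_end are stored for later verification.
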
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; -} - -static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args, u64 __user *tags) -{ - struct page *last_hpage = NULL; - struct io_rsrc_data *data; - int i, ret; - struct iovec iov; - - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) - return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); - if (ret) - return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; - } else { - memset(&iov, 0, sizeof(iov)); - } - - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; - } - - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) - break; - } - - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); - return ret; -} - -static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned int nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); - struct page *last_hpage = NULL; - bool needs_switch = false; - __u32 done; - int i, err; - - if (!ctx->buf_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; - int offset = up->offset + done; - u64 tag = 0; - - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) - break; - if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { - err = -EFAULT; - break; - } - err = io_buffer_validate(&iov); - if (err) - break; - if (!iov.iov_base && tag) { - err = -EINVAL; - break; - } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); - break; - } - ctx->user_bufs[i] = NULL; - needs_switch = true; - } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; - } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); - return done ? 
done : err; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -static void io_eventfd_put(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - call_rcu(&ev_fd->rcu, io_eventfd_put); - return 0; - } - - return -ENXIO; -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) +static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; int nr = 0; @@ -5078,12 +3758,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - static void io_flush_apoll_cache(struct io_ring_ctx *ctx) { struct async_poll *apoll; @@ -6228,89 +4902,6 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - __u32 tmp; - int err; - - if (check_add_overflow(up->offset, nr_args, &tmp)) - return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - - switch (type) { - case IORING_RSRC_FILE: - return __io_sqe_files_update(ctx, up, nr_args); - case IORING_RSRC_BUFFER: - return __io_sqe_buffers_update(ctx, up, nr_args); - } - return -EINVAL; -} - -static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update2 up; - - if (!nr_args) - return -EINVAL; - memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT; - if (up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); -} - -static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size, unsigned type) -{ - struct io_uring_rsrc_update2 up; - - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (!up.nr || up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, up.nr); -} - -static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size, unsigned int type) -{ - struct io_uring_rsrc_register rr; - - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; - - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; - if (!rr.nr || rr.resv2) - return -EINVAL; - if (rr.flags & 
~IORING_RSRC_REGISTER_SPARSE) - return -EINVAL; - - switch (type) { - case IORING_RSRC_FILE: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - } - return -EINVAL; -} - static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { @@ -7103,7 +5694,6 @@ static int __init io_uring_init(void) sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ - BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 172defdcfdbe40..090c17deba9db2 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline bool io_run_task_work(void) return false; } +int io_run_task_work_sig(void); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); @@ -110,11 +111,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fa35bd56a33086..1cbf3903097053 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -14,6 +14,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" #include "openclose.h" struct io_open { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c new file mode 100644 index 00000000000000..8c40b20659d40a --- /dev/null +++ b/io_uring/rsrc.c @@ -0,0 +1,1320 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" +#include "rsrc.h" + +struct io_rsrc_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage); + +#define IO_RSRC_REF_BATCH 100 + +/* only define max */ +#define IORING_MAX_FIXED_FILES (1U << 20) +#define IORING_MAX_REG_BUFFERS (1U << 14) + +void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} + +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can 
safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->user) + __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->mm_account) + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + int ret; + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->mm_account) + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + + return 0; +} + +static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ + unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + if (!iov->iov_base) + return iov->iov_len ? -EFAULT : 0; + if (!iov->iov_len) + return -EFAULT; + + /* arbitrary limit, but we need something */ + if (iov->iov_len > SZ_1G) + return -EFAULT; + + if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + return -EOVERFLOW; + + return 0; +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +{ + struct io_mapped_ubuf *imu = *slot; + unsigned int i; + + if (imu != ctx->dummy_ubuf) { + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + } + *slot = NULL; +} + +void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) +{ + struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_put *prsrc, *tmp; + + list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + list_del(&prsrc->list); + + if (prsrc->tag) { + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + + spin_lock(&ctx->completion_lock); + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); + } + + rsrc_data->do_put(ctx, prsrc); + kfree(prsrc); + } + + io_rsrc_node_destroy(ref_node); + if (atomic_dec_and_test(&rsrc_data->refs)) + complete(&rsrc_data->done); +} + +void io_rsrc_put_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + struct llist_node *node; + + ctx = container_of(work, struct io_ring_ctx, 
rsrc_put_work.work); + node = llist_del_all(&ctx->rsrc_put_llist); + + while (node) { + struct io_rsrc_node *ref_node; + struct llist_node *next = node->next; + + ref_node = llist_entry(node, struct io_rsrc_node, llist); + __io_rsrc_put_work(ref_node); + node = next; + } +} + +void io_wait_rsrc_data(struct io_rsrc_data *data) +{ + if (data && !atomic_dec_and_test(&data->refs)) + wait_for_completion(&data->done); +} + +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); +} + +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + unsigned long delay = HZ; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + /* if we are mid-quiesce then do not delay */ + if (node->rsrc_data->quiesce) + delay = 0; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(void) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) +{ + WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + + io_rsrc_refs_drop(ctx); + + if (data_to_kill) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_node; + + rsrc_node->rsrc_data = data_to_kill; + spin_lock_irq(&ctx->rsrc_ref_lock); + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); + spin_unlock_irq(&ctx->rsrc_ref_lock); + + atomic_inc(&data_to_kill->refs); + percpu_ref_kill(&rsrc_node->refs); + ctx->rsrc_node = NULL; + } + + if (!ctx->rsrc_node) { + ctx->rsrc_node = ctx->rsrc_backup_node; + ctx->rsrc_backup_node = NULL; + } +} + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (ctx->rsrc_backup_node) + return 0; + ctx->rsrc_backup_node = io_rsrc_node_alloc(); + return ctx->rsrc_backup_node ? 
0 : -ENOMEM; +} + +__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) +{ + int ret; + + /* As we may drop ->uring_lock, other task may have started quiesce */ + if (data->quiesce) + return -ENXIO; + + data->quiesce = true; + do { + ret = io_rsrc_node_switch_start(ctx); + if (ret) + break; + io_rsrc_node_switch(ctx, data); + + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + break; + mutex_unlock(&ctx->uring_lock); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); + if (!ret) { + mutex_lock(&ctx->uring_lock); + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } + } + + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); + reinit_completion(&data->done); + + ret = io_run_task_work_sig(); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; + + return ret; +} + +static void io_free_page_table(void **table, size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + + for (i = 0; i < nr_tables; i++) + kfree(table[i]); + kfree(table); +} + +static void io_rsrc_data_free(struct io_rsrc_data *data) +{ + size_t size = data->nr * sizeof(data->tags[0][0]); + + if (data->tags) + io_free_page_table((void **)data->tags, size); + kfree(data); +} + +static __cold void **io_alloc_page_table(size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + size_t init_size = size; + void **table; + + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); + if (!table) + return NULL; + + for (i = 0; i < nr_tables; i++) { + unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); + if (!table[i]) { + io_free_page_table(table, init_size); + return NULL; + } + size -= this_size; + } + return table; +} + +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, + rsrc_put_fn *do_put, u64 __user *utags, + unsigned nr, struct io_rsrc_data **pdata) +{ + struct io_rsrc_data *data; + int ret = -ENOMEM; + unsigned i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); + if (!data->tags) { + kfree(data); + return -ENOMEM; + } + + data->nr = nr; + data->ctx = ctx; + data->do_put = do_put; + if (utags) { + ret = -EFAULT; + for (i = 0; i < nr; i++) { + u64 *tag_slot = io_get_tag_slot(data, i); + + if (copy_from_user(tag_slot, &utags[i], + sizeof(*tag_slot))) + goto fail; + } + } + + atomic_set(&data->refs, 1); + init_completion(&data->done); + *pdata = data; + return 0; +fail: + io_rsrc_data_free(data); + return ret; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + __s32 __user *fds = u64_to_user_ptr(up->data); + struct io_rsrc_data *data = ctx->file_data; + struct io_fixed_file *file_slot; + struct file *file; + int fd, i, err = 0; + unsigned int done; + bool needs_switch = false; + + if (!ctx->file_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_files) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || + copy_from_user(&fd, &fds[done], 
sizeof(fd))) { + err = -EFAULT; + break; + } + if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { + err = -EINVAL; + break; + } + if (fd == IORING_REGISTER_FILES_SKIP) + continue; + + i = array_index_nospec(up->offset + done, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, i); + + if (file_slot->file_ptr) { + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + if (err) + break; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); + needs_switch = true; + } + if (fd != -1) { + file = fget(fd); + if (!file) { + err = -EBADF; + break; + } + /* + * Don't allow io_uring instances to be registered. If + * UNIX isn't enabled, then this causes a reference + * cycle and this instance can never get freed. If UNIX + * is enabled we'll handle it just fine, but there's + * still no point in allowing a ring fd as it doesn't + * support regular read/write anyway. + */ + if (io_is_uring_fops(file)) { + fput(file); + err = -EBADF; + break; + } + err = io_scm_file_account(ctx, file); + if (err) { + fput(file); + break; + } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + } + + if (needs_switch) + io_rsrc_node_switch(ctx, data); + return done ? done : err; +} + +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned int nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct page *last_hpage = NULL; + bool needs_switch = false; + __u32 done; + int i, err; + + if (!ctx->buf_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_bufs) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + struct io_mapped_ubuf *imu; + int offset = up->offset + done; + u64 tag = 0; + + err = io_copy_iov(ctx, &iov, iovs, done); + if (err) + break; + if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { + err = -EFAULT; + break; + } + err = io_buffer_validate(&iov); + if (err) + break; + if (!iov.iov_base && tag) { + err = -EINVAL; + break; + } + err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + if (err) + break; + + i = array_index_nospec(offset, ctx->nr_user_bufs); + if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + err = io_queue_rsrc_removal(ctx->buf_data, i, + ctx->rsrc_node, ctx->user_bufs[i]); + if (unlikely(err)) { + io_buffer_unmap(ctx, &imu); + break; + } + ctx->user_bufs[i] = NULL; + needs_switch = true; + } + + ctx->user_bufs[i] = imu; + *io_get_tag_slot(ctx->buf_data, offset) = tag; + } + + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->buf_data); + return done ? 
done : err; +} + +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + __u32 tmp; + int err; + + if (check_add_overflow(up->offset, nr_args, &tmp)) + return -EOVERFLOW; + err = io_rsrc_node_switch_start(ctx); + if (err) + return err; + + switch (type) { + case IORING_RSRC_FILE: + return __io_sqe_files_update(ctx, up, nr_args); + case IORING_RSRC_BUFFER: + return __io_sqe_buffers_update(ctx, up, nr_args); + } + return -EINVAL; +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update2 up; + + if (!nr_args) + return -EINVAL; + memset(&up, 0, sizeof(up)); + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); +} + +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type) +{ + struct io_uring_rsrc_update2 up; + + if (size != sizeof(up)) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (!up.nr || up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, type, &up, up.nr); +} + +__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type) +{ + struct io_uring_rsrc_register rr; + + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + + memset(&rr, 0, sizeof(rr)); + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) + return -EINVAL; + + switch (type) { + case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + } + return -EINVAL; +} + +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) + return -EINVAL; + up->arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); + unsigned int done; + struct file *file; + int ret, fd; + + if (!req->ctx->file_data) + return -ENXIO; + + for (done = 0; done < up->nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + __io_close_fixed(req, issue_flags, ret); + ret = -EFAULT; + break; + } + } + + if (done) + return done; + return ret; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct 
io_uring_rsrc_update2 up2; + int ret; + + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; + + if (up->offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up2, up->nr_args); + io_ring_submit_unlock(ctx, issue_flags); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) +{ + u64 *tag_slot = io_get_tag_slot(data, idx); + struct io_rsrc_put *prsrc; + + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + + prsrc->tag = *tag_slot; + *tag_slot = 0; + prsrc->rsrc = rsrc; + list_add(&prsrc->list, &node->rsrc_list); + return 0; +} + +void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(&ctx->file_table, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#endif + io_free_file_tables(&ctx->file_table); + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; +} + +int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_files; + int ret; + + if (!ctx->file_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. + */ + ctx->nr_user_files = 0; + ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + ctx->nr_user_files = nr; + if (!ret) + __io_sqe_files_unregister(ctx); + return ret; +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. + */ +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; + struct scm_fp_list *fpl; + struct sk_buff *skb; + + if (likely(!io_file_need_scm(file))) + return 0; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. 
+ */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); + + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + fpl->user = get_uid(current_user()); + fpl->max = SCM_MAX_FD; + fpl->count = 0; + + UNIXCB(skb).fp = fpl; + skb->sk = sk; + skb->destructor = unix_destruct_scm; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + } + + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif + return 0; +} + +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + if (!io_file_need_scm(file)) { + fput(file); + return; + } + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags) +{ + __s32 __user *fds = (__s32 __user *) arg; + struct file *file; + int fd, ret; + unsigned i; + + if (ctx->file_data) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + &ctx->file_data); + if (ret) + return ret; + + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto fail; + } + /* allow sparse sets */ + if (!fds || fd == -1) { + ret = -EINVAL; + if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + goto fail; + continue; + } + + file = fget(fd); + ret = -EBADF; + if (unlikely(!file)) + goto fail; + + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. 
+ */ + if (io_is_uring_fops(file)) { + fput(file); + goto fail; + } + ret = io_scm_file_account(ctx, file); + if (ret) { + fput(file); + goto fail; + } + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + + io_rsrc_node_switch(ctx, NULL); + return 0; +fail: + __io_sqe_files_unregister(ctx); + return ret; +} + +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + io_buffer_unmap(ctx, &prsrc->buf); + prsrc->buf = NULL; +} + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned int i; + + for (i = 0; i < ctx->nr_user_bufs; i++) + io_buffer_unmap(ctx, &ctx->user_bufs[i]); + kfree(ctx->user_bufs); + io_rsrc_data_free(ctx->buf_data); + ctx->user_bufs = NULL; + ctx->buf_data = NULL; + ctx->nr_user_bufs = 0; +} + +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_bufs; + int ret; + + if (!ctx->buf_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. + */ + ctx->nr_user_bufs = 0; + ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); + ctx->nr_user_bufs = nr; + if (!ret) + __io_sqe_buffers_unregister(ctx); + return ret; +} + +/* + * Not super efficient, but this is just a registration time. And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. + * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. This allows us to account the full size of the + * page, not just the constituent pages of a huge page. + */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + imu->acct_pages = 0; + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages); + if (ret) + imu->acct_pages = 0; + return ret; +} + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, pret, ret = -ENOMEM; + + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto done; + + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + goto done; + + ret = 
0; + mmap_read_lock(current->mm); + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) + continue; + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + *npages = nr_pages; + } else { + ret = pret < 0 ? pret : -EFAULT; + } + mmap_read_unlock(current->mm); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) + unpin_user_pages(pages, pret); + goto done; + } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); + if (ret) { + unpin_user_pages(pages, nr_pages); + goto done; + } + + off = (unsigned long) iov->iov_base & ~PAGE_MASK; + size = iov->iov_len; + for (i = 0; i < nr_pages; i++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[i].bv_page = pages[i]; + imu->bvec[i].bv_len = vec_len; + imu->bvec[i].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->nr_bvecs = nr_pages; + *pimu = imu; + ret = 0; +done: + if (ret) + kvfree(imu); + kvfree(pages); + return ret; +} + +static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) +{ + ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); + return ctx->user_bufs ? 
0 : -ENOMEM; +} + +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags) +{ + struct page *last_hpage = NULL; + struct io_rsrc_data *data; + int i, ret; + struct iovec iov; + + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + if (ret) + return ret; + ret = io_buffers_map_alloc(ctx, nr_args); + if (ret) { + io_rsrc_data_free(data); + return ret; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + + if (!iov.iov_base && *io_get_tag_slot(data, i)) { + ret = -EINVAL; + break; + } + + ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + &last_hpage); + if (ret) + break; + } + + WARN_ON_ONCE(ctx->buf_data); + + ctx->buf_data = data; + if (ret) + __io_sqe_buffers_unregister(ctx); + else + io_rsrc_node_switch(ctx, NULL); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h new file mode 100644 index 00000000000000..872c86312cbc2d --- /dev/null +++ b/io_uring/rsrc.h @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_RSRC_H +#define IOU_RSRC_H + +#include + +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) +#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) +#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) + +enum { + IORING_RSRC_FILE = 0, + IORING_RSRC_BUFFER = 1, +}; + +struct io_rsrc_put { + struct list_head list; + u64 tag; + union { + void *rsrc; + struct file *file; + struct io_mapped_ubuf *buf; + }; +}; + +typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); + +struct io_rsrc_data { + struct io_ring_ctx *ctx; + + u64 **tags; + unsigned int nr; + rsrc_put_fn *do_put; + atomic_t refs; + struct completion done; + bool quiesce; +}; + +struct io_rsrc_node { + struct percpu_ref refs; + struct list_head node; + struct list_head rsrc_list; + struct io_rsrc_data *rsrc_data; + struct llist_node llist; + bool done; +}; + +void io_rsrc_put_work(struct work_struct *work); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_wait_rsrc_data(struct io_rsrc_data *data); +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_refs_drop(struct io_ring_ctx *ctx); +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); + + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags); +void __io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags); + +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); + +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} 
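For readers mapping this to userspace: io_sqe_buffers_register() above is the kernel side of IORING_REGISTER_BUFFERS. A minimal liburing sketch (editor's illustration, not part of this patch; assumes liburing is available) of driving it:

#include <liburing.h>
#include <sys/uio.h>

/*
 * Register two fixed buffers; afterwards they are usable with
 * IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED via their buf_index.
 */
static int register_two_buffers(struct io_uring *ring)
{
        static char buf0[4096], buf1[8192];
        struct iovec iovs[2] = {
                { .iov_base = buf0, .iov_len = sizeof(buf0) },
                { .iov_base = buf1, .iov_len = sizeof(buf1) },
        };

        /* pins the pages and charges them against RLIMIT_MEMLOCK */
        return io_uring_register_buffers(ring, iovs, 2);
}

io_uring_register_buffers() returns 0 on success or a negative errno, matching the return convention of the kernel-side registration above.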
+#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static inline int io_scm_file_account(struct io_ring_ctx *ctx, + struct file *file) +{ + if (likely(!io_file_need_scm(file))) + return 0; + return __io_scm_file_account(ctx, file); +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args); +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type); +int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type); + +static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + +static inline void io_req_put_rsrc(struct io_kiocb *req) +{ + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); +} + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_rsrc_node *node = req->rsrc_node; + + if (node) { + if (node == ctx->rsrc_node) + ctx->rsrc_cached_refs++; + else + io_rsrc_put_node(node, 1); + } +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned int issue_flags) +{ + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; + + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + lockdep_assert_held(&ctx->uring_lock); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); + } else { + percpu_ref_get(&req->rsrc_node->refs); + } + } +} + +static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +{ + unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; + unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + + return &data->tags[table_idx][off]; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags); +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +#endif From c98817e6cd4471a6f6283813dd6efea162f5ac5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 May 2022 09:44:31 -0600 Subject: [PATCH 053/172] io_uring: move remaining file table manipulation to filetable.c Signed-off-by: Jens Axboe --- io_uring/filetable.c | 87 +++++++++++++++++++++++++++++++++++++++++++- io_uring/filetable.h | 5 ++- io_uring/io_uring.c | 82 ----------------------------------------- io_uring/io_uring.h | 4 -- 4 files changed, 90 insertions(+), 88 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 560629a93c04b0..e449ceb9a848e6 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -4,14 +4,17 @@ #include #include #include +#include #include #include #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" -int io_file_bitmap_get(struct io_ring_ctx *ctx) +static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; unsigned long nr = ctx->nr_user_files; @@ -55,3 +58,85 @@ void io_free_file_tables(struct io_file_table *table) table->files = NULL; table->bitmap = NULL; } + +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; + + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; + + slot_index = array_index_nospec(slot_index, 
ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + + if (file_slot->file_ptr) { + struct file *old_file; + + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; + + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; + } + + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; +} + +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + + if (alloc_slot) { + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) + goto err; + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (!ret && alloc_slot) + ret = file_slot; +err: + io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) + fput(file); + return ret; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6e1675f406b729..c404360f709053 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,6 +3,7 @@ #define IOU_FILE_TABLE_H struct io_ring_ctx; +struct io_kiocb; /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -34,7 +35,9 @@ struct io_file_table { bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); -int io_file_bitmap_get(struct io_ring_ctx *ctx); + +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0c47c919887f52..c0f1f79933ac2a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2699,38 +2699,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. - */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) -{ - bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - - if (alloc_slot) { - ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) - goto err; - file_slot = ret; - } else { - file_slot--; - } - - ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (!ret && alloc_slot) - ret = file_slot; -err: - io_ring_submit_unlock(ctx, issue_flags); - if (unlikely(ret < 0)) - fput(file); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -3579,56 +3547,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - static void io_mem_free(void *ptr) { struct page *page; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 090c17deba9db2..71afb46070e364 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -106,10 +106,6 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot); -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); From f3b44f92e59a804cf375479bda0bccbf4b6e6ef6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:27:03 -0600 Subject: [PATCH 054/172] io_uring: move read/write related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1239 +------------------------------------------ io_uring/io_uring.h | 131 +++++ io_uring/rw.c | 1099 ++++++++++++++++++++++++++++++++++++++ io_uring/rw.h | 23 + 5 files changed, 1263 insertions(+), 1231 deletions(-) create mode 100644 io_uring/rw.c create mode 100644 io_uring/rw.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 360a83039c2a4f..d70deed65a0bb2 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o + cancel.o kbuf.o rsrc.o rw.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c0f1f79933ac2a..0af61a6c29cfee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -67,13 +65,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include #include @@ -110,6 +105,7 @@ #include "timeout.h" #include "poll.h" 
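A usage sketch for the slot-allocation path consolidated in filetable.c above (editor's illustration, not part of this series; assumes liburing 2.2+ and a previously registered, possibly sparse, file table): a direct-descriptor accept with IORING_FILE_INDEX_ALLOC exercises io_file_bitmap_get() followed by io_install_fixed_file().

#include <liburing.h>

/* accept into a kernel-chosen fixed-file slot; needs a registered file
 * table first, e.g. io_uring_register_files_sparse(ring, 16) */
static int accept_into_free_slot(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        if (!sqe)
                return -EBUSY;
        io_uring_prep_accept_direct(sqe, listen_fd, NULL, NULL, 0,
                                    IORING_FILE_INDEX_ALLOC);
        io_uring_submit(ring);

        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret)
                return ret;
        ret = cqe->res;         /* on success: the allocated slot index */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}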
#include "cancel.h" +#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -136,31 +132,6 @@ #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -/* - * First field must be the file pointer in all the - * iocb unions! See also 'struct kiocb' in - */ -struct io_rw { - /* NOTE: kiocb has the file as the first member, so don't do it here */ - struct kiocb kiocb; - u64 addr; - u32 len; - rwf_t flags; -}; - -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - -struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; - struct wait_page_queue wpq; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -184,9 +155,7 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static int io_req_prep_async(struct io_kiocb *req); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -393,11 +362,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -489,7 +453,7 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -532,7 +496,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { spin_lock(&ctx->completion_lock); @@ -547,60 +511,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) -{ - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - if (ctx->flags & IORING_SETUP_CQE32) { - 
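The CQE lookup being removed from io_uring.c here is compact enough to restate standalone. An editor's sketch with hypothetical names (not kernel code), assuming cq_entries is a power of two, as ring setup guarantees:

struct io_uring_cqe;

/* restates the indexing math of __io_get_cqe()/io_get_cqe() */
static inline struct io_uring_cqe *cqe_slot(struct io_uring_cqe *cqes,
                                            unsigned int tail,
                                            unsigned int cq_entries,
                                            int big_cqe)
{
        unsigned int off = tail & (cq_entries - 1);   /* power-of-2 wrap */
        unsigned int shift = big_cqe ? 1 : 0;         /* CQE32: two slots */

        return &cqes[off << shift];
}

With IORING_SETUP_CQE32 each logical completion occupies two struct io_uring_cqe entries, which is why the masked offset is shifted into a physical slot index rather than the mask itself being widened.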
unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - return cqe; - } - - return __io_get_cqe(ctx); -} - static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; @@ -628,17 +538,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) rcu_read_unlock(); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. - */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); -} - /* * This should only get called when at least one event has been posted. * Some applications rely on the eventfd notification count only changing @@ -655,16 +554,6 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -775,9 +664,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, - u64 extra2) +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -814,59 +702,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - 0, 0); - } else { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - extra1, extra2); - } -} - bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -1269,7 +1104,7 @@ void io_req_task_work_add(struct io_kiocb *req) __io_req_task_work_add(req, tctx, &tctx->task_list); } -static void io_req_task_prio_work_add(struct io_kiocb *req) +void io_req_task_prio_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -1315,18 +1150,12 @@ void io_req_task_queue_fail(struct io_kiocb *req, int ret) io_req_task_work_add(req); } -static void io_req_task_queue(struct io_kiocb *req) +void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -1335,8 +1164,7 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_batch_list(struct io_ring_ctx *ctx, - struct io_wq_work_node *node) +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { struct task_struct *task = NULL; @@ -1435,76 +1263,6 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) -{ - struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; - DEFINE_IO_COMP_BATCH(iob); - int nr_events = 0; - - /* - * Only spin for completions if we don't have multiple devices hanging - * off our complete list. - */ - if (ctx->poll_multi_queue || force_nonspin) - poll_flags |= BLK_POLL_ONESHOT; - - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct io_rw *rw = io_kiocb_to_cmd(req); - int ret; - - /* - * Move completed and retryable entries to our local lists. - * If we find a request that requires polling, break out - * and complete those lists first, if we have entries there. - */ - if (READ_ONCE(req->iopoll_completed)) - break; - - ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); - if (unlikely(ret < 0)) - return ret; - else if (ret) - poll_flags |= BLK_POLL_ONESHOT; - - /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || - READ_ONCE(req->iopoll_completed)) - break; - } - - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - - /* order with io_complete_rw_iopoll(), e.g. ->result updates */ - if (!smp_load_acquire(&req->iopoll_completed)) - break; - nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - - req->cqe.flags = io_put_kbuf(req, 0); - __io_fill_cqe_req(req->ctx, req); - } - - if (unlikely(!nr_events)) - return 0; - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - pos = start ? 
start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); - return nr_events; -} - /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. @@ -1589,90 +1347,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } - -static void kiocb_end_write(struct io_kiocb *req) -{ - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; - - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); - } -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; -} - -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - umode_t mode = file_inode(req->file)->i_mode; - struct io_ring_ctx *ctx = req->ctx; - - if (!S_ISBLK(mode) && !S_ISREG(mode)) - return false; - if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) - return false; - /* - * If ref is dying, we might be running poll reap from the exit work. - * Don't attempt to reissue from that path, just let it fail with - * -EAGAIN. - */ - if (percpu_ref_is_dying(&ctx->refs)) - return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; - return true; -} -#else -static bool io_resubmit_prep(struct io_kiocb *req) -{ - return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} -#endif - -static bool __io_complete_rw_common(struct io_kiocb *req, long res) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return true; - } - req_set_fail(req); - req->cqe.res = res; - } - return false; -} - inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { @@ -1685,46 +1359,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked) } } -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); - __io_req_complete(req, issue_flags); -} - -static void io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, res, 0); - req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); -} - -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return; - } - 
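The smp_store_release() that ends this function pairs with the smp_load_acquire() on ->iopoll_completed in io_do_iopoll() above: the acquire/release pair guarantees the result fields are visible once the flag reads as set. An editor's illustration of the same contract in portable C11 (hypothetical names, not kernel code):

#include <stdatomic.h>

struct completion {
        long result;
        atomic_int done;        /* starts at 0 */
};

/* completion side, cf. io_complete_rw_iopoll() */
static void publish(struct completion *c, long res)
{
        c->result = res;
        atomic_store_explicit(&c->done, 1, memory_order_release);
}

/* reaping side, cf. the smp_load_acquire() in io_do_iopoll() */
static int try_reap(struct completion *c, long *res)
{
        if (!atomic_load_explicit(&c->done, memory_order_acquire))
                return 0;
        *res = c->result;       /* ordered after the acquire load */
        return 1;
}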
req->cqe.res = res; - } - - /* order with io_iopoll_complete() checking ->iopoll_completed */ - smp_store_release(&req->iopoll_completed, 1); -} - /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't @@ -1833,426 +1467,6 @@ unsigned int io_file_get_flags(struct file *file) return res; } -static inline bool io_file_supports_nowait(struct io_kiocb *req) -{ - return req->flags & REQ_F_SUPPORT_NOWAIT; -} - -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - unsigned ioprio; - int ret; - - rw->kiocb.ki_pos = READ_ONCE(sqe->off); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); - - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } - - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - rw->kiocb.ki_ioprio = ioprio; - } else { - rw->kiocb.ki_ioprio = get_current_ioprio(); - } - - rw->addr = READ_ONCE(sqe->addr); - rw->len = READ_ONCE(sqe->len); - rw->flags = READ_ONCE(sqe->rw_flags); - return 0; -} - -static void io_readv_writev_cleanup(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); -} - -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. 
- */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - -static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_pos != -1) - return &rw->kiocb.ki_pos; - - if (!(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - rw->kiocb.ki_pos = req->file->f_pos; - return &rw->kiocb.ki_pos; - } - - rw->kiocb.ki_pos = 0; - return NULL; -} - -static void kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) -{ - struct io_async_rw *io = req->async_data; - struct io_rw *rw = io_kiocb_to_cmd(req); - - /* add previously done IO, if any */ - if (req_has_async_data(req) && io->bytes_done > 0) { - if (ret < 0) - ret = io->bytes_done; - else - ret += io->bytes_done; - } - - if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else - io_rw_done(&rw->kiocb, ret); - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, ret); - } -} - -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; - size_t offset; - - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. 
- */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - -#ifdef CONFIG_COMPAT -static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_ssize_t clen; - void __user *buf; - size_t len; - - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - - len = clen; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = iov[0].iov_len = (compat_size_t) len; - return 0; -} -#endif - -static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iovec __user *uiov = u64_to_user_ptr(rw->addr); - void __user *buf; - ssize_t len; - - if (copy_from_user(iov, uiov, sizeof(*uiov))) - return -EFAULT; - - len = iov[0].iov_len; - if (len < 0) - return -EINVAL; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = iov[0].iov_len = len; - return 0; -} - -static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(rw->addr); - iov[0].iov_len = rw->len; - return 0; - } - if (rw->len != 1) - return -EINVAL; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return io_compat_import(req, iov, issue_flags); -#endif - - return __io_iov_buffer_select(req, iov, issue_flags); -} - -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, iovec, issue_flags); - if (ret) - return ERR_PTR(ret); - iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); - return NULL; - } 
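To make the fast-forward arithmetic above concrete (editor's sketch, hypothetical names, assuming 4 KiB pages): only the first and last bvec of a registered buffer can be short, so the skip count is computable in O(1) instead of walking segments.

#include <assert.h>

#define PG_SHIFT        12
#define PG_SIZE         (1ul << PG_SHIFT)

/* mirrors the else-branch of __io_import_fixed() above */
static unsigned long fixed_seg_skip(unsigned long offset,
                                    unsigned long first_len,
                                    unsigned long *resid)
{
        assert(offset > first_len);     /* else iov_iter_advance() is used */
        offset -= first_len;            /* consume the short first bvec */
        *resid = offset & (PG_SIZE - 1);
        return 1 + (offset >> PG_SHIFT);
}

For example, first_len = 2048 and offset = 10240 gives a skip of 3 segments with resid = 0: bvec 0 (2048 bytes) plus two full pages are stepped over in one go, landing exactly on bvec[3].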
- - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - -static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) -{ - return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; -} - -/* - * For files that don't have ->read_iter() and ->write_iter(), handle them - * by looping over ->read() or ->write() manually. - */ -static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) -{ - struct kiocb *kiocb = &rw->kiocb; - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; - loff_t *ppos; - - /* - * Don't support polled IO through this interface, and we can't - * support non-blocking either. For the latter, this just causes - * the kiocb to be handled from an async context. - */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EOPNOTSUPP; - if ((kiocb->ki_flags & IOCB_NOWAIT) && - !(kiocb->ki_filp->f_flags & O_NONBLOCK)) - return -EAGAIN; - - ppos = io_kiocb_ppos(kiocb); - - while (iov_iter_count(iter)) { - struct iovec iovec; - ssize_t nr; - - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); - } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; - } - - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (!iov_iter_is_bvec(iter)) { - iov_iter_advance(iter, nr); - } else { - rw->addr += nr; - rw->len -= nr; - if (!rw->len) - break; - } - if (nr != iovec.iov_len) - break; - } - - return ret; -} - -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); @@ -2264,448 +1478,13 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_op_defs[req->opcode].prep_async) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct 
io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, READ); -} - -static int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, WRITE); -} - -/* - * This is our waitqueue callback handler, registered through __folio_lock_async() - * when we initially tried to do the IO with the iocb armed our waitqueue. - * This gets called when the page is unlocked, and we generally expect that to - * happen when the page IO is completed and the page is now uptodate. This will - * queue a task_work based retry of the operation, attempting to copy the data - * again. If the latter fails because the page was NOT uptodate, then we will - * do a thread based blocking retry of the operation. That's the unexpected - * slow path. - */ -static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, - int sync, void *arg) -{ - struct wait_page_queue *wpq; - struct io_kiocb *req = wait->private; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct wait_page_key *key = arg; - - wpq = container_of(wait, struct wait_page_queue, wait); - - if (!wake_page_match(wpq, key)) - return 0; - - rw->kiocb.ki_flags &= ~IOCB_WAITQ; - list_del_init(&wait->entry); - io_req_task_queue(req); - return 1; -} - -/* - * This controls whether a given IO request should be armed for async page - * based retry. If we return false here, the request is handed to the async - * worker threads for retry. If we're doing buffered reads on a regular file, - * we prepare a private wait_page_queue entry and retry the operation. This - * will either succeed because the page is now uptodate and unlocked, or it - * will register a callback when the page is unlocked at IO completion. Through - * that callback, io_uring uses task_work to setup a retry of the operation. - * That retry will attempt the buffered read again. The retry will generally - * succeed, or in rare cases where it fails, we then fall back to using the - * async worker threads for a blocking retry. 
- */ -static bool io_rw_should_retry(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - struct wait_page_queue *wait = &io->wpq; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) - return false; - - /* Only for buffered IO */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) - return false; - - /* - * just use poll if we can, and don't attempt if the fs doesn't - * support callback based unlocks - */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) - return false; - - wait->wait.func = io_async_buf_func; - wait->wait.private = req; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_flags &= ~IOCB_NOWAIT; - kiocb->ki_waitq = wait; - return true; -} - -static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) -{ - struct file *file = rw->kiocb.ki_filp; - - if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); - else if (file->f_op->read) - return loop_rw_iter(READ, rw, iter); - else - return -EINVAL; -} - -static bool need_read_all(struct io_kiocb *req) -{ - return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); -} - -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (unlikely(!file || !(file->f_mode & mode))) - return -EBADF; - - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, rw->flags); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->private = NULL; - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } - - return 0; -} - -static int io_read(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. 
- */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - - ret = io_iter_do_read(rw, &s->iter); - - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) - return -EAGAIN; - /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (req->flags & REQ_F_NOWAIT) - goto done; - ret = 0; - } else if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { - /* read all, failed, already did sync or don't want to retry */ - goto done; - } - - /* - * Don't depend on the iter state matching what was consumed, or being - * untouched in case of error. Restore it and we'll advance it - * manually if we need to. - */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - if (ret2) - return ret2; - - iovec = NULL; - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ - - do { - /* - * We end up here because of a partial read, either from - * above or inside this loop. Advance the iter by the bytes - * that were consumed. - */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) - break; - io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); - - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If - * we get -EIOCBQUEUED, then we'll get a notification when the - * desired page gets unlocked. We can also get a partial read - * here, and if we do, then just retry at the new offset. - */ - ret = io_iter_do_read(rw, &s->iter); - if (ret == -EIOCBQUEUED) - return IOU_ISSUE_SKIP_COMPLETE; - /* we got some bytes, but not all. retry. 
*/ - kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); - } while (ret > 0); -done: - kiocb_done(req, ret, issue_flags); -out_free: - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_write(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; - - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; - - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); - else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); - else - ret2 = -EINVAL; - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. 
- */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) - goto done; - if (!force_nonblock || ret2 != -EAGAIN) { - /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; -done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; - } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; - } -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { return -EOPNOTSUPP; } -static int io_req_prep_async(struct io_kiocb *req) +int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 71afb46070e364..22e6e52c42d261 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,11 +5,125 @@ #include #include "io_uring_types.h" +#ifndef CREATE_TRACE_POINTS +#include +#endif + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2); + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); +} + +static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + if (!(ctx->flags & IORING_SETUP_CQE32)) { + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
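+	 * Overflowed entries are stashed on ctx->cq_overflow_list and
+	 * flushed back into the ring once userspace frees up room.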
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + 0, 0); + } else { + u64 extra1 = 0, extra2 = 0; + + if (req->flags & REQ_F_CQE32_INIT) { + extra1 = req->extra1; + extra2 = req->extra2; + } + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + extra1, extra2); + } +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -64,6 +178,17 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void io_cqring_wake(struct io_ring_ctx *ctx) +{ + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -100,6 +225,7 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -110,7 +236,10 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_queue(struct io_kiocb *req); +void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); @@ -122,6 +251,8 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); diff --git a/io_uring/rw.c b/io_uring/rw.c new file mode 100644 index 00000000000000..f0b60199ee2279 --- /dev/null +++ b/io_uring/rw.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" +#include "rsrc.h" +#include "rw.h" + +struct io_rw { + /* NOTE: kiocb has the file as the first member, 
so don't do it here */ + struct kiocb kiocb; + u64 addr; + u32 len; + rwf_t flags; +}; + +static inline bool io_file_supports_nowait(struct io_kiocb *req) +{ + return req->flags & REQ_F_SUPPORT_NOWAIT; +} + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + unsigned ioprio; + int ret; + + rw->kiocb.ki_pos = READ_ONCE(sqe->off); + /* used for fixed read/write too - just read unconditionally */ + req->buf_index = READ_ONCE(sqe->buf_index); + + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + rw->kiocb.ki_ioprio = ioprio; + } else { + rw->kiocb.ki_ioprio = get_current_ioprio(); + } + + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); + return 0; +} + +void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + kiocb->ki_complete(kiocb, ret); + } +} + +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; + } + + rw->kiocb.ki_pos = 0; + return NULL; +} + +static void io_req_task_queue_reissue(struct io_kiocb *req) +{ + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + if (!req_has_async_data(req)) + return !io_req_prep_async(req); + iov_iter_restore(&io->s.iter, &io->s.iter_state); + return true; +} + +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + umode_t mode = file_inode(req->file)->i_mode; + struct io_ring_ctx *ctx = req->ctx; + + if (!S_ISBLK(mode) && !S_ISREG(mode)) + return false; + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && + !(ctx->flags & IORING_SETUP_IOPOLL))) + return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. + */ + if (percpu_ref_is_dying(&ctx->refs)) + return false; + /* + * Play it safe and assume not safe to re-import and reissue if we're + * not in the original thread group (or in task context). 
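+	 * Re-import dereferences user memory, which is only valid from a
+	 * task that shares the submitter's mm.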
+ */ + if (!same_thread_group(req->task, current) || !in_task()) + return false; + return true; +} +#else +static bool io_resubmit_prep(struct io_kiocb *req) +{ + return false; +} +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + return false; +} +#endif + +static void kiocb_end_write(struct io_kiocb *req) +{ + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct super_block *sb = file_inode(req->file)->i_sb; + + __sb_writers_acquired(sb, SB_FREEZE_WRITE); + sb_end_write(sb); + } +} + +static bool __io_complete_rw_common(struct io_kiocb *req, long res) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { + kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } + if (unlikely(res != req->cqe.res)) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return true; + } + req_set_fail(req); + req->cqe.res = res; + } + return false; +} + +static void __io_complete_rw(struct io_kiocb *req, long res, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); +} + +static void io_complete_rw(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_prio_work_add(req); +} + +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); + if (unlikely(res != req->cqe.res)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return; + } + req->cqe.res = res; + } + + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); +} + +static void kiocb_done(struct io_kiocb *req, ssize_t ret, + unsigned int issue_flags) +{ + struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); + + /* add previously done IO, if any */ + if (req_has_async_data(req) && io->bytes_done > 0) { + if (ret < 0) + ret = io->bytes_done; + else + ret += io->bytes_done; + } + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) + __io_complete_rw(req, ret, issue_flags); + else + io_rw_done(&rw->kiocb, ret); + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (io_resubmit_prep(req)) + io_req_task_queue_reissue(req); + else + io_req_task_queue_fail(req, ret); + } +} + +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; + size_t offset; + + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. 
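+	 * (buf_addr may point anywhere inside the registered buffer, so
+	 * the offset computed below is its distance from imu->ubuf.)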
+ */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} + +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + unsigned int issue_flags) +{ + if (WARN_ON_ONCE(!req->imu)) + return -EFAULT; + return __io_import_fixed(req, rw, iter, req->imu); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + size_t len; + + uiov = u64_to_user_ptr(rw->addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; + return 0; + } + if (rw->len != 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, issue_flags); +#endif + + return __io_iov_buffer_select(req, iov, issue_flags); +} + +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iov_iter *iter = &s->iter; + u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; + ssize_t ret; + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { + ret = 
io_import_fixed(req, ddir, iter, issue_flags); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + iovec = s->fast_iov; + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, iovec, issue_flags); + if (ret) + return ERR_PTR(ret); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); + return NULL; + } + + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; +} + +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) +{ + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; + ssize_t ret = 0; + loff_t *ppos; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. 
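+	 * In other words, the -EAGAIN below punts the request to io-wq,
+	 * where blocking in ->read()/->write() is acceptable.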
+ */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ppos = io_kiocb_ppos(kiocb); + + while (iov_iter_count(iter)) { + struct iovec iovec; + ssize_t nr; + + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; + } + + if (ddir == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, ppos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, ppos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (!iov_iter_is_bvec(iter)) { + iov_iter_advance(iter, nr); + } else { + rw->addr += nr; + rw->len -= nr; + if (!rw->len) + break; + } + if (nr != iovec.iov_len) + break; + } + + return ret; +} + +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) +{ + struct io_async_rw *io = req->async_data; + + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; + /* can only be fixed buffers, no need to do anything */ + if (iov_iter_is_bvec(iter)) + return; + if (!iovec) { + unsigned iov_off = 0; + + io->s.iter.iov = io->s.fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + io->s.iter.iov += iov_off; + } + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, + sizeof(struct iovec) * iter->nr_segs); + } else { + req->flags |= REQ_F_NEED_CLEANUP; + } +} + +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + struct io_rw_state *s, bool force) +{ + if (!force && !io_op_defs[req->opcode].prep_async) + return 0; + if (!req_has_async_data(req)) { + struct io_async_rw *iorw; + + if (io_alloc_async_data(req)) { + kfree(iovec); + return -ENOMEM; + } + + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); + iorw = req->async_data; + /* we've copied and mapped the iter, ensure state is saved */ + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); + } + return 0; +} + +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) +{ + struct io_async_rw *iorw = req->async_data; + struct iovec *iov; + int ret; + + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); + if (unlikely(ret < 0)) + return ret; + + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_readv_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, READ); +} + +int io_writev_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, WRITE); +} + +/* + * This is our waitqueue callback handler, registered through __folio_lock_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. 
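+ * (The retry is queued with io_req_task_queue() below, so the copy
+ * runs from the submitting task's context.)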
+ */ +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct wait_page_key *key = arg; + + wpq = container_of(wait, struct wait_page_queue, wait); + + if (!wake_page_match(wpq, key)) + return 0; + + rw->kiocb.ki_flags &= ~IOCB_WAITQ; + list_del_init(&wait->entry); + io_req_task_queue(req); + return 1; +} + +/* + * This controls whether a given IO request should be armed for async page + * based retry. If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. + */ +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* Only for buffered IO */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) + return false; + + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_waitq = wait; + return true; +} + +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) +{ + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); + else + return -EINVAL; +} + +static bool need_read_all(struct io_kiocb *req) +{ + return req->flags & REQ_F_ISREG || + S_ISBLK(file_inode(req->file)->i_mode); +} + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return req->flags & REQ_F_FIXED_FILE; +} + +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, rw->flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 
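+	 * (REQ_F_NOWAIT set here means a later -EAGAIN is surfaced to
+	 * userspace rather than turned into an io-wq punt.)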
+ */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->private = NULL; + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + +int io_read(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_async_rw *io; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + io = req->async_data; + s = &io->s; + + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. + */ + if (io_do_buffer_select(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + + /* + * We come here from an earlier attempt, restore our state to + * match in case it doesn't. It's cheap enough that we don't + * need to make this conditional. + */ + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + + ret = io_iter_do_read(rw, &s->iter); + + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + goto done; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (req->flags & REQ_F_NOWAIT) + goto done; + ret = 0; + } else if (ret == -EIOCBQUEUED) { + goto out_free; + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || + (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { + /* read all, failed, already did sync or don't want to retry */ + goto done; + } + + /* + * Don't depend on the iter state matching what was consumed, or being + * untouched in case of error. Restore it and we'll advance it + * manually if we need to. + */ + iov_iter_restore(&s->iter, &s->iter_state); + + ret2 = io_setup_async_rw(req, iovec, s, true); + if (ret2) + return ret2; + + iovec = NULL; + io = req->async_data; + s = &io->s; + /* + * Now use our persistent iterator and state, if we aren't already. + * We've restored and mapped the iter to match. + */ + + do { + /* + * We end up here because of a partial read, either from + * above or inside this loop. 
Advance the iter by the bytes + * that were consumed. + */ + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) + break; + io->bytes_done += ret; + iov_iter_save_state(&s->iter, &s->iter_state); + + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; + } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If + * we get -EIOCBQUEUED, then we'll get a notification when the + * desired page gets unlocked. We can also get a partial read + * here, and if we do, then just retry at the new offset. + */ + ret = io_iter_do_read(rw, &s->iter); + if (ret == -EIOCBQUEUED) + return IOU_ISSUE_SKIP_COMPLETE; + /* we got some bytes, but not all. retry. */ + kiocb->ki_flags &= ~IOCB_WAITQ; + iov_iter_restore(&s->iter, &s->iter_state); + } while (ret > 0); +done: + kiocb_done(req, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_write(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + struct io_async_rw *io = req->async_data; + + s = &io->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; + + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; + + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); + if (unlikely(ret)) + goto out_free; + + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + sb_start_write(file_inode(req->file)->i_sb); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); + else if (req->file->f_op->write) + ret2 = loop_rw_iter(WRITE, rw, &s->iter); + else + ret2 = -EINVAL; + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } + + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. 
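+	 * (The block layer fails REQ_NOWAIT bios it cannot honor with
+	 * -EOPNOTSUPP instead of -EAGAIN.)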
+ */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) + goto done; + if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + goto copy_iov; +done: + kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; + } else { +copy_iov: + iov_iter_restore(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); + return ret ?: -EAGAIN; + } +out_free: + /* it's reportedly faster than delegating the null check to kfree() */ + if (iovec) + kfree(iovec); + return ret; +} + +static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); +} + +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +{ + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list. + */ + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; + + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + struct io_rw *rw = io_kiocb_to_cmd(req); + int ret; + + /* + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. + */ + if (READ_ONCE(req->iopoll_completed)) + break; + + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); + if (unlikely(ret < 0)) + return ret; + else if (ret) + poll_flags |= BLK_POLL_ONESHOT; + + /* iopoll may have completed current req */ + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; + } + + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; + + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + nr_events++; + if (unlikely(req->flags & REQ_F_CQE_SKIP)) + continue; + + req->cqe.flags = io_put_kbuf(req, 0); + __io_fill_cqe_req(req->ctx, req); + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? 
start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; +} diff --git a/io_uring/rw.h b/io_uring/rw.h new file mode 100644 index 00000000000000..0204c3fcafa517 --- /dev/null +++ b/io_uring/rw.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +struct io_rw_state { + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; + size_t bytes_done; + struct wait_page_queue wpq; +}; + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_read(struct io_kiocb *req, unsigned int issue_flags); +int io_readv_prep_async(struct io_kiocb *req); +int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_writev_prep_async(struct io_kiocb *req); +void io_readv_writev_cleanup(struct io_kiocb *req); From d9b57aa3cfc792ccac6858376c017dbea6cb2872 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jun 2022 16:27:42 -0600 Subject: [PATCH 055/172] io_uring: move opcode table to opdef.c We already have the declarations in opdef.h, move the rest into its own file rather than in the main io_uring.c file. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 469 +----------------------------------- io_uring/io_uring_types.h | 2 + io_uring/opdef.c | 495 ++++++++++++++++++++++++++++++++++++++ io_uring/opdef.h | 2 + 5 files changed, 501 insertions(+), 469 deletions(-) create mode 100644 io_uring/opdef.c diff --git a/io_uring/Makefile b/io_uring/Makefile index d70deed65a0bb2..466639c289be7f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o + cancel.o kbuf.o rsrc.o rw.o opdef.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0af61a6c29cfee..c703190986270b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,22 +90,8 @@ #include "kbuf.h" #include "rsrc.h" -#include "xattr.h" -#include "nop.h" -#include "fs.h" -#include "splice.h" -#include "sync.h" -#include "advise.h" -#include "openclose.h" -#include "uring_cmd.h" -#include "epoll.h" -#include "statx.h" -#include "net.h" -#include "msg_ring.h" #include "timeout.h" #include "poll.h" -#include "cancel.h" -#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -161,13 +147,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -const char *io_uring_get_opcode(u8 opcode) -{ - if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; - return "INVALID"; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1478,12 +1457,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -3909,442 +3882,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } -static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) -{ - WARN_ON_ONCE(1); - return -ECANCELED; -} - -const struct io_op_def io_op_defs[] = 
{ - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - .name = "NOP", - .prep = io_nop_prep, - .issue = io_nop, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", - .prep = io_prep_rw, - .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", - .prep = io_prep_rw, - .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FSYNC", - .prep = io_fsync_prep, - .issue = io_fsync, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "POLL_ADD", - .prep = io_poll_add_prep, - .issue = io_poll_add, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - .name = "POLL_REMOVE", - .prep = io_poll_remove_prep, - .issue = io_poll_remove, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "SYNC_FILE_RANGE", - .prep = io_sfr_prep, - .issue = io_sync_file_range, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .ioprio = 1, - .name = "SENDMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_sendmsg_prep, - .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .ioprio = 1, - .name = "RECVMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_recvmsg_prep, - .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", - .prep = io_timeout_prep, - .issue = io_timeout, - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - .name = "TIMEOUT_REMOVE", - .prep = io_timeout_remove_prep, - .issue = io_timeout_remove, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - .name = "ACCEPT", -#if defined(CONFIG_NET) - .prep = io_accept_prep, - .issue = io_accept, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - .name = "ASYNC_CANCEL", - .prep = io_async_cancel_prep, - .issue = 
io_async_cancel, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", - .prep = io_link_timeout_prep, - .issue = io_no_issue, - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep = io_connect_prep, - .issue = io_connect, - .prep_async = io_connect_prep_async, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - .name = "FALLOCATE", - .prep = io_fallocate_prep, - .issue = io_fallocate, - }, - [IORING_OP_OPENAT] = { - .name = "OPENAT", - .prep = io_openat_prep, - .issue = io_openat, - .cleanup = io_open_cleanup, - }, - [IORING_OP_CLOSE] = { - .name = "CLOSE", - .prep = io_close_prep, - .issue = io_close, - }, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - .name = "STATX", - .prep = io_statx_prep, - .issue = io_statx, - .cleanup = io_statx_cleanup, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FADVISE", - .prep = io_fadvise_prep, - .issue = io_fadvise, - }, - [IORING_OP_MADVISE] = { - .name = "MADVISE", - .prep = io_madvise_prep, - .issue = io_madvise, - }, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "SEND", -#if defined(CONFIG_NET) - .prep = io_sendmsg_prep, - .issue = io_send, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "RECV", -#if defined(CONFIG_NET) - .prep = io_recvmsg_prep, - .issue = io_recv, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_OPENAT2] = { - .name = "OPENAT2", - .prep = io_openat2_prep, - .issue = io_openat2, - .cleanup = io_open_cleanup, - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "EPOLL", -#if defined(CONFIG_EPOLL) - .prep = io_epoll_ctl_prep, - .issue = io_epoll_ctl, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "SPLICE", - .prep = io_splice_prep, - .issue = io_splice, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "PROVIDE_BUFFERS", - .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "REMOVE_BUFFERS", - .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "TEE", - .prep = io_tee_prep, - 
.issue = io_tee, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - .name = "SHUTDOWN", -#if defined(CONFIG_NET) - .prep = io_shutdown_prep, - .issue = io_shutdown, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", - .prep = io_renameat_prep, - .issue = io_renameat, - .cleanup = io_renameat_cleanup, - }, - [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", - .prep = io_unlinkat_prep, - .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, - }, - [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", - .prep = io_mkdirat_prep, - .issue = io_mkdirat, - .cleanup = io_mkdirat_cleanup, - }, - [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", - .prep = io_symlinkat_prep, - .issue = io_symlinkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_LINKAT] = { - .name = "LINKAT", - .prep = io_linkat_prep, - .issue = io_linkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - .name = "MSG_RING", - .prep = io_msg_ring_prep, - .issue = io_msg_ring, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1, - .name = "FSETXATTR", - .prep = io_fsetxattr_prep, - .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SETXATTR] = { - .name = "SETXATTR", - .prep = io_setxattr_prep, - .issue = io_setxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_FGETXATTR] = { - .needs_file = 1, - .name = "FGETXATTR", - .prep = io_fgetxattr_prep, - .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_GETXATTR] = { - .name = "GETXATTR", - .prep = io_getxattr_prep, - .issue = io_getxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - .name = "SOCKET", -#if defined(CONFIG_NET) - .prep = io_socket_prep, - .issue = io_socket, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .name = "URING_CMD", - .async_size = uring_cmd_pdu_size(1), - .prep = io_uring_cmd_prep, - .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, - }, -}; - static int __init io_uring_init(void) { - int i; - #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -4400,17 +3939,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep != io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); - } + io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 147e1e597530a9..fb87af5fd8e751 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -3,6 +3,8 @@ #include #include +#include +#include #include "io-wq.h" #include "filetable.h" diff --git a/io_uring/opdef.c b/io_uring/opdef.c new file mode 100644 index 00000000000000..d687d33f9c0c03 --- /dev/null +++ b/io_uring/opdef.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_uring opcode handling table + */ +#include +#include +#include +#include +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include 
"opdef.h" +#include "refs.h" +#include "tctx.h" +#include "sqpoll.h" +#include "fdinfo.h" +#include "kbuf.h" +#include "rsrc.h" + +#include "xattr.h" +#include "nop.h" +#include "fs.h" +#include "splice.h" +#include "sync.h" +#include "advise.h" +#include "openclose.h" +#include "uring_cmd.h" +#include "epoll.h" +#include "statx.h" +#include "net.h" +#include "msg_ring.h" +#include "timeout.h" +#include "poll.h" +#include "cancel.h" +#include "rw.h" + +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .name = "NOP", + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep = io_prep_rw, + .issue = io_read, + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep = io_prep_rw, + .issue = io_write, + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FSYNC", + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "POLL_ADD", + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .name = "POLL_REMOVE", + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "SYNC_FILE_RANGE", + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .ioprio = 1, + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .ioprio = 1, + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_TIMEOUT] = { + 
.audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .name = "TIMEOUT_REMOVE", + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .name = "ACCEPT", +#if defined(CONFIG_NET) + .prep = io_accept_prep, + .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .name = "ASYNC_CANCEL", + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep = io_connect_prep, + .issue = io_connect, + .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .name = "FALLOCATE", + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .prep = io_openat_prep, + .issue = io_openat, + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .name = "FILES_UPDATE", + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .name = "STATX", + .prep = io_statx_prep, + .issue = io_statx, + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FADVISE", + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "SEND", +#if defined(CONFIG_NET) + .prep = io_sendmsg_prep, + .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "RECV", +#if defined(CONFIG_NET) + .prep = io_recvmsg_prep, + .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .prep = io_openat2_prep, + .issue = io_openat2, + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "EPOLL", +#if defined(CONFIG_EPOLL) + .prep = io_epoll_ctl_prep, + .issue = 
io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "SPLICE", + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "PROVIDE_BUFFERS", + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "REMOVE_BUFFERS", + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "TEE", + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .name = "SHUTDOWN", +#if defined(CONFIG_NET) + .prep = io_shutdown_prep, + .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .prep = io_renameat_prep, + .issue = io_renameat, + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .prep = io_linkat_prep, + .issue = io_linkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .name = "MSG_RING", + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .name = "FSETXATTR", + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .prep = io_setxattr_prep, + .issue = io_setxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .name = "FGETXATTR", + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .prep = io_getxattr_prep, + .issue = io_getxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .name = "SOCKET", +#if defined(CONFIG_NET) + .prep = io_socket_prep, + .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .name = "URING_CMD", + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, + }, +}; + +const char *io_uring_get_opcode(u8 opcode) +{ + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; + return "INVALID"; +} + +void __init io_uring_optable_init(void) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); + } +} diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 4578adcdba8a41..ece8ed4f96c434 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -37,4 +37,6 @@ struct io_op_def { }; extern const struct io_op_def io_op_defs[]; + +void io_uring_optable_init(void); #endif From b9ba8a4463cd78d0aee520c4bf2569820ac29929 
Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2022 10:55:07 -0600 Subject: [PATCH 056/172] io_uring: add support for level triggered poll By default, the POLL_ADD command does edge triggered poll - if we get a non-zero mask on the initial poll attempt, we complete the request successfully. Support level triggered by always waiting for a notification, regardless of whether or not the initial mask matches the file state. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ io_uring/poll.c | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ad3da28d2fce8..4927bb69387a29 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -229,10 +229,13 @@ enum io_uring_op { * * IORING_POLL_UPDATE Update existing poll request, matching * sqe->addr as the old user_data field. + * + * IORING_POLL_LEVEL Level triggered poll. */ #define IORING_POLL_ADD_MULTI (1U << 0) #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +#define IORING_POLL_ADD_LEVEL (1U << 3) /* * ASYNC_CANCEL flags. diff --git a/io_uring/poll.c b/io_uring/poll.c index b80f7fa261232d..558dc170468ad8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -423,11 +423,13 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (mask && (poll->events & EPOLLONESHOT)) { + if (mask && + ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); /* no one else has access to the req, forget about the ref */ return mask; } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!ipt->error) @@ -439,7 +441,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_poll_req_insert(req); spin_unlock(&ctx->completion_lock); - if (mask) { + if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; @@ -475,7 +477,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; + __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; if (!def->pollin && !def->pollout) @@ -638,7 +640,10 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); + if (!(flags & IORING_POLL_ADD_LEVEL)) + events |= EPOLLET; + return demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -679,7 +684,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) + if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL)) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; From 61a2732af4b0337f7e36093612c846e9f5962965 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jun 2022 12:36:42 -0600 Subject: [PATCH 057/172] io_uring: deprecate epoll_ctl support As far as we know, nobody ever adopted the epoll_ctl management 
via io_uring. Deprecate it now with a warning, and plan on removing it in a later kernel version. When we do remove it, we can revert the following commits as well: 39220e8d4a2a ("eventpoll: support non-blocking do_epoll_ctl() calls") 58e41a44c488 ("eventpoll: abstract out epoll_ctl() handler") Suggested-by: Linus Torvalds Link: https://lore.kernel.org/io-uring/CAHk-=wiTyisXBgKnVHAGYCNvkmjk=50agS2Uk6nr+n3ssLZg2w@mail.gmail.com/ Signed-off-by: Jens Axboe --- io_uring/epoll.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/epoll.c b/io_uring/epoll.c index acbb32498127ad..10853e8ed07887 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -26,6 +26,10 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req); + pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will " + "be removed in a future Linux kernel version.\n", + current->comm); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; From 5ff4fdffad483dad815b3701be8b82c5a07b156e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:47 +0100 Subject: [PATCH 058/172] io_uring: make reg buf init consistent The default (i.e. empty) state of register buffer is dummy_ubuf, so set it to dummy on init instead of NULL. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5456aecf03d9627fbd6e65e100e2b5293a6151e.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8c40b20659d40a..214ff0dfa6a48e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -567,7 +567,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = NULL; + ctx->user_bufs[i] = ctx->dummy_ubuf; needs_switch = true; } @@ -1203,14 +1203,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t size; int ret, nr_pages, i; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; + *pimu = ctx->dummy_ubuf; + if (!iov->iov_base) return 0; - } - *pimu = NULL; ret = -ENOMEM; - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { From b25436038f6cc20c3198792cbfab8a312d09282e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:48 +0100 Subject: [PATCH 059/172] io_uring: move defer_list to slow data draining is slow path, move defer_list to the end where slow data lives inside the context. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e16379391ca72b490afdd24e8944baab849b4a7b.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index fb87af5fd8e751..ee58273569702d 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -162,7 +162,6 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; unsigned cached_sq_head; unsigned sq_entries; - struct list_head defer_list; /* * Fixed resources fast path, should be accessed only under @@ -274,8 +273,12 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + + /* io-wq management, e.g. 
thread count */
 	u32			iowq_limits[2];
 	bool			iowq_limits_set;
+
+	struct list_head	defer_list;
 	};
 };

From aff5b2df9e8b35f9814c5a4907f471472cd6be77 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:49 +0100
Subject: [PATCH 060/172] io_uring: better caching for ctx timeout fields

Following the timeout fields' access patterns, move all of them into a
separate cache line inside ctx, so they don't interfere with normal
completion caching, especially since timeout removals and completions
are separated and the latter is done via tw.

It also sheds some bytes from io_ring_ctx, 1216B -> 1152B

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/4b163793072840de53b3cb66e0c2995e7226ff78.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring_types.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
index ee58273569702d..4fc42055dbdd1c 100644
--- a/io_uring/io_uring_types.h
+++ b/io_uring/io_uring_types.h
@@ -181,8 +181,6 @@ struct io_ring_ctx {
 		struct xarray		io_bl_xa;
 		struct list_head	io_buffers_cache;
 
-		struct list_head	timeout_list;
-		struct list_head	ltimeout_list;
 		struct list_head	cq_overflow_list;
 		struct list_head	apoll_cache;
 		struct xarray		personalities;
@@ -215,15 +213,11 @@
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
-		atomic_t		cq_timeouts;
-		unsigned		cq_last_tm_flush;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
 		spinlock_t		completion_lock;
 
-		spinlock_t		timeout_lock;
-
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -255,6 +249,15 @@
 		struct list_head	io_buffers_pages;
 	};
 
+	/* timeouts */
+	struct {
+		spinlock_t		timeout_lock;
+		atomic_t		cq_timeouts;
+		struct list_head	timeout_list;
+		struct list_head	ltimeout_list;
+		unsigned		cq_last_tm_flush;
+	} ____cacheline_aligned_in_smp;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct {
 #if defined(CONFIG_UNIX)

From 22eb2a3fdea03e6056dcfb4c8bb9d3fde5ce02ff Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:50 +0100
Subject: [PATCH 061/172] io_uring: refactor ctx slow data placement

Shove all slow path data at the end of ctx and get rid of the extra
indentation.
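To illustrate the resulting layout, a minimal sketch with made-up
members (not the actual io_ring_ctx definition): hot data keeps its
per-cacheline grouping, while cold data is flattened at the end of the
struct with no wrapper around it.

#include <linux/cache.h>
#include <linux/list.h>

struct example_ctx {
	/* hot: touched on every submission/completion */
	struct {
		unsigned		cached_head;
		unsigned		entries;
	} ____cacheline_aligned_in_smp;

	/* cold: setup/teardown only; plain members, no extra nesting */
	struct list_head	defer_list;
	unsigned		sq_thread_idle;
};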
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bcaf200298dd469af20787650550efc66d89bef2.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 81 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4fc42055dbdd1c..ef1cf86e893264 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -185,7 +185,6 @@ struct io_ring_ctx { struct list_head apoll_cache; struct xarray personalities; u32 pers_next; - unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -232,23 +231,6 @@ struct io_ring_ctx { struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - /* timeouts */ struct { spinlock_t timeout_lock; @@ -259,30 +241,45 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - - /* io-wq management, e.g. thread count */ - u32 iowq_limits[2]; - bool iowq_limits_set; - - struct list_head defer_list; - }; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + + /* io-wq management, e.g. thread count */ + u32 iowq_limits[2]; + bool iowq_limits_set; + + struct list_head defer_list; + unsigned sq_thread_idle; }; enum { From aa1e90f64ee5c82f6d3feb69b2a19370686f1f02 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:51 +0100 Subject: [PATCH 062/172] io_uring: move small helpers to headers There is a bunch of inline helpers that will be useful not only to the core of io_uring, move them to headers. 
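As a minimal sketch of the pattern (hypothetical header and helper
name, for illustration only): a static inline definition in a header
gives every including compilation unit its own inlined copy, so the
helper can be shared across files with no duplicate symbols and no
function call overhead.

/* example.h - hypothetical header, not part of this patch */
#ifndef IOU_EXAMPLE_H
#define IOU_EXAMPLE_H

#include <linux/mutex.h>

static inline void example_lock_once(struct mutex *lock, bool *locked)
{
	/* take the lock on first use only and remember that we hold it */
	if (!*locked) {
		mutex_lock(lock);
		*locked = true;
	}
}

#endif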
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/22df99c83723e44cba7e945e8519e64e3642c064.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 17 -----------------
 io_uring/io_uring.h | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c703190986270b..fd3ad784865287 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -160,14 +160,6 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
-static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
-{
-	if (!*locked) {
-		mutex_lock(&ctx->uring_lock);
-		*locked = true;
-	}
-}
-
 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 {
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
@@ -423,15 +415,6 @@ static void io_prep_async_link(struct io_kiocb *req)
 	}
 }
 
-static inline void io_req_add_compl_list(struct io_kiocb *req)
-{
-	struct io_submit_state *state = &req->ctx->submit_state;
-
-	if (!(req->flags & REQ_F_CQE_SKIP))
-		state->flush_cqes = true;
-	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
-}
-
 void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 22e6e52c42d261..6744ce111e3812 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -217,6 +217,28 @@ static inline bool io_run_task_work(void)
 	return false;
 }
 
+static inline void io_req_complete_state(struct io_kiocb *req)
+{
+	req->flags |= REQ_F_COMPLETE_INLINE;
+}
+
+static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+{
+	if (!*locked) {
+		mutex_lock(&ctx->uring_lock);
+		*locked = true;
+	}
+}
+
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+	struct io_submit_state *state = &req->ctx->submit_state;
+
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		state->flush_cqes = true;
+	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
 int io_run_task_work_sig(void);
 void io_req_complete_failed(struct io_kiocb *req, s32 res);
 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);

From 48c13d8980840e489a537e4af4b2503eb9d8a1ec Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:52 +0100
Subject: [PATCH 063/172] io_uring: explain io_wq_work::cancel_seq placement

Add a comment on why we keep ->cancel_seq in struct io_wq_work instead
of struct io_kiocb despite it being needed only by io_uring and not
io-wq.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/988e87eec9dc700b5dae933df3aefef303502f6c.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io-wq.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index ba6eee76d028f6..3f54ee2a8eebd3 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 struct io_wq_work {
 	struct io_wq_work_node list;
 	unsigned flags;
+	/* place it here instead of io_kiocb as it fills padding and saves 4B */
 	int cancel_seq;
 };

From 6a02e4be8187588ac476136496af9e3feeeb9a75 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:53 +0100
Subject: [PATCH 064/172] io_uring: inline ->registered_rings

There can only be 16 registered rings, so there is no need to allocate
an array for them separately; store it in tctx instead.
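The shape of the change, sketched with illustrative names: replacing a
separately allocated array with a fixed-size array embedded in the
owning struct drops an allocation, a failure path and a pointer
dereference.

#define EXAMPLE_REG_MAX	16

struct example_tctx_old {
	struct file **registered_rings;	/* kcalloc()'d and kfree()'d separately */
};

struct example_tctx_new {
	/* bounded at 16 entries, so embed the array directly */
	struct file *registered_rings[EXAMPLE_REG_MAX];
};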
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/495f0b953c87994dd9e13de2134019054fa5830d.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/tctx.c | 10 ----------
 io_uring/tctx.h |  3 ++-
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 3f7e9feb6ca2bc..5a5d4f908529a2 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,7 +53,6 @@ void __io_uring_free(struct task_struct *tsk)
 
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
-	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -69,16 +68,8 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
-	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
-					 sizeof(struct file *), GFP_KERNEL);
-	if (unlikely(!tctx->registered_rings)) {
-		kfree(tctx);
-		return -ENOMEM;
-	}
-
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
-		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -87,7 +78,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
-		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index f4964e40d07e0d..7684713e950f4f 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -20,8 +20,9 @@ struct io_uring_task {
 		struct io_wq_work_list	task_list;
 		struct io_wq_work_list	prio_task_list;
 		struct callback_head	task_work;
-		struct file		**registered_rings;
 		bool			task_running;
+
+		struct file		*registered_rings[IO_RINGFD_REG_MAX];
 	};
 
 struct io_tctx_node {

From aeaa72c69473d7e68addbd31f43c7c12af252bfc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:54 +0100
Subject: [PATCH 065/172] io_uring: never defer-complete multi-apoll

Luckily, nobody completes multi-apoll requests outside the polling
functions, but don't set IO_URING_F_COMPLETE_DEFER in any case, as
nothing there catches REQ_F_COMPLETE_INLINE and requests would leak if
it were used.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/a65ed3f5effd9321ee06e6edea294a03be3e15a0.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fd3ad784865287..8a8d8b323519a0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1599,7 +1599,7 @@ int io_poll_issue(struct io_kiocb *req, bool *locked)
 	io_tw_lock(req->ctx, locked);
 	if (unlikely(req->task->flags & PF_EXITING))
 		return -EFAULT;
-	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+	return io_issue_sqe(req, IO_URING_F_NONBLOCK);
 }
 
 struct io_wq_work *io_wq_free_work(struct io_wq_work *work)

From 3a08576b96e365d424225dd034c651e963b3ae64 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:55 +0100
Subject: [PATCH 066/172] io_uring: remove check_cq checking from hot paths

All ctx->check_cq events are slow path; don't test every single flag
one by one in the hot path, but add a common guarding if.
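The guarding pattern, sketched with hypothetical names: read the flags
word once, and hide all per-flag tests behind a single unlikely()
branch so the hot path pays for exactly one test.

#include <linux/bits.h>
#include <linux/compiler.h>
#include <linux/errno.h>

enum {
	EXAMPLE_OVERFLOW_BIT,
	EXAMPLE_DROPPED_BIT,
};

struct example_ctx {
	unsigned long slow_flags;
};

static void example_flush_overflow(struct example_ctx *ctx)
{
	/* cold path work lives here */
}

static int example_check_slow_flags(struct example_ctx *ctx)
{
	unsigned long flags = READ_ONCE(ctx->slow_flags);

	/* one branch on the hot path; per-flag tests sit in the cold block */
	if (unlikely(flags)) {
		if (flags & BIT(EXAMPLE_OVERFLOW_BIT))
			example_flush_overflow(ctx);
		if (flags & BIT(EXAMPLE_DROPPED_BIT))
			return -EBADR;
	}
	return 0;
}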
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/dff026585cea7ff3a172a7c83894a3b0111bbf6a.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8a8d8b323519a0..a4c1746d0691d2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1259,24 +1259,25 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 	int ret = 0;
 	unsigned long check_cq;
 
+	check_cq = READ_ONCE(ctx->check_cq);
+	if (unlikely(check_cq)) {
+		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			__io_cqring_overflow_flush(ctx, false);
+		/*
+		 * Similarly do not spin if we have not informed the user of any
+		 * dropped CQE.
+		 */
+		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
+			return -EBADR;
+	}
 	/*
 	 * Don't enter poll loop if we already have events pending.
 	 * If we do, we can potentially be spinning for commands that
 	 * already triggered a CQE (eg in error).
 	 */
-	check_cq = READ_ONCE(ctx->check_cq);
-	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-		__io_cqring_overflow_flush(ctx, false);
 	if (io_cqring_events(ctx))
 		return 0;
 
-	/*
-	 * Similarly do not spin if we have not informed the user of any
-	 * dropped CQE.
-	 */
-	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
-		return -EBADR;
-
 	do {
 		/*
 		 * If a submit got punted to a workqueue, we can have the
@@ -2203,12 +2204,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	ret = io_run_task_work_sig();
 	if (ret || io_should_wake(iowq))
 		return ret;
+
 	check_cq = READ_ONCE(ctx->check_cq);
-	/* let the caller flush overflows, retry */
-	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-		return 1;
-	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
-		return -EBADR;
+	if (unlikely(check_cq)) {
+		/* let the caller flush overflows, retry */
+		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			return 1;
+		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
+			return -EBADR;
+	}
 	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
 		return -ETIME;
 	return 1;

From c65f5279ba02e47c073520ecff3c84408a87fd17 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:56 +0100
Subject: [PATCH 067/172] io_uring: don't set REQ_F_COMPLETE_INLINE in tw

io_req_task_complete() enqueues requests for state completion itself,
so there is no need for REQ_F_COMPLETE_INLINE, which only serves the
purpose of not bloating the kernel.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/aca80f71464ad02c06f1311d998a2d6ee0b31573.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a4c1746d0691d2..4adfc4ebf8c199 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1314,7 +1314,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
 {
 	if (*locked) {
 		req->cqe.flags |= io_put_kbuf(req, 0);
-		req->flags |= REQ_F_COMPLETE_INLINE;
 		io_req_add_compl_list(req);
 	} else {
 		req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED);

From bb8f870031584ebf50fcc1f049fa421375556114 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 15 Jun 2022 16:28:17 -0600
Subject: [PATCH 068/172] io_uring: remove unused IO_REQ_CACHE_SIZE define

Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4adfc4ebf8c199..72640aa55abcde 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,6 @@
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
 #define IO_COMPL_BATCH			32
-#define IO_REQ_CACHE_SIZE		32
 #define IO_REQ_ALLOC_BATCH		8
 
 enum {

From df9830d883b914296beb17e4f75017faac9c4312 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Jun 2022 10:21:57 +0100
Subject: [PATCH 069/172] io_uring: rw: delegate sync completions to core io_uring

io_issue_sqe() from the io_uring core knows how to complete requests
based on the returned error code, so we can delegate io_read()/io_write()
completion to it. Make kiocb_done() return the right completion code
and propagate it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/32ef005b45d23bf6b5e6837740dc0331bb051bd4.1655371007.git.asml.silence@gmail.com
Reviewed-by: Hao Xu
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index f0b60199ee2279..e5ca23d0783e2c 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -207,15 +206,6 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 	return false;
 }
 
-static void __io_complete_rw(struct io_kiocb *req, long res,
-			     unsigned int issue_flags)
-{
-	if (__io_complete_rw_common(req, res))
-		return;
-	io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags));
-	__io_req_complete(req, issue_flags);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res)
 {
 	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
@@ -247,7 +238,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 	smp_store_release(&req->iopoll_completed, 1);
 }
 
-static void kiocb_done(struct io_kiocb *req, ssize_t ret,
+static int kiocb_done(struct io_kiocb *req,
ssize_t ret, else io_req_task_queue_fail(req, ret); } + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_import_fixed(struct io_kiocb *req, int ddir, @@ -847,7 +844,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - goto out_free; + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ @@ -905,12 +904,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: - kiocb_done(req, ret, issue_flags); -out_free: /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; + return kiocb_done(req, ret, issue_flags); } int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -960,8 +957,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; + if (unlikely(ret)) { + kfree(iovec); + return ret; + } /* * Open-code file_start_write here to grab freeze protection, @@ -1003,15 +1002,13 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; + ret = kiocb_done(req, ret2, issue_flags); } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } -out_free: /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); From 75d7b3aec13be557f15d099dc612dd658d670d1d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:58 +0100 Subject: [PATCH 070/172] io_uring: kill REQ_F_COMPLETE_INLINE REQ_F_COMPLETE_INLINE is only needed to delay queueing into the completion list to io_queue_sqe() as __io_req_complete() is inlined and we don't want to bloat the kernel. As now we complete in a more centralised fashion in io_issue_sqe() we can get rid of the flag and queue to the list directly. 
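A simplified rendering of the centralised flow in io_issue_sqe()
(condensed from the diff below, not verbatim): the handler's return
code tells the core how to complete, so no per-request flag is needed.

	ret = def->issue(req, issue_flags);
	if (ret == IOU_OK) {
		if (issue_flags & IO_URING_F_COMPLETE_DEFER)
			io_req_add_compl_list(req);	/* batch into the compl list */
		else
			io_req_complete_post(req);	/* complete right away */
	} else if (ret != IOU_ISSUE_SKIP_COMPLETE) {
		return ret;	/* error, let the caller handle it */
	}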
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/600ba20a9338b8a39b249b23d3d177803613dde4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++----------- io_uring/io_uring.h | 5 ----- io_uring/io_uring_types.h | 3 --- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72640aa55abcde..541c109a9273bf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,10 +742,7 @@ void io_req_complete_post(struct io_kiocb *req) inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - req->flags |= REQ_F_COMPLETE_INLINE; - else - io_req_complete_post(req); + io_req_complete_post(req); } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -1581,9 +1578,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret == IOU_OK) - __io_req_complete(req, issue_flags); - else if (ret != IOU_ISSUE_SKIP_COMPLETE) + if (ret == IOU_OK) { + if (issue_flags & IO_URING_F_COMPLETE_DEFER) + io_req_add_compl_list(req); + else + io_req_complete_post(req); + } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; /* If the op doesn't have a file, we're not polling for it */ @@ -1749,10 +1749,6 @@ static inline void io_queue_sqe(struct io_kiocb *req) ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (req->flags & REQ_F_COMPLETE_INLINE) { - io_req_add_compl_list(req); - return; - } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6744ce111e3812..3f06fbae0ee9e5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -217,11 +217,6 @@ static inline bool io_run_task_work(void) return false; } -static inline void io_req_complete_state(struct io_kiocb *req) -{ - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index ef1cf86e893264..4576ea8cad2e6c 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -301,7 +301,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, @@ -356,8 +355,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), /* supports async reads/writes */ From 7012c81593d5a3bc6d0b8720345cf887ef1df914 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:59 +0100 Subject: [PATCH 071/172] io_uring: refactor io_req_task_complete() Clean up io_req_task_complete() and deduplicate io_put_kbuf() calls. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ae3148ac7eb5cce3e06895cde306e9e959d6f6ae.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 541c109a9273bf..957a5bc1b528c9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1306,15 +1306,19 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -inline void io_req_task_complete(struct io_kiocb *req, bool *locked) + +void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (*locked) { - req->cqe.flags |= io_put_kbuf(req, 0); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + req->cqe.flags |= io_put_kbuf(req, issue_flags); + } + + if (*locked) io_req_add_compl_list(req); - } else { - req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + else io_req_complete_post(req); - } } /* From 53ccf69bda6f51e462f3c4ab7eb9c0ec34e78be4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:00 +0100 Subject: [PATCH 072/172] io_uring: don't inline io_put_kbuf io_put_kbuf() is huge, don't bloat the kernel with inlining. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e21ccf0be471ffa654032914b9430813cae53f8.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 33 +++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 38 ++++++-------------------------------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index bc58890d932b2f..b9c7f6e87cc9a3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -83,6 +83,39 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +{ + unsigned int cflags; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf_list(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + } + return cflags; +} + static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 9da3a933ef40e1..304e7139d83562 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -47,6 +47,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -79,7 +81,8 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) __io_kbuf_recycle(req, issue_flags); } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, + struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) @@ -99,44 +102,15 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); + return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); } static inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { - unsigned int cflags; if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; + return __io_put_kbuf(req, issue_flags); } #endif From 3654ab0c51a91036d15dd2fd9c2809317edf2228 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 16 Jun 2022 10:22:01 +0100 Subject: [PATCH 073/172] io_uring: poll: remove unnecessary req->ref set We now don't need to set req->refcount for poll requests since the reworked poll code ensures no request release race. 
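For context, a simplified illustration (not the actual helper) of the
ownership scheme that makes the extra reference unnecessary: poll
completion is serialised through an ownership counter, so only one path
can ever complete or free the request.

#include <linux/atomic.h>

/* whoever bumps poll_refs from zero takes ownership of the request;
 * everyone else only records that more work exists and backs off
 */
static bool example_poll_get_ownership(atomic_t *poll_refs)
{
	return atomic_fetch_inc(poll_refs) == 0;
}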
Signed-off-by: Hao Xu
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/ec6fee45705890bdb968b0c175519242753c0215.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 558dc170468ad8..fdb6b1101ffcb7 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -689,7 +689,6 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 		return -EINVAL;
 
-	io_req_set_refcount(req);
 	poll->events = io_poll_parse_events(sqe, flags);
 	return 0;
 }

From 38513c464d3d45b4088c82f6e42d9cdbc5ee57e6 Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Thu, 16 Jun 2022 10:22:02 +0100
Subject: [PATCH 074/172] io_uring: switch cancel_hash to use per entry spinlock

Add a new io_hash_bucket structure so that each bucket in cancel_hash
has its own spinlock. Using a per-entry lock for cancel_hash removes
some completion_lock invocations and eliminates contention between
different cancel_hash entries.

Signed-off-by: Hao Xu
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/05d1e135b0c8bce9d1441e6346776589e5783e26.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/cancel.c         | 14 ++++++-
 io_uring/cancel.h         |  6 +++
 io_uring/fdinfo.c         |  9 +++--
 io_uring/io_uring.c       |  9 +++--
 io_uring/io_uring_types.h |  2 +-
 io_uring/poll.c           | 80 ++++++++++++++++++++++---------------
 6 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 83cceb52d82d64..6f2888388a40ed 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -93,14 +93,14 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
 	if (!ret)
 		return 0;
 
-	spin_lock(&ctx->completion_lock);
 	ret = io_poll_cancel(ctx, cd);
 	if (ret != -ENOENT)
 		goto out;
+	spin_lock(&ctx->completion_lock);
 	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
 		ret = io_timeout_cancel(ctx, cd);
-out:
 	spin_unlock(&ctx->completion_lock);
+out:
 	return ret;
 }
 
@@ -192,3 +192,13 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
+
+void init_hash_table(struct io_hash_bucket *hash_table, unsigned size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++) {
+		spin_lock_init(&hash_table[i].lock);
+		INIT_HLIST_HEAD(&hash_table[i].list);
+	}
+}
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 4f35d86963253f..556a7dcf160e47 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -4,3 +4,9 @@
 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
+void init_hash_table(struct io_hash_bucket *hash_table, unsigned size);
+
+struct io_hash_bucket {
+	spinlock_t		lock;
+	struct hlist_head	list;
+} ____cacheline_aligned_in_smp;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index fcedde4b4b1e1f..f941c73f550259 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -13,6 +13,7 @@
 #include "io_uring.h"
 #include "sqpoll.h"
 #include "fdinfo.h"
+#include "cancel.h"
 
 #ifdef CONFIG_PROC_FS
 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
@@ -157,17 +158,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	mutex_unlock(&ctx->uring_lock);
 
 	seq_puts(m, "PollList:\n");
-	spin_lock(&ctx->completion_lock);
 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
-		struct
hlist_head *list = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; struct io_kiocb *req; - hlist_for_each_entry(req, list, hash_node) + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, task_work_pending(req->task)); + spin_unlock(&hb->lock); } seq_puts(m, "CqOverflowList:\n"); + spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { struct io_uring_cqe *cqe = &ocqe->cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 957a5bc1b528c9..ac6946e3f174b5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" +#include "cancel.h" #include "timeout.h" #include "poll.h" @@ -260,11 +261,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (hash_bits <= 0) hash_bits = 1; ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), - GFP_KERNEL); + ctx->cancel_hash = + kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), + GFP_KERNEL); if (!ctx->cancel_hash) goto err; - __hash_init(ctx->cancel_hash, 1U << hash_bits); + + init_hash_table(ctx->cancel_hash, 1U << hash_bits); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4576ea8cad2e6c..1f8db2dd7af752 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -224,7 +224,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; + struct io_hash_bucket *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; diff --git a/io_uring/poll.c b/io_uring/poll.c index fdb6b1101ffcb7..1511a26b412d17 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -19,6 +19,7 @@ #include "opdef.h" #include "kbuf.h" #include "poll.h" +#include "cancel.h" struct io_poll_update { struct file *file; @@ -73,10 +74,22 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + spin_lock(&hb->lock); + hlist_add_head(&req->hash_node, &hb->list); + spin_unlock(&hb->lock); +} + +static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; + + spin_lock(lock); + hash_del(&req->hash_node); + spin_unlock(lock); } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, @@ -220,8 +233,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); + io_poll_req_delete(req, ctx); spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); req->cqe.flags = 0; __io_req_complete_post(req); io_commit_cqring(ctx); @@ -231,7 +244,6 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) static void io_apoll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -239,9 +251,7 @@ static void 
io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); + io_poll_req_delete(req, req->ctx); if (!ret) io_req_task_submit(req, locked); @@ -437,9 +447,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - spin_lock(&ctx->completion_lock); io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -540,32 +548,31 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool found = false; int i; - spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } + spin_unlock(&hb->lock); } - spin_unlock(&ctx->completion_lock); return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { - struct hlist_head *list; struct io_kiocb *req; + u32 index = hash_long(cd->data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) @@ -577,21 +584,21 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } return req; } + spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; int i; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && req->file != cd->file) continue; @@ -600,12 +607,12 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, req->work.cancel_seq = cd->seq; return req; } + spin_unlock(&hb->lock); } return NULL; } static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) { if (!io_poll_get_ownership(req)) return false; @@ -615,17 +622,23 @@ static bool io_poll_disarm(struct io_kiocb *req) } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; + u32 index; + spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); - if (!req) + if (!req) { return -ENOENT; + } else { + index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + lock = &ctx->cancel_hash[index].lock; + } io_poll_cancel_req(req); + spin_unlock(lock); return 0; } @@ -719,18 +732,23 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update 
*poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + u32 index = hash_long(cd.data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; + if (!preq) { + ret = -ENOENT; + goto out; + } + ret2 = io_poll_disarm(preq); + spin_unlock(lock); + if (!ret2) { + ret = -EALREADY; goto out; } - spin_unlock(&ctx->completion_lock); if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ From 1ab1edb0a104dd683d83e7853aa5b624856f4127 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:03 +0100 Subject: [PATCH 075/172] io_uring: pass poll_find lock back Instead of using implicit knowledge of what is locked or not after io_poll_find() and co returns, pass back a pointer to the locked bucket if any. If set, the caller must unlock the spinlock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dae1dc5749aa34367812ecf62f82fd3f053aae44.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 1511a26b412d17..5b2c6ce26a5547 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -565,12 +565,15 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + *out_bucket = NULL; + spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) @@ -582,6 +585,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, continue; req->work.cancel_seq = cd->seq; } + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -589,11 +593,14 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; int i; + *out_bucket = NULL; + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_hash[i]; @@ -605,6 +612,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, if (cd->seq == req->work.cancel_seq) continue; req->work.cancel_seq = cd->seq; + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -623,23 +631,19 @@ static bool io_poll_disarm(struct io_kiocb *req) int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { + struct io_hash_bucket *bucket; struct io_kiocb *req; - u32 index; - spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); + req = io_poll_file_find(ctx, cd, &bucket); else - req = io_poll_find(ctx, false, cd); - if (!req) { - return -ENOENT; - } else { - index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - lock =
&ctx->cancel_hash[index].lock; - } - io_poll_cancel_req(req); - spin_unlock(lock); - return 0; + req = io_poll_find(ctx, false, cd, &bucket); + + if (req) + io_poll_cancel_req(req); + if (bucket) + spin_unlock(&bucket->lock); + return req ? 0 : -ENOENT; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -732,19 +736,21 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(cd.data, ctx->cancel_hash_bits); - spinlock_t *lock = &ctx->cancel_hash[index].lock; + struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd); + preq = io_poll_find(ctx, true, &cd, &bucket); + if (preq) + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + if (!preq) { ret = -ENOENT; goto out; } - ret2 = io_poll_disarm(preq); - spin_unlock(lock); if (!ret2) { ret = -EALREADY; goto out; From 4dfab8abb4721da278a2ccd45c1b6a69f8a9dd14 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:04 +0100 Subject: [PATCH 076/172] io_uring: clean up io_try_cancel Get rid of an unnecessary extra goto in io_try_cancel() and simplify the function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/48cf5417b43a8386c6c364dba1ad9b4c7382d158.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 6f2888388a40ed..a253e2ad22ebd0 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -95,12 +95,12 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) - goto out; + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); -out: return ret; } From 4a07723fb4bb2568a31d43709904ab0d4c33d6c8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:05 +0100 Subject: [PATCH 077/172] io_uring: limit the number of cancellation buckets Don't allocate too many hash/cancellation buckets; clamp it to 8 bits, or 256 buckets * 64B = 16KB. We don't usually have too many requests, and 256 buckets should be enough, especially since we do hash search only in the cancellation path. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9620c8072ba61a2d50eba894b89bd93a94a9abd.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ac6946e3f174b5..aafdf1330ec67d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -254,12 +254,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* * Use 5 bits less than the max cq entries, that should give us around - * 32 entries per hash list if totally full and uniformly spread. + * 32 entries per hash list if totally full and uniformly spread, but + * don't keep too many buckets to not overconsume memory.
*/ - hash_bits = ilog2(p->cq_entries); - hash_bits -= 5; - if (hash_bits <= 0) - hash_bits = 1; + hash_bits = ilog2(p->cq_entries) - 5; + hash_bits = clamp(hash_bits, 1, 8); + ctx->cancel_hash_bits = hash_bits; ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), From 8b1dfd343ae6a64ca2b4741ab47a162fa59f43be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:06 +0100 Subject: [PATCH 078/172] io_uring: clean up io_ring_ctx_alloc Add a variable for the number of hash buckets in io_ring_ctx_alloc(), which makes it more readable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/993926ed0d614ba9a76b2a85bebae2babcb13983.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index aafdf1330ec67d..85a479594b05aa 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -244,6 +244,8 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + unsigned hash_buckets; + size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -259,15 +261,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); + hash_buckets = 1U << hash_bits; + hash_size = hash_buckets * sizeof(struct io_hash_bucket); ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = - kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), - GFP_KERNEL); + ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); if (!ctx->cancel_hash) goto err; - init_hash_table(ctx->cancel_hash, 1U << hash_bits); + init_hash_table(ctx->cancel_hash, hash_buckets); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) From 0ec6dca223195aca2f7df6e432c4205b5f63305b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:07 +0100 Subject: [PATCH 079/172] io_uring: use state completion infra for poll reqs Use io_req_task_complete() for poll request completions, so it can utilise state completions and save lots of unnecessary locking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ced94cb5a728d8e386c640d052fd3da3f5d6891a.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 5b2c6ce26a5547..7f2ebb56f7dbf9 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -234,12 +234,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_req_delete(req, ctx); - spin_lock(&ctx->completion_lock); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_req_set_res(req, req->cqe.res, 0); + io_req_task_complete(req, locked); } static void io_apoll_task_func(struct io_kiocb *req, bool *locked) From 97bbdc06a4446bc69d8ba71d722abae542a6b70c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:08 +0100 Subject: [PATCH 080/172] io_uring: add IORING_SETUP_SINGLE_ISSUER Add a new IORING_SETUP_SINGLE_ISSUER flag and the userspace visible part of it, i.e. put limitations on submitters.
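From userspace this is just another setup flag; e.g. with liburing it would be requested like this (illustrative sketch, assuming a liburing build that already defines the constant):

	struct io_uring ring;
	struct io_uring_params p = { };
	int ret;

	p.flags = IORING_SETUP_SINGLE_ISSUER;
	/*
	 * The first task to submit becomes the ring's only allowed
	 * submitter; any other task's submission fails with -EEXIST.
	 */
	ret = io_uring_queue_init_params(8, &ring, &p);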
Also, don't allow it together with IOPOLL as we're not going to put it to good use. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4bcc41ee467fdf04c8aab8baf6ce3ba21858c3d4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++++- io_uring/io_uring.c | 7 +++++-- io_uring/io_uring_types.h | 1 + io_uring/tctx.c | 27 ++++++++++++++++++++++++--- io_uring/tctx.h | 4 ++-- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4927bb69387a29..d7ae81b10893e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -140,9 +140,12 @@ enum { * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) - #define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ #define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ +/* + * Only one task is allowed to submit requests + */ +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) enum io_uring_op { IORING_OP_NOP, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85a479594b05aa..06772139b7dad8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2457,6 +2457,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); + if (ctx->submitter_task) + put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) @@ -3189,7 +3191,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); if (ret) { put_unused_fd(fd); return ret; @@ -3409,7 +3411,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | + IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1f8db2dd7af752..8b00243abf65fa 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -243,6 +243,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct io_restriction restrictions; + struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5a5d4f908529a2..a819da8fc85cd2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -94,12 +94,32 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return 0; } -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static int io_register_submitter(struct io_ring_ctx *ctx) +{ + int ret = 0; + + mutex_lock(&ctx->uring_lock); + if (!ctx->submitter_task) + ctx->submitter_task = get_task_struct(current); + else if (ctx->submitter_task != current) + ret = -EEXIST; + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; + if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) { + ret = io_register_submitter(ctx); + if (ret) + return 
ret; + } + if (unlikely(!tctx)) { ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) @@ -133,7 +153,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = ctx; + if (submitter) + tctx->last = ctx; return 0; } @@ -241,7 +262,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, return -EINVAL; mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); mutex_lock(&ctx->uring_lock); if (ret) return ret; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 7684713e950f4f..dde82ce4d8e29b 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -34,7 +34,7 @@ struct io_tctx_node { int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); @@ -52,5 +52,5 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx); + return __io_uring_add_tctx_node(ctx, true); } From a2cdd5193218c557b4ee7828017cce83f251e99a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:09 +0100 Subject: [PATCH 081/172] io_uring: pass hash table into poll_find In preparation for having multiple cancellation hash tables, pass a table pointer into io_poll_find() and other poll cancel functions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a31c88502463dce09254240fa037352927d7ecc3.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 7f2ebb56f7dbf9..96199c999fe60c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -562,11 +562,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_bucket *hb = &hash_table[index]; *out_bucket = NULL; @@ -590,6 +591,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; @@ -598,7 +600,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, *out_bucket = NULL; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &hash_table[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -625,15 +627,16 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[]) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & 
(IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, &bucket); + req = io_poll_file_find(ctx, cd, ctx->cancel_hash, &bucket); else - req = io_poll_find(ctx, false, cd, &bucket); + req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); if (req) io_poll_cancel_req(req); @@ -642,6 +645,11 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return req ? 0 : -ENOENT; } +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +{ + return __io_poll_cancel(ctx, cd, ctx->cancel_hash); +} + static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { @@ -737,7 +745,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, &bucket); + preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From e6f89be61410ff5a0e690d87d7a308e81f0f5a71 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:10 +0100 Subject: [PATCH 082/172] io_uring: introduce a struct for hash table Instead of passing around a pointer to hash buckets, add a bit of type safety and wrap it into a structure. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d65bc3faba537ec2aca9eabf334394936d44bd28.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 6 +++--- io_uring/cancel.h | 7 +------ io_uring/fdinfo.c | 4 ++-- io_uring/io_uring.c | 29 ++++++++++++++++------------ io_uring/io_uring_types.h | 13 +++++++++++-- io_uring/poll.c | 40 +++++++++++++++++++++------------------ 6 files changed, 56 insertions(+), 43 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index a253e2ad22ebd0..f28f0a7d127240 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -193,12 +193,12 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size) +void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { - spin_lock_init(&hash_table[i].lock); - INIT_HLIST_HEAD(&hash_table[i].list); + spin_lock_init(&table->hbs[i].lock); + INIT_HLIST_HEAD(&table->hbs[i].list); } } diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 556a7dcf160e47..fd4cb1a2595de5 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -4,9 +4,4 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size); - -struct io_hash_bucket { - spinlock_t lock; - struct hlist_head list; -} ____cacheline_aligned_in_smp; +void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f941c73f550259..344e7d90d55756 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -158,8 +158,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); seq_puts(m, "PollList:\n"); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; spin_lock(&hb->lock); diff --git a/io_uring/io_uring.c 
b/io_uring/io_uring.c index 06772139b7dad8..0b3851a0db2ba1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -241,11 +241,23 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_put(&ctx->refs); } +static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) +{ + unsigned hash_buckets = 1U << bits; + size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + + table->hbs = kmalloc(hash_size, GFP_KERNEL); + if (!table->hbs) + return -ENOMEM; + + table->hash_bits = bits; + init_hash_table(table, hash_buckets); + return 0; +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - unsigned hash_buckets; - size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -261,16 +273,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); - hash_buckets = 1U << hash_bits; - hash_size = hash_buckets * sizeof(struct io_hash_bucket); - - ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); - if (!ctx->cancel_hash) + if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - init_hash_table(ctx->cancel_hash, hash_buckets); - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) goto err; @@ -311,7 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2487,7 +2492,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 8b00243abf65fa..d3b9bde9c702c6 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -9,6 +9,16 @@ #include "io-wq.h" #include "filetable.h" +struct io_hash_bucket { + spinlock_t lock; + struct hlist_head list; +} ____cacheline_aligned_in_smp; + +struct io_hash_table { + struct io_hash_bucket *hbs; + unsigned hash_bits; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -224,8 +234,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. 
*/ struct io_wq_work_list iopoll_list; - struct io_hash_bucket *cancel_hash; - unsigned cancel_hash_bits; + struct io_hash_table cancel_table; bool poll_multi_queue; struct list_head io_buffers_comp; diff --git a/io_uring/poll.c b/io_uring/poll.c index 96199c999fe60c..ea6466388ed9ef 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -73,9 +73,9 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; spin_lock(&hb->lock); hlist_add_head(&req->hash_node, &hb->list); @@ -84,8 +84,9 @@ static void io_poll_req_insert(struct io_kiocb *req) static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) { - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - spinlock_t *lock = &ctx->cancel_hash[index].lock; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + spinlock_t *lock = &table->hbs[index].lock; spin_lock(lock); hash_del(&req->hash_node); @@ -539,13 +540,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { + struct io_hash_table *table = &ctx->cancel_table; + unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < nr_buckets; i++) { + struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { @@ -562,12 +565,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { struct io_kiocb *req; - u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &hash_table[index]; + u32 index = hash_long(cd->data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; *out_bucket = NULL; @@ -591,16 +594,17 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { + unsigned nr_buckets = 1U << table->hash_bits; struct io_kiocb *req; int i; *out_bucket = NULL; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &hash_table[i]; + for (i = 0; i < nr_buckets; i++) { + struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -628,15 +632,15 @@ static bool io_poll_disarm(struct io_kiocb *req) } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[]) + struct io_hash_table *table) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - 
req = io_poll_file_find(ctx, cd, ctx->cancel_hash, &bucket); + req = io_poll_file_find(ctx, cd, table, &bucket); else - req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); + req = io_poll_find(ctx, false, cd, table, &bucket); if (req) io_poll_cancel_req(req); @@ -647,7 +651,7 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { - return __io_poll_cancel(ctx, cd, ctx->cancel_hash); + return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -745,7 +749,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From 5d7943d99df9326c7b02f773b2d6f09709c30594 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:11 +0100 Subject: [PATCH 083/172] io_uring: propagate locking state to poll cancel Poll cancellation will soon need to grab ->uring_lock internally, so pass the locking state, i.e. issue_flags, into the cancellation functions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b86781d047727c07163443b57551a3fa57c7c5e1.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 7 ++++--- io_uring/cancel.h | 3 ++- io_uring/poll.c | 3 ++- io_uring/poll.h | 3 ++- io_uring/timeout.c | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f28f0a7d127240..f07bfd27c98ac2 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -78,7 +78,8 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -93,7 +94,7 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) if (!ret) return 0; - ret = io_poll_cancel(ctx, cd); + ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; @@ -136,7 +137,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd); + ret = io_try_cancel(req, cd, issue_flags); if (ret == -ENOENT) break; if (!all) diff --git a/io_uring/cancel.h b/io_uring/cancel.h index fd4cb1a2595de5..8dd259dc383e2c 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -3,5 +3,6 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/poll.c b/io_uring/poll.c index ea6466388ed9ef..c4edf8794538c8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -649,7 +649,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return req ?
0 : -ENOENT; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags) { return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } diff --git a/io_uring/poll.h b/io_uring/poll.h index cc75c1567a84ae..fa3e19790281b2 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,7 +24,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 69cca42d6835b4..526fc8b2e3b65d 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -262,6 +262,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -273,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd); + ret = io_try_cancel(req, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From 9ca9fb24d5febccea354089c41f96a8ad0d853f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:12 +0100 Subject: [PATCH 084/172] io_uring: mutex locked poll hashing Currently we do two extra spin lock/unlock pairs to add a poll/apoll request to the cancellation hash table and remove it from there. On the submission side we often already hold ->uring_lock and tw completion is likely to hold it as well. Add a second cancellation hash table protected by ->uring_lock. Out of concern for latency due to the need to have the mutex locked on the completion side, use the new table only in the following cases: 1) IORING_SETUP_SINGLE_ISSUER: only one task grabs uring_lock, so there is little to no contention and so the main tw handler will almost always end up grabbing it before calling callbacks. 2) IORING_SETUP_SQPOLL: same as with single issuer, only one task is a major user of ->uring_lock. 3) apoll: we normally grab the lock on the completion side anyway to execute the request, so it's free.
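Condensed, the table selection implemented below boils down to something like this (illustrative sketch, not the patch's literal code):

	static bool io_poll_use_locked_table(struct io_kiocb *req,
					     unsigned issue_flags, bool apoll)
	{
		/* the completion side must be able to take ->uring_lock cheaply */
		if (issue_flags & IO_URING_F_UNLOCKED)
			return false;
		/* apoll already completes under the mutex, so it's free */
		if (apoll)
			return true;
		/* otherwise only when a single task dominates ->uring_lock */
		return req->ctx->flags & (IORING_SETUP_SQPOLL |
					  IORING_SETUP_SINGLE_ISSUER);
	}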
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1bbad9c78c454b7b92f100bbf46730a37df7194f.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++- io_uring/io_uring_types.h | 4 ++ io_uring/poll.c | 117 +++++++++++++++++++++++++++++++------- 3 files changed, 108 insertions(+), 22 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b3851a0db2ba1..eeda1673179544 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -275,6 +275,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; + if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) + goto err; ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) @@ -317,6 +319,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2493,6 +2496,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); @@ -2654,12 +2658,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); + if (ctx->rings) + io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ if (ctx->rings) { io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); } @@ -2784,7 +2789,9 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work(); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index d3b9bde9c702c6..65ac7cdaaa73f4 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -191,6 +191,7 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; + struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct list_head apoll_cache; struct xarray personalities; @@ -323,6 +324,7 @@ enum { REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, + REQ_F_HASH_LOCKED_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -393,6 +395,8 @@ enum { REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ + REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); diff --git a/io_uring/poll.c b/io_uring/poll.c index c4edf8794538c8..9ae2982aef7c6c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,6 +93,32 @@ static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx 
*ctx) spin_unlock(lock); } +static void io_poll_req_insert_locked(struct io_kiocb *req) +{ + struct io_hash_table *table = &req->ctx->cancel_table_locked; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + + hlist_add_head(&req->hash_node, &table->hbs[index].list); +} + +static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & REQ_F_HASH_LOCKED) { + /* + * ->cancel_table_locked is protected by ->uring_lock in + * contrast to per bucket spinlocks. Likely, tctx_task_work() + * already grabbed the mutex for us, but there is a chance it + * failed. + */ + io_tw_lock(ctx, locked); + hash_del(&req->hash_node); + } else { + io_poll_req_delete(req, ctx); + } +} + static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { @@ -217,7 +243,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) static void io_poll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -234,7 +259,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); - io_poll_req_delete(req, ctx); + io_poll_tw_hash_eject(req, locked); + io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, locked); } @@ -248,7 +274,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - io_poll_req_delete(req, req->ctx); + io_poll_tw_hash_eject(req, locked); if (!ret) io_req_task_submit(req, locked); @@ -444,7 +470,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - io_poll_req_insert(req); + if (req->flags & REQ_F_HASH_LOCKED) + io_poll_req_insert_locked(req); + else + io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -485,6 +514,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; + /* + * apoll requests already grab the mutex to complete in the tw handler, + * so removal from the mutex-backed hash is free, use it by default. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + else + req->flags |= REQ_F_HASH_LOCKED; + if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!file_can_poll(req->file)) @@ -534,13 +572,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } -/* - * Returns true if we found and killed one or more poll requests - */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all_table(struct task_struct *tsk, + struct io_hash_table *table, + bool cancel_all) { - struct io_hash_table *table = &ctx->cancel_table; unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; @@ -563,6 +598,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return found; } +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) + __must_hold(&ctx->uring_lock) +{ + return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | + io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); +} + static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, struct io_hash_table *table, @@ -622,13 +668,15 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, return NULL; } -static bool io_poll_disarm(struct io_kiocb *req) +static int io_poll_disarm(struct io_kiocb *req) { + if (!req) + return -ENOENT; if (!io_poll_get_ownership(req)) - return false; + return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); - return true; + return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, @@ -652,7 +700,16 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags) { - return __io_poll_cancel(ctx, cd, &ctx->cancel_table); + int ret; + + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); + if (ret != -ENOENT) + return ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); + io_ring_submit_unlock(ctx, issue_flags); + return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -727,6 +784,16 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; + /* + * If sqpoll or single issuer, there is no contention for ->uring_lock + * and we'll end up holding it in tw handlers anyway. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + req->flags |= REQ_F_HASH_LOCKED; + else + req->flags &= ~REQ_F_HASH_LOCKED; + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (ret) { io_req_set_res(req, ret, 0); @@ -751,20 +818,28 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) bool locked; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); - if (preq) - ret2 = io_poll_disarm(preq); + ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - - if (!preq) { - ret = -ENOENT; + if (!ret2) + goto found; + if (ret2 != -ENOENT) { + ret = ret2; goto out; } - if (!ret2) { - ret = -EALREADY; + + io_ring_submit_lock(ctx, issue_flags); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + io_ring_submit_unlock(ctx, issue_flags); + if (ret2) { + ret = ret2; goto out; } +found: if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From f09c8643f0fad0e287b9f737955276000fd76a5d Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 17 Jun 2022 13:04:29 +0800 Subject: [PATCH 085/172] io_uring: kbuf: add comments for some tricky code Add comments to explain why the head increment in __io_kbuf_recycle() always happens under the uring lock, and rectify one comment about kbuf consumption in the io-wq case. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220617050429.94293-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index b9c7f6e87cc9a3..59e4fafeb28ce4 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -52,6 +52,13 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) { if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ req->buf_list->head++; req->buf_list = NULL; } else { @@ -163,12 +170,13 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { /* * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). + * buffer here, otherwise nothing ensures that the buffer won't + * get used by others. This does mean it'll be pinned until the + * IO completes, coming in unlocked means we're being called from + * io-wq context and there may be further retries in async hybrid + * mode. For the locked case, the caller must call commit when + * the transfer completes (or if we get -EAGAIN and must poll of + * retry).
*/ req->buf_list = NULL; bl->head++; From d245bca6375bccfd589a6a7d5007df28575bb626 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:00 +0100 Subject: [PATCH 086/172] io_uring: don't expose io_fill_cqe_aux() Deduplicate some code and add a helper that fills an aux CQE and handles the locking and notification. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7c6557c8f9dc5c4cfb01292116c682a0ff61081.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++++++-- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 11 +---------- io_uring/net.c | 20 +++++--------------- io_uring/poll.c | 24 ++++++++----------------- io_uring/rsrc.c | 14 +++++--------- 6 files changed, 36 insertions(+), 54 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eeda1673179544..8c1b0e0ce5bb8a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -676,8 +676,8 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -704,6 +704,20 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) + io_cqring_ev_posted(ctx); + return filled; +} + static void __io_req_complete_put(struct io_kiocb *req) { /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3f06fbae0ee9e5..18754fb790255e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -239,8 +239,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 3b89f9a0a0b459..7c3c5f3ab06b57 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -34,7 +34,6 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - bool filled; int ret; ret = -EBADFD; @@ -43,16 +42,8 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = -EOVERFLOW; target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = 0; - } done: if (ret < 0) diff --git a/io_uring/net.c b/io_uring/net.c index fe1fe920b9291f..35d0183fe75817 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -644,22 +644,12 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return
IOU_OK; } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - return ret; + if (ret < 0) + return ret; + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + goto retry; + return -ECANCELED; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/poll.c b/io_uring/poll.c index 9ae2982aef7c6c..e0c181fe6264c0 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -214,23 +214,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - ret = io_poll_issue(req, locked); - if (ret) - return ret; + if (!io_post_aux_cqe(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE)) + return -ECANCELED; + } else { + ret = io_poll_issue(req, locked); + if (ret) + return ret; + } /* * Release all references, retry if someone tried to restart diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 214ff0dfa6a48e..7fed3105152a95 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -174,17 +174,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) + if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); + } else { + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + } } rsrc_data->do_put(ctx, prsrc); From faf88dde060f74117b3a86a62cb32a20f27fd636 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:01 +0100 Subject: [PATCH 087/172] io_uring: don't inline __io_get_cqe() __io_get_cqe() is not as hot as io_get_cqe(), so there is no need to inline it; uninlining sheds ~500B from the binary.
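For intuition, the contiguous-range maths in the moved function works out as follows for an 8-entry CQ ring with cached tail 6 and head 3 (CQE32 disabled, so shift == 0):

	queued = min(6 - 3, 8) = 3;	/* CQEs posted but not yet consumed */
	free   = 8 - 3 = 5;		/* slots userspace has freed up */
	off    = 6 & (8 - 1) = 6;
	len    = min(5, 8 - 6) = 2;	/* limited by the wrap-around point */

so cqe_cached/cqe_sentinel cover the two slots up to the wrap, and once those are used the next call starts a fresh range at offset 0.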
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1ac829198a881b7af8710926f99a3559b9f24c0.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 36 +----------------------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8c1b0e0ce5bb8a..df6a9abdd966ad 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -166,6 +166,11 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -676,6 +681,36 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 18754fb790255e..94bd6732f55873 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -14,44 +14,10 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, u64 extra1, u64 extra2); -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < 
ctx->cqe_sentinel)) {
From 68494a65d0e2de3d99b28ae050971b6161eabac0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 17 Jun 2022 09:48:02 +0100
Subject: [PATCH 088/172] io_uring: introduce io_req_cqe_overflow()

__io_fill_cqe_req() is hot and inlined, we want it to be as small as
possible. Add io_req_cqe_overflow() accepting only a request and doing
all overflow accounting, and use it to replace the two calls to the
six-argument io_cqring_event_overflow().

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/048b9fbcce56814d77a1a540409c98c3d383edcb.1655455613.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 15 +++++++++++++--
 io_uring/io_uring.h | 12 ++----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index df6a9abdd966ad..7acb94c180b8e8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -643,8 +643,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 	}
 }

-bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
-			      u32 cflags, u64 extra1, u64 extra2)
+static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+				     s32 res, u32 cflags, u64 extra1, u64 extra2)
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
@@ -681,6 +681,17 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	return true;
 }

+bool io_req_cqe_overflow(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_CQE32_INIT)) {
+		req->extra1 = 0;
+		req->extra2 = 0;
+	}
+	return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
+					req->cqe.res, req->cqe.flags,
+					req->extra1, req->extra2);
+}
+
 /*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 94bd6732f55873..88c64a69beaaeb 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -15,8 +15,7 @@ enum {
 };

 struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
-bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
-			      u32 cflags, u64 extra1, u64 extra2);
+bool io_req_cqe_overflow(struct io_kiocb *req);

 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 {
@@ -56,10 +55,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 			memcpy(cqe, &req->cqe, sizeof(*cqe));
 			return true;
 		}
-
-		return io_cqring_event_overflow(ctx, req->cqe.user_data,
-						req->cqe.res, req->cqe.flags,
-						0, 0);
 	} else {
 		u64 extra1 = 0, extra2 = 0;

@@ -83,11 +78,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 			WRITE_ONCE(cqe->big_cqe[1], extra2);
 			return true;
 		}
-
-		return io_cqring_event_overflow(ctx, req->cqe.user_data,
-						req->cqe.res, req->cqe.flags,
-						extra1, extra2);
 	}
+	return io_req_cqe_overflow(req);
 }

 static inline void req_set_fail(struct io_kiocb *req)
From ae5735c69bf28fea0b8260a03caeb906319228a2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 17 Jun 2022 09:48:03 +0100
Subject: [PATCH 089/172] io_uring: deduplicate __io_fill_cqe_req tracing

Deduplicate two trace_io_uring_complete() calls in __io_fill_cqe_req().
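The end state is a single unconditional trace call at the top of __io_fill_cqe_req(), with the CQE32 extras passed conditionally. In sketch form, matching the hunks below:

	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);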
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/277ed85dba5189ab7d932164b314013a0f0b0fdc.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 88c64a69beaaeb..763915c665933f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -41,10 +41,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (!(ctx->flags & IORING_SETUP_CQE32)) { /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -63,9 +65,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in From e8c328c3913d381bf60f2aecdf350c04b5b7e67d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:04 +0100 Subject: [PATCH 090/172] io_uring: deduplicate io_get_cqe() calls Deduplicate calls to io_get_cqe() from __io_fill_cqe_req(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4fa077986cc3abab7c59ff4e7c390c783885465f.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 763915c665933f..dfb490e7cf4593 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -45,19 +45,17 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, req->cqe.res, req->cqe.flags, (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (unlikely(!cqe)) + return io_req_cqe_overflow(req); + memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (!(ctx->flags & IORING_SETUP_CQE32)) { - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - } else { + if (ctx->flags & IORING_SETUP_CQE32) { u64 extra1 = 0, extra2 = 0; if (req->flags & REQ_F_CQE32_INIT) { @@ -65,20 +63,10 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); } - return io_req_cqe_overflow(req); + return true; } static inline void req_set_fail(struct io_kiocb *req) From b3659a65be70eb68d9fc9802c4ce81e0f943abfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:05 +0100 Subject: [PATCH 091/172] io_uring: change ->cqe_cached invariant for CQE32 With IORING_SETUP_CQE32 ->cqe_cached doesn't store a real address but rather an implicit offset into cqes. Store the real cqe pointer and increment it accordingly if CQE32. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ee1838cba16bed96381a006950b36ba640d998c.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++++----- io_uring/io_uring.h | 8 ++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7acb94c180b8e8..0dbf6a74f9f3a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -701,11 +701,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; unsigned int free, queued, len; - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -715,11 +712,19 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) if (!len) return NULL; - ctx->cached_cq_tail++; + if (ctx->flags & IORING_SETUP_CQE32) { + off <<= 1; + len <<= 1; + } + ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; + + ctx->cached_cq_tail++; ctx->cqe_cached++; - return &rings->cqes[off << shift]; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return &rings->cqes[off]; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dfb490e7cf4593..558a860a93fcd6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -22,14 +22,10 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { struct io_uring_cqe *cqe = ctx->cqe_cached; - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - ctx->cached_cq_tail++; ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; return cqe; } From 27a9d66fec77cff0e32d2ecd5d0ac7ef878a7bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:18 +0100 Subject: [PATCH 092/172] io_uring: kill extra io_uring_types.h includes io_uring/io_uring.h already includes io_uring_types.h, no need to include it every time. Kill it in a bunch of places, it prepares us for following patches. 
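The change itself is mechanical; in each affected file the pair of includes collapses to one, since io_uring.h pulls the types in transitively (sketch):

	/* before */
	#include "io_uring_types.h"
	#include "io_uring.h"

	/* after: io_uring.h already includes io_uring_types.h */
	#include "io_uring.h"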
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94d8c943fbe0ef949981c508ddcee7fc1c18850f.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/advise.c | 1 - io_uring/cancel.c | 1 - io_uring/epoll.c | 1 - io_uring/fdinfo.c | 1 - io_uring/filetable.c | 1 - io_uring/fs.c | 1 - io_uring/io_uring.c | 1 - io_uring/kbuf.c | 1 - io_uring/msg_ring.c | 1 - io_uring/net.c | 1 - io_uring/nop.c | 1 - io_uring/opdef.c | 1 - io_uring/openclose.c | 1 - io_uring/poll.c | 1 - io_uring/rsrc.c | 1 - io_uring/rw.c | 1 - io_uring/splice.c | 1 - io_uring/sqpoll.c | 1 - io_uring/statx.c | 1 - io_uring/sync.c | 1 - io_uring/tctx.c | 1 - io_uring/timeout.c | 1 - io_uring/uring_cmd.c | 1 - io_uring/xattr.c | 1 - 24 files changed, 24 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index 8870fdf66ffbbd..581956934c0bf0 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -11,7 +11,6 @@ #include #include -#include "io_uring_types.h" #include "io_uring.h" #include "advise.h" diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f07bfd27c98ac2..d1e7f5a955ab20 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "tctx.h" #include "poll.h" diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 10853e8ed07887..a8b794471d6b83 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "epoll.h" diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 344e7d90d55756..61c35707a6cfc2 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "sqpoll.h" #include "fdinfo.h" diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e449ceb9a848e6..534e1a3c625d9f 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "rsrc.h" #include "filetable.h" diff --git a/io_uring/fs.c b/io_uring/fs.c index aac1bc5255b07f..0de4f549bb7df6 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -12,7 +12,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "fs.h" diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0dbf6a74f9f3a1..0a1f83a936b753 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include "io-wq.h" -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 59e4fafeb28ce4..62de0dda24bf6e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -11,7 +11,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7c3c5f3ab06b57..b02be23496521c 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "msg_ring.h" diff --git a/io_uring/net.c b/io_uring/net.c index 35d0183fe75817..b77bfbfb081673 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "kbuf.h" #include "net.h" diff --git a/io_uring/nop.c b/io_uring/nop.c index d363d8ce70a3b7..d956599a3c1b8f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "nop.h" diff --git a/io_uring/opdef.c 
b/io_uring/opdef.c
index d687d33f9c0c03..a7b84b43e6c235 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -8,7 +8,6 @@
 #include
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "refs.h"
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 1cbf3903097053..099a5ec84dfdb7 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -12,7 +12,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "rsrc.h"
 #include "openclose.h"
diff --git a/io_uring/poll.c b/io_uring/poll.c
index e0c181fe6264c0..63aca920543b31 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -13,7 +13,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "refs.h"
 #include "opdef.h"
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7fed3105152a95..68629eba413265 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -12,7 +12,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e5ca23d0783e2c..f8b42f2265df98 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -14,7 +14,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "kbuf.h"
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 0e19d63303452a..b013ba34bffa58 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -11,7 +11,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "splice.h"
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 149d5c976f1467..76d4d70c733a99 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -14,7 +14,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "sqpoll.h"
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 83b15687e9c5ee..6056cd7f48761c 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -8,7 +8,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "statx.h"
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 9ee8ff865521f3..f2102afa79ca63 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -11,7 +11,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "sync.h"
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index a819da8fc85cd2..9b30fb0d360307 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -9,7 +9,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "tctx.h"
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 526fc8b2e3b65d..f9df359813c93e 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -8,7 +8,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "refs.h"
 #include "cancel.h"
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index abf78918a0995f..0a421ed51e7e16 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -6,7 +6,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "uring_cmd.h"
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 79adf4efba0184..b179f9acd5acc5 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -13,7 +13,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "xattr.h"
From ab1c84d855cf2c284fa0f4b17fc04063659c54a1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Jun 2022 13:57:19 +0100
Subject: [PATCH 093/172] io_uring: make io_uring_types.h public

Move the io_uring types to include/linux; they need to be public so
tracing can see the definitions and we can clean up
trace/events/io_uring.h Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a15f12e8cb7289b2de0deaddcc7518d98a132d17.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- {io_uring => include/linux}/io_uring_types.h | 28 ++++++++++++++++++-- io_uring/filetable.h | 11 -------- io_uring/io-wq.h | 17 +----------- io_uring/io_uring.h | 4 ++- io_uring/refs.h | 2 +- 5 files changed, 31 insertions(+), 31 deletions(-) rename {io_uring => include/linux}/io_uring_types.h (96%) diff --git a/io_uring/io_uring_types.h b/include/linux/io_uring_types.h similarity index 96% rename from io_uring/io_uring_types.h rename to include/linux/io_uring_types.h index 65ac7cdaaa73f4..779c72da5b8faf 100644 --- a/io_uring/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -6,8 +6,32 @@ #include #include -#include "io-wq.h" -#include "filetable.h" +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +struct io_wq_work { + struct io_wq_work_node list; + unsigned flags; + /* place it here instead of io_kiocb as it fills padding and saves 4B */ + int cancel_seq; +}; + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; struct io_hash_bucket { spinlock_t lock; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index c404360f709053..6b58aa48bc45d3 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -22,17 +22,6 @@ struct io_kiocb; #endif #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 3f54ee2a8eebd3..10b80ef78bb817 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -2,6 +2,7 @@ #define INTERNAL_IO_WQ_H #include +#include struct io_wq; @@ -20,15 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) @@ -152,13 +144,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) return node; } -struct io_wq_work { - struct io_wq_work_node list; - unsigned flags; - /* place it here instead of io_kiocb as it fills padding and saves 4B */ - int cancel_seq; -}; - static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) { if (!work->list.next) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 558a860a93fcd6..5eaa01c4697c37 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,7 +3,9 @@ #include #include -#include "io_uring_types.h" +#include +#include "io-wq.h" +#include "filetable.h" #ifndef CREATE_TRACE_POINTS #include diff --git a/io_uring/refs.h b/io_uring/refs.h index 334c5ead4c43d9..1336de3f2a30aa 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -2,7 +2,7 @@ #define IOU_REQ_REF_H #include -#include "io_uring_types.h" +#include /* * Shamelessly stolen from the mm implementation 
of page reference checking, From 48863ffd3e81b6ec98606d3479c50aa77b7a98a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:20 +0100 Subject: [PATCH 094/172] io_uring: clean up tracing events We have lots of trace events accepting an io_uring request and wanting to print some of its fields like user_data, opcode, flags and so on. However, as trace points were unaware of io_uring structures, we had to pass all the fields as arguments. Teach trace/events/io_uring.h about struct io_kiocb and stop the misery of passing a horde of arguments to trace helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40ff72f92798114e56d400f2b003beb6cde6ef53.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 142 +++++++++++++------------------- io_uring/io_uring.c | 16 ++-- io_uring/poll.c | 5 +- io_uring/timeout.c | 3 +- 4 files changed, 66 insertions(+), 100 deletions(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index aa2f951b07cdff..3bc8dec9acaacf 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,7 @@ #include #include +#include #include struct io_wq_work; @@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register, /** * io_uring_file_get - called before getting references to an SQE file * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can @@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register, */ TRACE_EVENT(io_uring_file_get, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd), + TP_PROTO(struct io_kiocb *req, int fd), - TP_ARGS(ctx, req, user_data, fd), + TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) @@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; + __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), @@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get, /** * io_uring_queue_async_work - called before submitting a new async work * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags - * @work: pointer to a submitted io_wq_work * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. 
*/ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode, - unsigned int flags, struct io_wq_work *work, int rw), + TP_PROTO(struct io_kiocb *req, int rw), - TP_ARGS(ctx, req, user_data, opcode, flags, work, rw), + TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) @@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work, __field( struct io_wq_work *, work ) __field( int, rw ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->flags = flags; - __entry->opcode = opcode; - __entry->work = work; + __entry->user_data = req->cqe.user_data; + __entry->flags = req->flags; + __entry->opcode = req->opcode; + __entry->work = &req->work; __entry->rw = rw; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", @@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work, /** * io_uring_defer - called when an io_uring request is deferred * - * @ctx: pointer to a ring context structure * @req: pointer to a deferred request - * @user_data: user data associated with the request - * @opcode: opcode of request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode), + TP_PROTO(struct io_kiocb *req), - TP_ARGS(ctx, req, user_data, opcode), + TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer, __field( unsigned long long, data ) __field( u8, opcode ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->data = user_data; - __entry->opcode = opcode; + __entry->data = req->cqe.user_data; + __entry->opcode = req->opcode; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", @@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer, * io_uring_link - called before the io_uring request added into link_list of * another request * - * @ctx: pointer to a ring context structure * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * @@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer, */ TRACE_EVENT(io_uring_link, - TP_PROTO(void *ctx, void *req, void *target_req), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), - TP_ARGS(ctx, req, target_req), + TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), @@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait, /** * io_uring_fail_link - called before failing a linked request * - * @ctx: pointer to a ring context structure * @req: request, which links were cancelled - * @user_data: user data associated with the request - * @opcode: opcode of request * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work @@ -296,9 +282,9 
@@ TRACE_EVENT(io_uring_cqring_wait, */ TRACE_EVENT(io_uring_fail_link, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), - TP_ARGS(ctx, req, user_data, opcode, link), + TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) @@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link, __field( u8, opcode ) __field( void *, link ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->link = link; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", @@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete, /** * io_uring_submit_sqe - called before submitting one SQE * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags * @force_nonblock: whether a context blocking or not - * @sq_thread: true if sq_thread has submitted this SQE * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags, - bool force_nonblock, bool sq_thread), + TP_PROTO(struct io_kiocb *req, bool force_nonblock), - TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread), + TP_ARGS(req, force_nonblock), TP_STRUCT__entry ( __field( void *, ctx ) @@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe, __field( bool, force_nonblock ) __field( bool, sq_thread ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; - __entry->flags = flags; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; + __entry->flags = req->flags; __entry->force_nonblock = force_nonblock; - __entry->sq_thread = sq_thread; + __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " @@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe, /* * io_uring_poll_arm - called after arming a poll wait if successful * - * @ctx: pointer to a ring context structure * @req: pointer to the armed request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * @events: registered events of interest * @@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe, */ TRACE_EVENT(io_uring_poll_arm, - TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode, - int mask, int events), + TP_PROTO(struct io_kiocb *req, int mask, int events), - TP_ARGS(ctx, req, user_data, opcode, mask, events), + TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) @@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm, __field( int, mask ) __field( int, events ) - __string( op_str, 
io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", @@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm, /* * io_uring_task_add - called after adding a task * - * @ctx: pointer to a ring context structure * @req: pointer to request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask), + TP_PROTO(struct io_kiocb *req, int mask), - TP_ARGS(ctx, req, user_data, opcode, mask), + TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) @@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add, __field( u8, opcode ) __field( int, mask ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", @@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add, * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed - * @ctx: pointer to a ring context structure * @req: pointer to request * @error: error it failed with * @@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add, */ TRACE_EVENT(io_uring_req_failed, - TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error), + TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), - TP_ARGS(sqe, ctx, req, error), + TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) @@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a1f83a936b753..ef4371790aaa5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -452,9 +452,7 @@ void io_queue_iowq(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, - req->opcode, req->flags, &req->work, - io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1583,7 +1581,7 @@ static __cold void io_drain_req(struct io_kiocb *req) goto queue; } - trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); + trace_io_uring_defer(req); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -1783,7 +1781,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) { 
struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); + trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) @@ -2006,7 +2004,7 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; - trace_io_uring_req_failed(sqe, ctx, req, ret); + trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL @@ -2048,9 +2046,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, - req->flags, true, - ctx->flags & IORING_SETUP_SQPOLL); + trace_io_uring_submit_sqe(req, true); /* * If we already have a head request, queue this one for async @@ -2064,7 +2060,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(ctx, req, link->head); + trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; diff --git a/io_uring/poll.c b/io_uring/poll.c index 63aca920543b31..b2659b56c702f3 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -288,7 +288,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + trace_io_uring_task_add(req, mask); io_req_task_work_add(req); } @@ -558,8 +558,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); + trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f9df359813c93e..557c637af158be 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -115,8 +115,7 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); + trace_io_uring_fail_link(req, link); if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; From ad163a7e2562230c77102c60f668bac440e60cce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 19:44:33 -0600 Subject: [PATCH 095/172] io_uring: move a few private types to local headers Commit 3a3d47fa9cfd ("io_uring: make io_uring_types.h public") moved a bunch of io_uring types to a kernel wide header, so we could make tracing a bit saner rather than pass in a ton of arguments. However, there are a few types in there that are not really needed to be system wide. Move the cancel data and mapped buffers back to the appropriate io_uring local headers. 
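Note the technique that keeps the types private: a header that only passes the type around by pointer gets a bare forward declaration instead of the full definition, e.g. in poll.h (taken from the hunks below):

	struct io_cancel_data;
	int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
			   unsigned issue_flags);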
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 18 ------------------ io_uring/cancel.h | 13 +++++++++++++ io_uring/fdinfo.c | 1 + io_uring/poll.h | 1 + io_uring/rsrc.h | 8 ++++++++ io_uring/timeout.h | 1 + 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 779c72da5b8faf..2015f3ea7cb76b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -528,27 +528,9 @@ struct io_kiocb { struct io_wq_work work; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_overflow_cqe { struct list_head list; struct io_uring_cqe cqe; }; -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - #endif diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 8dd259dc383e2c..2338012a5b06fb 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -1,5 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include + +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + + int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 61c35707a6cfc2..b29e2d02216f22 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -13,6 +13,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" +#include "rsrc.h" #ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, diff --git a/io_uring/poll.h b/io_uring/poll.h index fa3e19790281b2..c40673d7da0199 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,6 +24,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); +struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 872c86312cbc2d..03f26516e99463 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -45,6 +45,14 @@ struct io_rsrc_node { bool done; }; +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index dd7cfb0d936671..858c62644897a5 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -22,6 +22,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) } __cold void io_flush_timeouts(struct io_ring_ctx *ctx); +struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); From d142c3ec8d160bea9801f0d727e92007787df8c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:04 +0100 Subject: [PATCH 096/172] io_uring: remove extra io_commit_cqring() We don't post events in __io_commit_cqring_flush() anymore but send all requests to tw, so no need to do 
io_commit_cqring() there.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/f2481e32375e749be89c42e4804268b608722cef.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ef4371790aaa5d..efad2d9b7b42c5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -480,7 +480,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 			io_flush_timeouts(ctx);
 		if (ctx->drain_active)
 			io_queue_deferred(ctx);
-		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
 	}
 	if (ctx->has_evfd)
From 9046c6415be60f51f60f8b771a74ac4e72e3599d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 19 Jun 2022 12:26:05 +0100
Subject: [PATCH 097/172] io_uring: reshuffle io_uring/io_uring.h

It's a good idea to first do forward declarations and then inline
helpers, otherwise we will keep stumbling on dependencies between them.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/1d7fa6672ed43f20ccc0c54ae201369ebc3ebfab.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 95 ++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 48 deletions(-)

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 5eaa01c4697c37..7b2055b342dfab 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -18,6 +18,53 @@ enum {
 struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
 bool io_req_cqe_overflow(struct io_kiocb *req);
+int io_run_task_work_sig(void);
+void io_req_complete_failed(struct io_kiocb *req, s32 res);
+void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
+void io_req_complete_post(struct io_kiocb *req);
+void __io_req_complete_post(struct io_kiocb *req);
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
+void io_cqring_ev_posted(struct io_ring_ctx *ctx);
+void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
+
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+
+struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+			       unsigned issue_flags);
+
+bool io_is_uring_fops(struct file *file);
+bool io_alloc_async_data(struct io_kiocb *req);
+void io_req_task_work_add(struct io_kiocb *req);
+void io_req_task_prio_work_add(struct io_kiocb *req);
+void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
+void io_req_task_queue(struct io_kiocb *req);
+void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
+void io_req_task_complete(struct io_kiocb *req, bool *locked);
+void io_req_task_queue_fail(struct io_kiocb *req, int ret);
+void io_req_task_submit(struct io_kiocb *req, bool *locked);
+void tctx_task_work(struct callback_head *cb);
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
+int io_uring_alloc_task_context(struct task_struct *task,
+				struct io_ring_ctx *ctx);
+
+int io_poll_issue(struct io_kiocb *req, bool *locked);
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
+void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
+int io_req_prep_async(struct io_kiocb *req);
+
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+void io_wq_submit_work(struct io_wq_work *work);
+
+void io_free_req(struct io_kiocb *req);
+void io_queue_next(struct io_kiocb *req);
+
+bool
io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { @@ -177,52 +224,4 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -int io_run_task_work_sig(void); -void io_req_complete_failed(struct io_kiocb *req, s32 res); -void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __io_commit_cqring_flush(struct io_ring_ctx *ctx); - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - -struct file *io_file_get_normal(struct io_kiocb *req, int fd); -struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); - -bool io_is_uring_fops(struct file *file); -bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); -void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); -void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); -void tctx_task_work(struct callback_head *cb); -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); - -int io_poll_issue(struct io_kiocb *req, bool *locked); -int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); -int io_req_prep_async(struct io_kiocb *req); - -struct io_wq_work *io_wq_free_work(struct io_wq_work *work); -void io_wq_submit_work(struct io_wq_work *work); - -void io_free_req(struct io_kiocb *req); -void io_queue_next(struct io_kiocb *req); - -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all); - -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - #endif From a830ffd28780627b6287bcd5b84e9fe2dd795935 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:06 +0100 Subject: [PATCH 098/172] io_uring: move io_eventfd_signal() Move io_eventfd_signal() in the sources without any changes and kill its forward declaration. 
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/9ebebb3f6f56f5a5448a621e0b6a537720c43334.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index efad2d9b7b42c5..61d4e6d0731a20 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -142,8 +142,6 @@ static void io_queue_sqe(struct io_kiocb *req);

 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);

-static void io_eventfd_signal(struct io_ring_ctx *ctx);
-
 static struct kmem_cache *req_cachep;

 struct sock *io_uring_get_socket(struct file *file)
@@ -472,20 +470,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 	}
 }

-void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
-	if (ctx->off_timeout_used || ctx->drain_active) {
-		spin_lock(&ctx->completion_lock);
-		if (ctx->off_timeout_used)
-			io_flush_timeouts(ctx);
-		if (ctx->drain_active)
-			io_queue_deferred(ctx);
-		spin_unlock(&ctx->completion_lock);
-	}
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
-}
-
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd;
@@ -513,6 +497,20 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 	rcu_read_unlock();
 }

+void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+	if (ctx->off_timeout_used || ctx->drain_active) {
+		spin_lock(&ctx->completion_lock);
+		if (ctx->off_timeout_used)
+			io_flush_timeouts(ctx);
+		if (ctx->drain_active)
+			io_queue_deferred(ctx);
+		spin_unlock(&ctx->completion_lock);
+	}
+	if (ctx->has_evfd)
+		io_eventfd_signal(ctx);
+}
+
 /*
  * This should only get called when at least one event has been posted.
  * Some applications rely on the eventfd notification count only changing
From d9dee4302a7cbd6c0142dbdf6d150acc7459de0d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 19 Jun 2022 12:26:08 +0100
Subject: [PATCH 099/172] io_uring: remove ->flush_cqes optimisation

It's not clear how widely used IOSQE_CQE_SKIP_SUCCESS is, and how often
the ->flush_cqes flag actually prevents a completion flush. Sometimes a
high level of concurrency enables it for at least one CQE, but sometimes
it doesn't save much because nobody is waiting on the CQ.

Remove the ->flush_cqes flag and the optimisation; it should benefit the
normal use case. Note that there is no spurious eventfd problem with
this, as the checks for spuriousness were incorporated into
io_eventfd_signal().
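Both halves of the optimisation disappear: the producer side no longer tracks whether any queued completion actually needs a CQE, and the flush runs unconditionally. The producer side being removed looked like this (sketch, taken from the hunks below):

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	/* this flag setting is what gets removed */
	if (!(req->flags & REQ_F_CQE_SKIP))
		state->flush_cqes = true;
	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}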
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/692e81eeddccc096f449a7960365fa7b4a18f8e6.1655637157.git.asml.silence@gmail.com
[axboe: remove now dead state->flush_cqes variable]
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  1 -
 io_uring/io_uring.c            | 23 ++++++++++-------------
 io_uring/io_uring.h            |  2 --
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2015f3ea7cb76b..6bcd7bff6479be 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -148,7 +148,6 @@ struct io_submit_state {

 	bool			plug_started;
 	bool			need_plug;
-	bool			flush_cqes;
 	unsigned short		submit_nr;
 	struct blk_plug		plug;
 };
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 61d4e6d0731a20..16a625e854ec33 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1250,22 +1250,19 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;

-	if (state->flush_cqes) {
-		spin_lock(&ctx->completion_lock);
-		wq_list_for_each(node, prev, &state->compl_reqs) {
-			struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    comp_list);
-
-			if (!(req->flags & REQ_F_CQE_SKIP))
-				__io_fill_cqe_req(ctx, req);
-		}
+	spin_lock(&ctx->completion_lock);
+	wq_list_for_each(node, prev, &state->compl_reqs) {
+		struct io_kiocb *req = container_of(node, struct io_kiocb,
+					    comp_list);

-		io_commit_cqring(ctx);
-		spin_unlock(&ctx->completion_lock);
-		io_cqring_ev_posted(ctx);
-		state->flush_cqes = false;
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe_req(ctx, req);
 	}
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_cqring_ev_posted(ctx);

 	io_free_batch_list(ctx, state->compl_reqs.first);
 	INIT_WQ_LIST(&state->compl_reqs);
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7b2055b342dfab..bdc62727638be8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -219,8 +219,6 @@ static inline void io_req_add_compl_list(struct io_kiocb *req)
 {
 	struct io_submit_state *state = &req->ctx->submit_state;

-	if (!(req->flags & REQ_F_CQE_SKIP))
-		state->flush_cqes = true;
 	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
From affa87db90108d9f017f927bcdab536e32c3915e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 20 Jun 2022 01:25:52 +0100
Subject: [PATCH 100/172] io_uring: fix multi ctx cancellation

io_uring_try_cancel_requests() loops until there is nothing left to do
with the ring; however, there might be several rings with dependencies
between them, e.g. via poll requests. Instead of cancelling rings one
by one, try to cancel them all at once and only loop again if there is
still potentially some work to do.
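In sketch form, the retry moves out of io_uring_try_cancel_requests() and into its callers, which now sweep all rings before deciding whether to go around again (abbreviated from the hunks below; the SQPOLL handling is omitted):

	do {
		bool loop = false;

		xa_for_each(&tctx->xa, index, node)
			loop |= io_uring_try_cancel_requests(node->ctx,
							     current, cancel_all);
		if (!loop)
			break;
		cond_resched();
	} while (1);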
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8d491fe02d8ac4c77ff38061cf86b9a827e8845c.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 87 ++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 16a625e854ec33..707b599b9224a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -132,7 +132,7 @@ struct io_defer_entry { #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -2648,7 +2648,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_uring_try_cancel_requests(ctx, NULL, true); + while (io_uring_try_cancel_requests(ctx, NULL, true)) + cond_resched(); + if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; @@ -2806,53 +2808,48 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + enum io_wq_cancel cret; + bool ret = false; /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) - return; - - while (1) { - enum io_wq_cancel cret; - bool ret = false; + return false; - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. - */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. 
+ */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + (ctx->sq_data && ctx->sq_data->thread == current)) { + while (!wq_list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; } - - ret |= io_cancel_defer_files(ctx, task, cancel_all); - mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); } + + ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); + ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); + ret |= io_kill_timeouts(ctx, task, cancel_all); + if (task) + ret |= io_run_task_work(); + return ret; } static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) @@ -2882,6 +2879,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) atomic_inc(&tctx->in_idle); do { + bool loop = false; + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); @@ -2896,13 +2895,19 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) continue; - io_uring_try_cancel_requests(node->ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(node->ctx, + current, cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(ctx, + current, + cancel_all); + } + + if (loop) { + cond_resched(); + continue; } prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); From ba3cdb6fbb6e8eb525c868c60e103c5711edc068 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:53 +0100 Subject: [PATCH 101/172] io_uring: improve task exit timeout cancellations Don't spin trying to cancel timeouts that are reachable but not cancellable, e.g. already executing. 
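The shape of the fix can be modelled in plain C (a sketch under assumed names, not the kernel code): the kill helper reports whether it actually cancelled anything, and the caller counts, and hence re-spins on, only real cancellations.

#include <stdbool.h>
#include <stdio.h>

enum { T_IDLE, T_RUNNING };

struct timeout { int state; };

/* returns true only if the timeout was actually cancelled */
static bool kill_timeout(struct timeout *t)
{
    if (t->state == T_RUNNING)  /* reachable, but already executing */
        return false;
    t->state = T_RUNNING;       /* claimed for cancellation */
    return true;
}

int main(void)
{
    struct timeout ts[3] = { { T_IDLE }, { T_RUNNING }, { T_IDLE } };
    int canceled = 0;

    for (int i = 0; i < 3; i++)
        if (kill_timeout(&ts[i]))
            canceled++;         /* busy timeouts are no longer counted */

    printf("canceled %d of 3\n", canceled);     /* prints 2 */
    return 0;
}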
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ab8a7440a60bbdf69ae514f672ad050e43dd1b03.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 557c637af158be..a79a7d6ef1b37a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -49,7 +49,7 @@ static inline void io_put_req(struct io_kiocb *req) } } -static void io_kill_timeout(struct io_kiocb *req, int status) +static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -64,7 +64,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); + return true; } + return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) @@ -620,10 +622,9 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); + if (io_match_task(req, tsk, cancel_all) && + io_kill_timeout(req, -ECANCELED)) canceled++; - } } spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); From b321823a03dc81a5b83b063e6e1904d612b53266 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:54 +0100 Subject: [PATCH 102/172] io_uring: fix io_poll_remove_all clang warnings clang complains about bitwise operations on bools; add a bit more verbosity to better show that we want to call io_poll_remove_all_table() twice but with different arguments. Reported-by: Nathan Chancellor Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f11d21dcdf9233e0eeb15fa13b858a05a78eb310.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index b2659b56c702f3..cbf44c38efd9ee 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -595,8 +595,11 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) __must_hold(&ctx->uring_lock) { - return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | - io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + bool ret; + + ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); + ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + return ret; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, From 305bef98870816ae58357d647521891ec558a92e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:55 +0100 Subject: [PATCH 103/172] io_uring: hide eventfd assumptions in eventfd paths Some io_uring-eventfd users assume that there won't be spurious wakeups. That assumption has to be honoured by all io_cqring_ev_posted() callers, which is inconvenient and from time to time leads to problems, but should be maintained so as not to break userspace. Instead of making the callers track whether a CQE was posted or not, hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw last time and triggers the eventfd only when ->cached_cq_tail has changed since then.
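The dedup logic is small enough to model in userspace (a sketch with made-up names; the real code also takes ->completion_lock around the check):

#include <stdio.h>

struct ctx {
    unsigned int cached_cq_tail;    /* CQEs posted so far */
    unsigned int evfd_last_cq_tail; /* tail seen at the last signal */
};

/* fire the (stand-in) eventfd only when the CQ tail has moved */
static void eventfd_signal(struct ctx *c)
{
    if (c->cached_cq_tail == c->evfd_last_cq_tail)
        return;                     /* nothing new posted: skip */
    c->evfd_last_cq_tail = c->cached_cq_tail;
    printf("signal at tail %u\n", c->cached_cq_tail);
}

int main(void)
{
    struct ctx c = { 0, 0 };

    eventfd_signal(&c);     /* spurious call: suppressed */
    c.cached_cq_tail++;     /* a CQE was posted */
    eventfd_signal(&c);     /* fires once */
    eventfd_signal(&c);     /* duplicate: suppressed */
    return 0;
}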
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ffc66bae37a2513080b601e4370e147faaa72c5.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 44 ++++++++++++++++++++-------------- io_uring/timeout.c | 3 +-- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6bcd7bff6479be..5987f8acca3838 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -314,6 +314,8 @@ struct io_ring_ctx { struct list_head defer_list; unsigned sq_thread_idle; + /* protected by ->completion_lock */ + unsigned evfd_last_cq_tail; }; enum { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707b599b9224a4..84f92362521670 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -473,6 +473,22 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; + bool skip; + + spin_lock(&ctx->completion_lock); + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count only + * changing IFF a new CQE has been added to the CQ ring. There's no + * depedency on 1:1 relationship between how many times this function is + * called (and hence the eventfd count) and number of CQEs posted to the + * CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; rcu_read_lock(); /* @@ -511,13 +527,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. 
- */ void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || @@ -530,7 +539,7 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - bool all_flushed, posted; + bool all_flushed; size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) @@ -539,7 +548,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); @@ -554,7 +562,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) else io_account_cq_overflow(ctx); - posted = true; list_del(&ocqe->list); kfree(ocqe); } @@ -567,8 +574,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return all_flushed; } @@ -758,8 +764,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, filled = io_fill_cqe_aux(ctx, user_data, res, cflags); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (filled) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return filled; } @@ -940,14 +945,12 @@ __cold void io_free_req(struct io_kiocb *req) static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool posted; spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); + io_disarm_next(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -2428,6 +2431,11 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, kfree(ev_fd); return ret; } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a79a7d6ef1b37a..424b2fc858b8b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -629,7 +629,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return canceled != 0; } From 253993210bd8aa3b39a392807c03c8ef1cd7dc3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:56 +0100 Subject: [PATCH 104/172] io_uring: introduce locking helpers for CQE posting spin_lock(&ctx->completion_lock); /* post CQEs */ io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); We have many places repeating this sequence, and the three-function unlock section is not ideal from a maintenance perspective and also makes it harder to add new locking/sync tricks. Introduce two helpers: io_cq_lock(), which is simple and only grabs ->completion_lock, and io_cq_unlock_post(), which encapsulates the three-call section.
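A minimal userspace model of the two helpers (a pthread mutex standing in for the completion spinlock; names are illustrative): the wakeup is deliberately issued after the unlock, matching the order the patch preserves.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

static void commit_cqring(void)    { printf("commit tail\n"); }
static void cqring_ev_posted(void) { printf("wake waiters\n"); }

static void cq_lock(void)
{
    pthread_mutex_lock(&completion_lock);
}

/* one helper owns the commit + unlock + wakeup ordering */
static void cq_unlock_post(void)
{
    commit_cqring();
    pthread_mutex_unlock(&completion_lock);
    cqring_ev_posted();             /* deliberately after unlock */
}

int main(void)
{
    cq_lock();
    printf("post CQEs\n");
    cq_unlock_post();
    return 0;
}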
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fe0c682bf7f7b55d9be55b0d034be9c1949277dc.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 57 +++++++++++++++++++++------------------------ io_uring/io_uring.h | 9 ++++++- io_uring/timeout.c | 6 ++--- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 84f92362521670..0db73d01455d10 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,7 +527,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -536,6 +536,19 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx) +{ + __io_cq_unlock_post(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -548,7 +561,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -572,9 +585,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return all_flushed; } @@ -760,11 +771,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, { bool filled; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return filled; } @@ -810,11 +819,9 @@ void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) @@ -946,11 +953,9 @@ static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); io_disarm_next(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -984,13 +989,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *uring_locked) { @@ -1006,7 +1004,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (req->ctx != *ctx) { if 
(unlikely(!*uring_locked && *ctx)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); ctx_flush_and_put(*ctx, uring_locked); *ctx = req->ctx; @@ -1014,7 +1012,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, *uring_locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); if (unlikely(!*uring_locked)) - spin_lock(&(*ctx)->completion_lock); + io_cq_lock(*ctx); } if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); @@ -1026,7 +1024,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, } while (node); if (unlikely(!*uring_locked)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); } static void handle_tw_list(struct io_wq_work_node *node, @@ -1261,10 +1259,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + __io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index bdc62727638be8..738fb96575ab7c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -24,7 +24,6 @@ void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -66,6 +65,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +static inline void io_cq_lock(struct io_ring_ctx *ctx) + __acquires(ctx->completion_lock) +{ + spin_lock(&ctx->completion_lock); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx); + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 424b2fc858b8b4..7e2c341f976258 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -617,7 +617,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_timeout *timeout, *tmp; int canceled = 0; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -627,8 +627,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, canceled++; } spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return canceled != 0; } From 46929b086886ade8302cff6e85ccea66bce0cf98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:57 +0100 Subject: [PATCH 105/172] io_uring: add io_commit_cqring_flush() Since __io_commit_cqring_flush users moved to different files, introduce io_commit_cqring_flush() helper and encapsulate all flags testing details inside. 
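The helper boils down to a cheap inline flag test in front of the out-of-line slow path; a hedged sketch with illustrative names:

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool off_timeout_used, drain_active, has_evfd; };

/* the rare slow path stays out of line */
static void __commit_cqring_flush(struct ctx *c)
{
    printf("slow flush of ctx %p\n", (void *)c);
}

/* cheap inline test; most calls fall straight through */
static inline void commit_cqring_flush(struct ctx *c)
{
    if (c->off_timeout_used || c->drain_active || c->has_evfd)
        __commit_cqring_flush(c);
}

int main(void)
{
    struct ctx c = { .has_evfd = true };

    commit_cqring_flush(&c);    /* takes the slow path */
    c.has_evfd = false;
    commit_cqring_flush(&c);    /* no-op */
    return 0;
}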
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0da03887435dd9869ffe46dcd3962bf104afcca3.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- io_uring/io_uring.h | 6 ++++++ io_uring/rw.c | 5 +---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0db73d01455d10..3e65e04915a782 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -529,10 +529,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 738fb96575ab7c..afca7ff8956c32 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -229,4 +229,10 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + __io_commit_cqring_flush(ctx); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index f8b42f2265df98..0028e95e6633af 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1016,10 +1016,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_wake(ctx); } From f337a84d39527a2c46b1230470748d7bd0672ce0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:58 +0100 Subject: [PATCH 106/172] io_uring: opcode independent fixed buf import Fixed buffers are generic infrastructure, make io_import_fixed() opcode agnostic. 
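At its core the now-generic helper validates that the request's range lies inside the registered buffer before building the iterator; a standalone sketch of just that check (field names assumed from the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* models imu->ubuf / imu->ubuf_end from the patch */
struct mapped_buf { uint64_t ubuf, ubuf_end; };

static bool in_mapped_region(const struct mapped_buf *imu,
                             uint64_t buf_addr, uint64_t len)
{
    uint64_t buf_end;

    /* overflow-checked addition, like check_add_overflow() */
    if (__builtin_add_overflow(buf_addr, len, &buf_end))
        return false;
    /* reject anything not fully inside the mapped region */
    return buf_addr >= imu->ubuf && buf_end <= imu->ubuf_end;
}

int main(void)
{
    struct mapped_buf imu = { .ubuf = 0x1000, .ubuf_end = 0x2000 };

    printf("%d\n", in_mapped_region(&imu, 0x1800, 0x100));  /* 1 */
    printf("%d\n", in_mapped_region(&imu, 0x1f00, 0x200));  /* 0 */
    return 0;
}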
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b1e765c8a1c2c913a05a28d2399fc53e1d3cf37a.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0028e95e6633af..ded8ef01165cdd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,14 +273,15 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) { - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; + u64 buf_end; size_t offset; + if (WARN_ON_ONCE(!imu)) + return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ @@ -332,14 +333,6 @@ static int __io_import_fixed(struct io_kiocb *req, int ddir, return 0; } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -426,7 +419,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); + ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); if (ret) return ERR_PTR(ret); return NULL; From c059f785840807ed5e1f2810420c1555969b6246 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:59 +0100 Subject: [PATCH 107/172] io_uring: move io_import_fixed() Move io_import_fixed() into rsrc.c where it belongs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d5becb21f332b4fef6a7cedd6a50e65e2371630.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 3 +++ io_uring/rw.c | 60 ------------------------------------------------- 3 files changed, 63 insertions(+), 60 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 68629eba413265..1106089551595e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1310,3 +1310,63 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, io_rsrc_node_switch(ctx, NULL); return ret; } + +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + u64 buf_end; + size_t offset; + + if (WARN_ON_ONCE(!imu)) + return -EFAULT; + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. 
We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 03f26516e99463..87f58315b247bc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -64,6 +64,9 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index ded8ef01165cdd..e07f2670dfa815 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,66 +273,6 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) -{ - u64 buf_end; - size_t offset; - - if (WARN_ON_ONCE(!imu)) - return -EFAULT; - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. 
- */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) From 9da070b142820031a0e273097f2b39982f45bbfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:26:00 +0100 Subject: [PATCH 108/172] io_uring: consistent naming for inline completion Improve naming of the inline/deferred completion helper so it's consistent with its *_post counterpart. Add some comments and extra lockdep annotations to ensure the locking is done right.
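The locking contract can be sketched in userspace (illustrative names only; the assert stands in for lockdep_assert_held()): the deferred-completion helper may only run with the submission mutex held.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;
static int uring_locked;            /* poor man's lockdep state */

static void req_complete_defer(int req)
{
    assert(uring_locked);           /* models lockdep_assert_held() */
    printf("deferred completion of req %d\n", req);
}

int main(void)
{
    pthread_mutex_lock(&uring_lock);
    uring_locked = 1;
    req_complete_defer(1);          /* legal: mutex held */
    uring_locked = 0;
    pthread_mutex_unlock(&uring_lock);
    return 0;
}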
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec9a66f1e22f99dcd02288d4e42f3cc6bb357804.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index cbf44c38efd9ee..bd3110750cfada 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -833,6 +833,11 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } found: + if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { + ret = -EFAULT; + goto out; + } + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From 4a0fef62788b69df09267c8e3f3f11d4bb9d50e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 15:27:35 +0100 Subject: [PATCH 110/172] io_uring: optimize io_uring_task layout The task_work bits of io_uring_task are split across two cache lines, causing extra cache bouncing; place them into a separate cache line. Also move the most used submission path fields closer together, so they are hot. Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/tctx.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dde82ce4d8e29b..dead0ed0042984 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -7,22 +7,24 @@ struct io_uring_task { /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - bool task_running; - - struct file *registered_rings[IO_RINGFD_REG_MAX]; + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + spinlock_t task_lock; + bool task_running; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; }; struct io_tctx_node { From 625d38b3fd34c58afb969810c4b3105eabb3b143 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:00 +0100 Subject: [PATCH 111/172] io_uring: improve io_run_task_work() Since SQPOLL now uses TWA_SIGNAL_NO_IPI, there won't be task work items without TIF_NOTIFY_SIGNAL. Simplify io_run_task_work() by removing the task->task_works check. Even though it doesn't look like it causes extra cache bouncing, it's still nice not to touch it an extra time when it might not be cached.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/75d4f34b0c671075892821a409e28da6cb1d64fe.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7a00bbe85d35dd..4c4d38ffc5ec5e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -203,7 +203,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_run_task_work(void) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) { __set_current_state(TASK_RUNNING); clear_notify_signal(); if (task_work_pending(current)) From a6b21fbb4ce3c4976ba478a9f0f10d4163038478 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:01 +0100 Subject: [PATCH 112/172] io_uring: move list helpers to a separate file It's annoying to have io-wq.h as a dependency every time we want some of struct io_wq_work_list helpers, move them into a separate file. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1d891ce12b30767d1d2a3b7db2ca3abc1ecc4a2.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 1 + io_uring/io-wq.h | 131 ----------------------------------------- io_uring/io_uring.h | 1 + io_uring/slist.h | 138 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 131 deletions(-) create mode 100644 io_uring/slist.h diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 824623bcf1a53f..3e34dfbdf9466a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -18,6 +18,7 @@ #include #include "io-wq.h" +#include "slist.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 10b80ef78bb817..31228426d19241 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,137 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) - -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ -} while (0) - -static inline void wq_list_add_after(struct io_wq_work_node *node, - struct io_wq_work_node *pos, - struct io_wq_work_list *list) -{ - struct io_wq_work_node *next = pos->next; - - pos->next = node; - node->next = next; - if (!next) - list->last = node; -} - -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. 
- */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - -static inline void wq_list_add_tail(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = NULL; - if (!list->first) { - list->last = node; - WRITE_ONCE(list->first, node); - } else { - list->last->next = node; - list->last = node; - } -} - -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - -static inline void wq_list_cut(struct io_wq_work_list *list, - struct io_wq_work_node *last, - struct io_wq_work_node *prev) -{ - /* first in the list, if prev==NULL */ - if (!prev) - WRITE_ONCE(list->first, last->next); - else - prev->next = last->next; - - if (last == list->last) - list->last = prev; - last->next = NULL; -} - -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - -static inline void wq_stack_add_head(struct io_wq_work_node *node, - struct io_wq_work_node *stack) -{ - node->next = stack->next; - stack->next = node; -} - -static inline void wq_list_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) -{ - wq_list_cut(list, node, prev); -} - -static inline -struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) -{ - struct io_wq_work_node *node = stack->next; - - stack->next = node->next; - return node; -} - -static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) -{ - if (!work->list.next) - return NULL; - - return container_of(work->list.next, struct io_wq_work, list); -} - typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4c4d38ffc5ec5e..f026d2670959e6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include #include #include "io-wq.h" +#include "slist.h" #include "filetable.h" #ifndef CREATE_TRACE_POINTS diff --git a/io_uring/slist.h b/io_uring/slist.h new file mode 100644 index 00000000000000..f27601fa46607b --- /dev/null +++ b/io_uring/slist.h @@ -0,0 +1,138 @@ +#ifndef INTERNAL_IO_SLIST_H +#define INTERNAL_IO_SLIST_H + +#include + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) + +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + +/** + * wq_list_merge - merge the second list to the first one. 
+ * @list0: the first list + * @list1: the second list + * Return the first node after mergence. + */ +static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, + struct io_wq_work_list *list1) +{ + struct io_wq_work_node *ret; + + if (!list0->first) { + ret = list1->first; + } else { + ret = list0->first; + list0->last->next = list1->first; + } + INIT_WQ_LIST(list0); + INIT_WQ_LIST(list1); + return ret; +} + +static inline void wq_list_add_tail(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = NULL; + if (!list->first) { + list->last = node; + WRITE_ONCE(list->first, node); + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, + struct io_wq_work_node *prev) +{ + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) + list->last = prev; + last->next = NULL; +} + +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + +static inline void wq_list_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + wq_list_cut(list, node, prev); +} + +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; + + stack->next = node->next; + return node; +} + +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + +#endif // INTERNAL_IO_SLIST_H \ No newline at end of file From 024f15e033a52660a045947ee56c7e842180fa81 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:02 +0100 Subject: [PATCH 113/172] io_uring: dedup io_run_task_work We have an identical copy of io_run_task_work() for io-wq called io_flush_signals(), deduplicate them. 
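After the dedup, both the io-wq worker and the submission path share one helper; a rough userspace model of its logic (the flags are stand-ins for TIF_NOTIFY_SIGNAL and task_work_pending(), not the kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool notify_signal;      /* stands in for TIF_NOTIFY_SIGNAL */
static bool work_pending;       /* stands in for task_work_pending() */

/* the one shared helper both call sites now use */
static bool run_task_work(void)
{
    if (notify_signal) {
        notify_signal = false;  /* models clear_notify_signal() */
        if (work_pending) {
            work_pending = false;
            printf("ran pending task work\n");
        }
        return true;
    }
    return false;
}

int main(void)
{
    notify_signal = true;
    work_pending = true;
    while (run_task_work())
        ;                       /* loops until the flag is clear */
    return 0;
}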
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a157a4df5fa217b8bd03c73494f2fd0e24e44fbc.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.h | 2 ++ io_uring/io-wq.c | 17 +++-------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6b58aa48bc45d3..fb5a274c08ffcc 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -2,6 +2,8 @@ #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H +#include + struct io_ring_ctx; struct io_kiocb; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 3e34dfbdf9466a..77df5b43bf5239 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -19,6 +19,7 @@ #include "io-wq.h" #include "slist.h" +#include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -519,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, return NULL; } -static bool io_flush_signals(void) -{ - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - return false; -} - static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - io_flush_signals(); + io_run_task_work(); cond_resched(); } @@ -655,7 +644,7 @@ static int io_wqe_worker(void *data) last_timeout = false; __io_worker_idle(wqe, worker); raw_spin_unlock(&wqe->lock); - if (io_flush_signals()) + if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); if (signal_pending(current)) { From ed5ccb3beeba0cadb0fcf353ae192021dfecf252 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:21 -0700 Subject: [PATCH 114/172] io_uring: remove priority tw list optimisation This optimisation has some built in assumptions that make it easy to introduce bugs. It also does not have clear wins that make it worth keeping. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 77 +++++++-------------------------------------- io_uring/io_uring.h | 1 - io_uring/rw.c | 2 +- io_uring/tctx.c | 1 - io_uring/tctx.h | 1 - 5 files changed, 12 insertions(+), 70 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8bc63413fc547e..d21d0fc3645b12 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_prev_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *uring_locked) -{ - if (*ctx && !*uring_locked) - spin_lock(&(*ctx)->completion_lock); - - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - if (unlikely(!*uring_locked && *ctx)) - io_cq_unlock_post(*ctx); - - ctx_flush_and_put(*ctx, uring_locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *uring_locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - if (unlikely(!*uring_locked)) - io_cq_lock(*ctx); - } - if (likely(*uring_locked)) { - req->io_task_work.func(req, uring_locked); - } else { - req->cqe.flags = io_put_kbuf_comp(req); - __io_req_complete_post(req); - } - node = next; - } while (node); - - if (unlikely(!*uring_locked)) - io_cq_unlock_post(*ctx); -} - static void handle_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *locked) { @@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb) task_work); while (1) { - struct io_wq_work_node *node1, *node2; + struct io_wq_work_node *node; spin_lock_irq(&tctx->task_lock); - node1 = tctx->prio_task_list.first; - node2 = tctx->task_list.first; + node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - if (!node2 && !node1) + if (!node) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); - if (!node2 && !node1) + if (!node) break; - - if (node1) - handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) - handle_tw_list(node2, &ctx, &uring_locked); + handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - if (data_race(!tctx->task_list.first) && - data_race(!tctx->prio_task_list.first) && uring_locked) + if (data_race(!tctx->task_list.first) && uring_locked) io_submit_flush_completions(ctx); } @@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb) } static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx, - struct io_wq_work_list *list) + struct io_uring_task *tctx) { struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; @@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, bool running; spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, list); + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req, spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, 
flags); while (node) { @@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -void io_req_task_prio_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - if (req->ctx->flags & IORING_SETUP_SQPOLL) - __io_req_task_work_add(req, tctx, &tctx->prio_task_list); - else - __io_req_task_work_add(req, tctx, &tctx->task_list); + __io_req_task_work_add(req, tctx); } static void io_req_tw_post(struct io_kiocb *req, bool *locked) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f026d2670959e6..f77e4a5403e4e5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,7 +36,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); diff --git a/io_uring/rw.c b/io_uring/rw.c index e07f2670dfa815..ade3e235f2770c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 9b30fb0d360307..7a68ba9beec3ea 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dead0ed0042984..c8566ea5dca477 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -22,7 +22,6 @@ struct io_uring_task { spinlock_t task_lock; bool task_running; struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From c34398a8c018e0d3d2d30b718d03c7290c696f51 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:22 -0700 Subject: [PATCH 115/172] io_uring: remove __io_req_task_work_add this is no longer needed as there is only one caller Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d21d0fc3645b12..bf7ca2b279d3de 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx) +void io_req_task_work_add(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; @@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -void io_req_task_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - 
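The core trick is that llist_add() reports whether the list was previously empty, so only the first producer pays for task_work_add(); a standalone C11 model of that semantic (not the kernel llist implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct lnode { struct lnode *next; };

static _Atomic(struct lnode *) head;

/* returns true if the list was empty, i.e. this caller must kick */
static bool lockless_add(struct lnode *n)
{
    struct lnode *first = atomic_load(&head);

    do {
        n->next = first;        /* re-linked on every CAS retry */
    } while (!atomic_compare_exchange_weak(&head, &first, n));

    return first == NULL;
}

int main(void)
{
    struct lnode a, b;

    printf("kick: %d\n", lockless_add(&a));     /* 1: first producer */
    printf("kick: %d\n", lockless_add(&b));     /* 0: already pending */
    return 0;
}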
- __io_req_task_work_add(req, tctx); -} - static void io_req_tw_post(struct io_kiocb *req, bool *locked) { io_req_complete_post(req); From f88262e60bb9cb5740891672ce9f405e7f9393e5 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:23 -0700 Subject: [PATCH 116/172] io_uring: lockless task list With networking use cases we see contention on the spinlock used to protect the task_list when multiple threads try and add completions at once. Instead we can use a lockless list, and assume that the first caller to add to the list is responsible for kicking off task work. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-4-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 38 ++++++++-------------------------- io_uring/tctx.c | 3 +-- io_uring/tctx.h | 6 +++--- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5987f8acca3838..918165a200533f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { - struct io_wq_work_node node; + struct llist_node node; struct llist_node fallback_node; }; io_req_tw_func_t func; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bf7ca2b279d3de..0124335c6d092a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct io_wq_work_node *node, + +static void handle_tw_list(struct llist_node *node, struct io_ring_ctx **ctx, bool *locked) { do { - struct io_wq_work_node *next = node->next; + struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); + struct llist_node *node = llist_del_all(&tctx->task_list); - while (1) { - struct io_wq_work_node *node; - - spin_lock_irq(&tctx->task_lock); - node = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); - if (!node) - tctx->task_running = false; - spin_unlock_irq(&tctx->task_lock); - if (!node) - break; + if (node) { handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - - if (data_race(!tctx->task_list.first) && uring_locked) - io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct io_wq_work_node *node; - unsigned long flags; + struct llist_node *node; bool running; - spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); - running = tctx->task_running; - if (!running) - tctx->task_running = true; - spin_unlock_irqrestore(&tctx->task_lock, flags); + running = !llist_add(&req->io_task_work.node, &tctx->task_list); /* task_work already pending, we're done */ if (running) @@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - spin_lock_irqsave(&tctx->task_lock, flags); - tctx->task_running = false; - node = tctx->task_list.first; - 
INIT_WQ_LIST(&tctx->task_list); - spin_unlock_irqrestore(&tctx->task_lock, flags); + + node = llist_del_all(&tctx->task_list); while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7a68ba9beec3ea..7f97d97fef0a96 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); + init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index c8566ea5dca477..8a33ff6e5d9138 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + /* * Arbitrary limit, can be raised if need be */ @@ -19,9 +21,7 @@ struct io_uring_task { struct percpu_counter inflight; struct { /* task_work */ - spinlock_t task_lock; - bool task_running; - struct io_wq_work_list task_list; + struct llist_head task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From 923d159247b732885b176b24e4bafad8eda5a477 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:24 -0700 Subject: [PATCH 117/172] io_uring: introduce llist helpers Introduce helpers to atomically switch llist. Will later move this into common code Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0124335c6d092a..356000255211fb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node, } while (node); } +/** + * io_llist_xchg - swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @new: new entry as the head of the list + * + * If list is empty, return NULL, otherwise, return the pointer to the first entry. + * The order of entries returned is from the newest to the oldest added one. + */ +static inline struct llist_node *io_llist_xchg(struct llist_head *head, + struct llist_node *new) +{ + return xchg(&head->first, new); +} + +/** + * io_llist_cmpxchg - possibly swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @old: expected old value of the first entry of the list + * @new: new entry as the head of the list + * + * perform a cmpxchg on the first entry of the list. + */ + +static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, + struct llist_node *old, + struct llist_node *new) +{ + return cmpxchg(&head->first, old, new); +} + void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; From 3a0c037b0e16e38cf5d36d2ebc259e0ae644aaf4 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:25 -0700 Subject: [PATCH 118/172] io_uring: batch task_work Batching task work up is an important performance optimisation, as task_work_add is expensive. In order to keep the semantics replace the task_list with a fake node while processing the old list, and then do a cmpxchg at the end to see if there is more work. 
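A hedged standalone model of the sentinel dance (C11 atomics standing in for io_llist_xchg()/io_llist_cmpxchg(); single-threaded here, so the final cmpxchg succeeds on the first try):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct lnode { struct lnode *next; };

static _Atomic(struct lnode *) head;

static void run_until(struct lnode *node, struct lnode *last)
{
    while (node != last) {
        printf("work item %p\n", (void *)node);
        node = node->next;
    }
}

int main(void)
{
    struct lnode fake = { NULL }, a = { NULL };

    atomic_store(&head, &a);                    /* one queued item */

    /* grab pending work, leaving the sentinel behind */
    struct lnode *node = atomic_exchange(&head, &fake);
    run_until(node, NULL);                      /* old entries end in NULL */

    /* if the head is still just our sentinel, swap in NULL and stop */
    struct lnode *expected = &fake;
    while (!atomic_compare_exchange_strong(&head, &expected, NULL)) {
        node = atomic_exchange(&head, &fake);   /* new arrivals chain to fake */
        run_until(node, &fake);
        expected = &fake;
    }
    return 0;
}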
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 356000255211fb..9d523fafacb72d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } - static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked) + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { - do { + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1006,7 +1006,7 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; - } while (node); + } } /** @@ -1045,11 +1045,15 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); - struct llist_node *node = llist_del_all(&tctx->task_list); - - if (node) { - handle_tw_list(node, &ctx, &uring_locked); - cond_resched(); + struct llist_node fake = {}; + struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + + handle_tw_list(node, &ctx, &uring_locked, NULL); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + while (node != &fake) { + node = io_llist_xchg(&tctx->task_list, &fake); + handle_tw_list(node, &ctx, &uring_locked, &fake); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } ctx_flush_and_put(ctx, &uring_locked); From eccd8801858f03a19a4d1f8fef27e3d6e17b21fd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:27 -0700 Subject: [PATCH 119/172] io_uring: add trace event for running task work This is useful for investigating if task_work is batching Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-8-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 3bc8dec9acaacf..918e3a43e4b280 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->cflags, __entry->ocqe) ); +/* + * io_uring_task_work_run - ran task work + * + * @tctx: pointer to a io_uring_task + * @count: how many functions it ran + * @loops: how many loops it ran + * + */ +TRACE_EVENT(io_uring_task_work_run, + + TP_PROTO(void *tctx, unsigned int count, unsigned int loops), + + TP_ARGS(tctx, count, loops), + + TP_STRUCT__entry ( + __field( void *, tctx ) + __field( unsigned int, count ) + __field( unsigned int, loops ) + ), + + TP_fast_assign( + __entry->tctx = tctx; + __entry->count = count; + __entry->loops = loops; + ), + + TP_printk("tctx %p, count %u, loops %u", + __entry->tctx, __entry->count, __entry->loops) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ From c6dd763c246024ed0dd603db4284cbd0bd164e27 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:28 -0700 Subject: [PATCH 120/172] io_uring: trace task_work_run trace task_work_run to help provide stats on how often task work is run and what batch sizes are coming through. 
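As a usage note (editorial addition; assumes tracefs is mounted at
/sys/kernel/tracing), the event can be enabled like any other trace
event:

	echo 1 > /sys/kernel/tracing/events/io_uring/io_uring_task_work_run/enable
	cat /sys/kernel/tracing/trace_pipe

Per the TP_printk() format added in the previous patch, each run then
logs a line of the form "tctx 00000000a31dc10d, count 16, loops 2"
(values illustrative).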
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9d523fafacb72d..997b915a1ff78a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,10 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, - struct llist_node *last) +static unsigned int handle_tw_list(struct llist_node *node, + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { + unsigned int count = 0; + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, @@ -1006,7 +1008,10 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; + count++; } + + return count; } /** @@ -1047,12 +1052,14 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + unsigned int loops = 1; + unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL); - handle_tw_list(node, &ctx, &uring_locked, NULL); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); while (node != &fake) { + loops++; node = io_llist_xchg(&tctx->task_list, &fake); - handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &uring_locked, &fake); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } @@ -1061,6 +1068,8 @@ void tctx_task_work(struct callback_head *cb) /* relaxed read is enough as only the task itself sets ->in_idle */ if (unlikely(atomic_read(&tctx->in_idle))) io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, count, loops); } void io_req_task_work_add(struct io_kiocb *req) From 024b8fde3320ea34d7a5a3fc9dbc47ec736cd8eb Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Jun 2022 13:55:51 +0800 Subject: [PATCH 121/172] io_uring: kbuf: kill __io_kbuf_recycle() __io_kbuf_recycle() is only called in io_kbuf_recycle(). Kill it and tweak the code so that the legacy pbuf and ring pbuf code become clear Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220622055551.642370-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 71 +++++++++++++++++++++++++++++-------------------- io_uring/kbuf.h | 21 +++++---------- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 62de0dda24bf6e..8bf47e49ea5bc7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -37,36 +37,30 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return xa_load(&ctx->io_bl_xa, bgid); } -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. 
- * If the tail has already been incremented, hang on to it. + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. - */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } + if (req->flags & REQ_F_PARTIAL_IO) return; - } io_ring_submit_lock(ctx, issue_flags); @@ -77,16 +71,35 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); + return; } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) +void io_kbuf_recycle_ring(struct io_kiocb *req) { - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; } unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 304e7139d83562..721465c5d809fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -35,7 +35,6 @@ struct io_buffer { void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -49,6 +48,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_ring(struct io_kiocb *req); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -58,18 +60,6 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. 
- */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - /* * READV uses fields in `struct io_rw` (len/addr) to stash the selected * buffer data. However if that buffer is recycled the original request @@ -78,7 +68,10 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->opcode == IORING_OP_READV) return; - __io_kbuf_recycle(req, issue_flags); + if (req->flags & REQ_F_BUFFER_SELECTED) + io_kbuf_recycle_legacy(req, issue_flags); + if (req->flags & REQ_F_BUFFER_RING) + io_kbuf_recycle_ring(req); } static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, From 88f52eaad2df2cb5ab49b864d79398c9cb9a57f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:23:54 -0600 Subject: [PATCH 122/172] io_uring: have cancelation API accept io_uring_task directly We just use the io_kiocb passed in to find the io_uring_task, and we already pass in the ctx via cd->ctx anyway. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 17 +++++++++-------- io_uring/cancel.h | 2 +- io_uring/timeout.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index d1e7f5a955ab20..500ee5f5fd23ca 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -77,15 +77,15 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + struct io_ring_ctx *ctx = cd->ctx; int ret; - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); - ret = io_async_cancel_one(req->task->io_uring, cd); + ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. 
@@ -104,7 +104,6 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, return ret; } - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req); @@ -127,7 +126,8 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, +static int __io_async_cancel(struct io_cancel_data *cd, + struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); @@ -136,7 +136,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd, issue_flags); + ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) @@ -170,6 +170,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; + struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { @@ -185,7 +186,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) cd.file = req->file; } - ret = __io_async_cancel(&cd, req, issue_flags); + ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 2338012a5b06fb..1bc7e917ce94e6 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -16,6 +16,6 @@ struct io_cancel_data { int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7e2c341f976258..4af074b8f6b7d7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -274,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From 7d8ca7250197096bfa9f432c1d99b0555504bbba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:47:04 -0600 Subject: [PATCH 123/172] io_uring: add IORING_ASYNC_CANCEL_FD_FIXED cancel flag In preparation for not having a request to pass in that carries this state, add a separate cancelation flag that allows the caller to ask for a fixed file for cancelation. 
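From userspace the new flag travels in sqe->cancel_flags alongside
IORING_ASYNC_CANCEL_FD; a minimal sketch (editorial example; liburing's
io_uring_get_sqe() is used only to obtain the SQE, error handling
omitted):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	/* with FD_FIXED set, 'fd' is an index into the registered file table */
	sqe->fd = fixed_file_slot;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD |
			    IORING_ASYNC_CANCEL_FD_FIXED;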
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 2 ++
 io_uring/cancel.c             | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7ae81b10893e9..a09a78bd75566e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -247,10 +247,12 @@ enum io_uring_op {
  * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
  *				request 'user_data'
  * IORING_ASYNC_CANCEL_ANY	Match any request
+ * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
 #define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+#define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)
 
 /*
  * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 500ee5f5fd23ca..da486de07029a5 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -24,7 +24,7 @@ struct io_cancel {
 };
 
 #define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
-			 IORING_ASYNC_CANCEL_ANY)
+			 IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
 
 static bool io_cancel_cb(struct io_wq_work *work, void *data)
 {
@@ -174,11 +174,14 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
-		if (req->flags & REQ_F_FIXED_FILE)
+		if (req->flags & REQ_F_FIXED_FILE ||
+		    cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
+			req->flags |= REQ_F_FIXED_FILE;
 			req->file = io_file_get_fixed(req, cancel->fd,
 							issue_flags);
-		else
+		} else {
 			req->file = io_file_get_normal(req, cancel->fd);
+		}
 		if (!req->file) {
 			ret = -EBADF;
 			goto done;

From 78a861b9495920f8609dee5b670dacbff09d359f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 18 Jun 2022 10:00:50 -0600
Subject: [PATCH 124/172] io_uring: add sync cancelation API through
 io_uring_register()

The io_uring cancelation API is async, like any other API that we expose
there. For the case of finding a request to cancel, or not finding one,
it is fully sync in that when submission returns, the CQE for both the
cancelation request and the targeted request have been posted to the
CQ ring. However, if the targeted work is being executed by io-wq, the
API can only start the act of canceling it. This makes it difficult to
use in some circumstances, as the caller then has to wait for the CQEs
to come in and match on the same cancelation data there.

Provide an IORING_REGISTER_SYNC_CANCEL command for io_uring_register()
that does sync cancelations, always. For the io-wq case, it'll wait for
the cancelation to come in before returning. The only expected returns
from this API are:

0	Request found and canceled fine.
> 0	Requests found and canceled. Only happens if asked to cancel
	multiple requests, and if the work wasn't in progress.
-ENOENT	Request not found.
-ETIME	A timeout on the operation was requested, but the timeout
	expired before we could cancel.

and we won't get -EALREADY via this API.

If the timeout value passed in is -1 (tv_sec and tv_nsec), then that
means that no timeout is requested. Otherwise, the timespec passed in
is the amount of time the sync cancel will wait for a successful
cancelation.
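A minimal sketch of driving the new command from userspace (editorial
example using the raw syscall; error handling and the function name are
just for illustration):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sync_cancel_by_data(int ring_fd, __u64 user_data)
	{
		struct io_uring_sync_cancel_reg sc = {
			.addr = user_data,
			/* -1/-1 == no timeout: wait until the cancelation lands */
			.timeout = { .tv_sec = -1, .tv_nsec = -1 },
		};

		/* nr_args must be 1 for this register opcode */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_SYNC_CANCEL, &sc, 1);
	}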
Link: https://github.com/axboe/liburing/discussions/608
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  15 +++++
 io_uring/cancel.c             | 107 ++++++++++++++++++++++++++++++++++
 io_uring/cancel.h             |   2 +
 io_uring/io_uring.c           |   6 ++
 4 files changed, 130 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a09a78bd75566e..094f706c93e0b9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/types.h>
+#include <linux/time_types.h>
 
 /*
  * IO submission data structure (Submission Queue Entry)
@@ -428,6 +429,9 @@ enum {
 	IORING_REGISTER_PBUF_RING		= 22,
 	IORING_UNREGISTER_PBUF_RING		= 23,
 
+	/* sync cancelation API */
+	IORING_REGISTER_SYNC_CANCEL		= 24,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -563,4 +567,15 @@ struct io_uring_getevents_arg {
 	__u64	ts;
 };
 
+/*
+ * Argument for IORING_REGISTER_SYNC_CANCEL
+ */
+struct io_uring_sync_cancel_reg {
+	__u64				addr;
+	__s32				fd;
+	__u32				flags;
+	struct __kernel_timespec	timeout;
+	__u64				pad[4];
+};
+
 #endif
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index da486de07029a5..8435a1eba59acc 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -206,3 +207,109 @@ void init_hash_table(struct io_hash_table *table, unsigned size)
 		INIT_HLIST_HEAD(&table->hbs[i].list);
 	}
 }
+
+static int __io_sync_cancel(struct io_uring_task *tctx,
+			    struct io_cancel_data *cd, int fd)
+{
+	struct io_ring_ctx *ctx = cd->ctx;
+
+	/* fixed must be grabbed every time since we drop the uring_lock */
+	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
+	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		unsigned long file_ptr;
+
+		if (unlikely(fd > ctx->nr_user_files))
+			return -EBADF;
+		fd = array_index_nospec(fd, ctx->nr_user_files);
+		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+		cd->file = (struct file *) (file_ptr & FFS_MASK);
+		if (!cd->file)
+			return -EBADF;
+	}
+
+	return __io_async_cancel(cd, tctx, 0);
+}
+
+int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_cancel_data cd = {
+		.ctx	= ctx,
+		.seq	= atomic_inc_return(&ctx->cancel_seq),
+	};
+	ktime_t timeout = KTIME_MAX;
+	struct io_uring_sync_cancel_reg sc;
+	struct fd f = { };
+	DEFINE_WAIT(wait);
+	int ret;
+
+	if (copy_from_user(&sc, arg, sizeof(sc)))
+		return -EFAULT;
+	if (sc.flags & ~CANCEL_FLAGS)
+		return -EINVAL;
+	if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
+		return -EINVAL;
+
+	cd.data = sc.addr;
+	cd.flags = sc.flags;
+
+	/* we can grab a normal file descriptor upfront */
+	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
+	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		f = fdget(sc.fd);
+		if (!f.file)
+			return -EBADF;
+		cd.file = f.file;
+	}
+
+	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
+
+	/* found something, done! */
+	if (ret != -EALREADY)
+		goto out;
+
+	if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
+		struct timespec64 ts = {
+			.tv_sec		= sc.timeout.tv_sec,
+			.tv_nsec	= sc.timeout.tv_nsec
+		};
+
+		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+	}
+
+	/*
+	 * Keep looking until we get -ENOENT. we'll get woken every time
+	 * a request completes and will retry the cancelation.
+ */ + do { + cd.seq = atomic_inc_return(&ctx->cancel_seq); + + prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + if (ret != -EALREADY) + break; + + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(); + if (ret < 0) { + mutex_lock(&ctx->uring_lock); + break; + } + ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); + mutex_lock(&ctx->uring_lock); + if (!ret) { + ret = -ETIME; + break; + } + } while (1); + + finish_wait(&ctx->cq_wait, &wait); + + if (ret == -ENOENT || ret > 0) + ret = 0; +out: + fdput(f); + return ret; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 1bc7e917ce94e6..6a59ee484d0cca 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -19,3 +19,5 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 997b915a1ff78a..45538b3c3a764e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3871,6 +3871,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_pbuf_ring(ctx, arg); break; + case IORING_REGISTER_SYNC_CANCEL: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_sync_cancel(ctx, arg); + break; default: ret = -EINVAL; break; From 0638cd7be21221c7b583a4901e25e63d96112395 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:44 +0100 Subject: [PATCH 125/172] io_uring: clean poll ->private flagging We store a req pointer in wqe->private but also take one bit to mark double poll entries. Replace macro helpers with inline functions for better type checking and also name the double flag. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a61240555c64ac0b7a9b0eb59a9efeb638a35a4.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index bd3110750cfada..210b174b155b61 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -39,6 +39,22 @@ struct io_poll_table { #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_REF_MASK GENMASK(30, 0) +#define IO_WQE_F_DOUBLE 1 + +static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); +} + +static inline bool wqe_is_double(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return priv & IO_WQE_F_DOUBLE; +} + /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. 
It's disallowed to modify requests while not @@ -306,8 +322,6 @@ static void io_poll_cancel_req(struct io_kiocb *req) io_poll_execute(req, 0, 0); } -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -392,7 +406,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } /* mark as double wq entry */ - wqe_private |= 1; + wqe_private |= IO_WQE_F_DOUBLE; req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; From 13a99017ff19f179b51e1c51559a9d7005df1830 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:45 +0100 Subject: [PATCH 126/172] io_uring: remove events caching atavisms Remove events argument from *io_poll_execute(), it's not needed and not used. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/12efd4e15c6a90cf9e5b59807cfcb57852b51dc7.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 210b174b155b61..7de8c52793cd3b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -289,8 +289,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) +static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); /* @@ -308,18 +307,17 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) +static inline void io_poll_execute(struct io_kiocb *req, int res) { if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); + __io_poll_execute(req, res); } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); + io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) @@ -334,7 +332,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (unlikely(mask & POLLFREE)) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); + io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already @@ -369,7 +367,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, else req->flags &= ~REQ_F_SINGLE_POLL; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); } return 1; } @@ -487,7 +485,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->apoll_events |= EPOLLONESHOT; ipt->error = 0; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); return 0; } @@ -497,7 +495,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, */ v = atomic_dec_return(&req->poll_refs); if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); + __io_poll_execute(req, 0); return 0; } From 5204aa8c43bd1c3428b8979229183ae8269a8c09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:46 +0100 Subject: [PATCH 127/172] io_uring: add a helper for apoll 
alloc

Extract a helper function for apoll allocation, which makes the code
easier to read.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/2f93282b47dd678e805dd0d7097f66968ced495c.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 7de8c52793cd3b..aef77f2a8a9a76 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -508,10 +508,33 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
 
+static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
+					     unsigned issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct async_poll *apoll;
+
+	if (req->flags & REQ_F_POLLED) {
+		apoll = req->apoll;
+		kfree(apoll->double_poll);
+	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+		   !list_empty(&ctx->apoll_cache)) {
+		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+					 poll.wait.entry);
+		list_del_init(&apoll->poll.wait.entry);
+	} else {
+		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+		if (unlikely(!apoll))
+			return NULL;
+	}
+	apoll->double_poll = NULL;
+	req->apoll = apoll;
+	return apoll;
+}
+
 int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
-	struct io_ring_ctx *ctx = req->ctx;
 	struct async_poll *apoll;
 	struct io_poll_table ipt;
 	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
@@ -546,21 +569,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	}
 	if (def->poll_exclusive)
 		mask |= EPOLLEXCLUSIVE;
-	if (req->flags & REQ_F_POLLED) {
-		apoll = req->apoll;
-		kfree(apoll->double_poll);
-	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
-		   !list_empty(&ctx->apoll_cache)) {
-		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
-					 poll.wait.entry);
-		list_del_init(&apoll->poll.wait.entry);
-	} else {
-		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
-		if (unlikely(!apoll))
-			return IO_APOLL_ABORTED;
-	}
-	apoll->double_poll = NULL;
-	req->apoll = apoll;
+
+	apoll = io_req_alloc_apoll(req, issue_flags);
+	if (!apoll)
+		return IO_APOLL_ABORTED;
 	req->flags |= REQ_F_POLLED;
 	ipt.pt._qproc = io_async_queue_proc;

From 063a007996bf725ba4c7d8741701670be9858300 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 23 Jun 2022 14:24:47 +0100
Subject: [PATCH 128/172] io_uring: change arm poll return values

The rules for __io_arm_poll_handler()'s result parsing are complicated:
as a first step, don't pass back a mask but a positive return code, and
fill ipt->result_mask.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/529e29e9f97f2e6e383ccd44234d8b576a83a921.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index aef77f2a8a9a76..80113b036c88d8 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -34,6 +34,8 @@ struct io_poll_table {
 	struct io_kiocb *req;
 	int nr_entries;
 	int error;
+	/* output value, set only if arm poll returns >0 */
+	__poll_t result_mask;
 };
 
 #define IO_POLL_CANCEL_FLAG	BIT(31)
@@ -462,8 +464,9 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	if (mask &&
 	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
 		io_poll_remove_entries(req);
+		ipt->result_mask = mask;
 		/* no one else has access to the req, forget about the ref */
-		return mask;
+		return 1;
 	}
 
 	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
@@ -813,7 +816,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events);
 	if (ret) {
-		io_req_set_res(req, ret, 0);
+		io_req_set_res(req, ipt.result_mask, 0);
 		return IOU_OK;
 	}
 	if (ipt.error) {

From de08356f4858628fdffb8bd7e9cceb60c7e08ead Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 23 Jun 2022 14:24:48 +0100
Subject: [PATCH 129/172] io_uring: refactor poll arm error handling

__io_arm_poll_handler()'s error parsing is a horror: when it fails it
returns 0 and the caller is expected to look at ipt.error, which has
already led us to a number of problems before.

When it returns a valid mask, leave that as it is now, i.e. return 1
and store the mask in ipt.result_mask. In case of a failure that can be
handled inline, return an error code (negative value); return 0 if
__io_arm_poll_handler() took ownership of the request and will complete
it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/018cacdaef5fe95d7dc56b32e85d752cab7607f6.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 80113b036c88d8..3f3ae3b1505fa4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -435,6 +435,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 					(struct io_poll **) &pt->req->async_data);
 }
 
+/*
+ * Returns 0 when it's handed over for polling. The caller owns the request if
+ * it returns non-zero, but otherwise should not touch it. Negative values
+ * contain an error code. When the result is >0, the polling has completed
+ * inline and ipt.result_mask is set to the mask.
+ */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) @@ -461,6 +467,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; + if (unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + + if (mask && (poll->events & EPOLLET)) { + ipt->result_mask = mask; + return 1; + } else { + return ipt->error ?: -EINVAL; + } + } + if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); @@ -469,25 +486,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 1; } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - if (req->flags & REQ_F_HASH_LOCKED) io_poll_req_insert_locked(req); else io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } __io_poll_execute(req, mask); return 0; } @@ -582,9 +586,8 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - + if (ret) + return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -815,16 +818,11 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { + if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; + return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) From 49f1c68e048f1706b71c8255faf8110113d1cc48 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:49 +0100 Subject: [PATCH 130/172] io_uring: optimise submission side poll_refs The final poll_refs put in __io_arm_poll_handler() takes quite some cycles. When we're arming from the original task context task_work won't be run, so in this case we can assume that we won't race with task_works and so not take the initial ownership ref. One caveat is that after arming a poll we may race with it, so we have to add a bunch of io_poll_get_ownership() hidden inside of io_poll_can_finish_inline() whenever we want to complete arming inline. For the same reason we can't just set REQ_F_DOUBLE_POLL in __io_queue_proc() and so need to sync with the first poll entry by taking its wq head lock. 
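For reference, poll_refs doubles as a reference count and an ownership
token: whoever bumps the masked count from zero owns the request. The
helper this patch keeps calling looks roughly like the following
(editorial paraphrase, not part of this diff):

	static inline bool io_poll_get_ownership(struct io_kiocb *req)
	{
		/*
		 * True iff the masked refcount was zero before the increment,
		 * i.e. the caller took ownership; otherwise the current owner
		 * will spot the extra reference and act on the caller's behalf.
		 */
		return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
	}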
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8825315d7f5e182ac1578a031e546f79b1c97d01.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 88 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3f3ae3b1505fa4..eba767594deee5 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -34,6 +34,7 @@ struct io_poll_table { struct io_kiocb *req; int nr_entries; int error; + bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; @@ -374,6 +375,27 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } +static void io_poll_double_prepare(struct io_kiocb *req) +{ + struct wait_queue_head *head; + struct io_poll *poll = io_poll_get_single(req); + + /* head is RCU protected, see io_poll_remove_entries() comments */ + rcu_read_lock(); + head = smp_load_acquire(&poll->head); + if (head) { + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. + */ + spin_lock_irq(&head->lock); + req->flags |= REQ_F_DOUBLE_POLL; + spin_unlock_irq(&head->lock); + } + rcu_read_unlock(); +} + static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) @@ -405,16 +427,19 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + + io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; - req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; + } else { + /* fine to modify, there is no poll queued to race with us */ + req->flags |= REQ_F_SINGLE_POLL; } - req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; @@ -435,6 +460,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, (struct io_poll **) &pt->req->async_data); } +static bool io_poll_can_finish_inline(struct io_kiocb *req, + struct io_poll_table *pt) +{ + return pt->owning || io_poll_get_ownership(req); +} + /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. Negative values @@ -443,7 +474,8 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) + struct io_poll_table *ipt, __poll_t mask, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int v; @@ -452,34 +484,45 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). + * Polling is either completed here or via task_work, so if we're in the + * task context we're naturally serialised with tw by merit of running + * the same task. When it's io-wq, take the ownership to prevent tw + * from running. 
However, when we're in the task context, skip taking + * it as an optimisation. + * + * Note: even though the request won't be completed/freed, without + * ownership we still can race with io_poll_wake(). + * io_poll_can_finish_inline() tries to deal with that. */ - atomic_set(&req->poll_refs, 1); + ipt->owning = issue_flags & IO_URING_F_UNLOCKED; + + atomic_set(&req->poll_refs, (int)ipt->owning); mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); - if (mask && (poll->events & EPOLLET)) { + if (!io_poll_can_finish_inline(req, ipt)) { + io_poll_mark_cancelled(req); + return 0; + } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; - } else { - return ipt->error ?: -EINVAL; } + return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { + if (!io_poll_can_finish_inline(req, ipt)) + return 0; io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ @@ -491,18 +534,21 @@ static int __io_arm_poll_handler(struct io_kiocb *req, else io_poll_req_insert(req); - if (mask && (poll->events & EPOLLET)) { + if (mask && (poll->events & EPOLLET) && + io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0); + if (ipt->owning) { + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + } return 0; } @@ -585,7 +631,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); @@ -817,7 +863,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) else req->flags &= ~REQ_F_HASH_LOCKED; - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; From 795bbbc8a9a1bbbafce762c706bfb5733c9d0426 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 23 Jun 2022 21:01:26 +0800 Subject: [PATCH 131/172] io_uring: kbuf: inline io_kbuf_recycle_ring() Make io_kbuf_recycle_ring() inline since it is the fast path of provided buffer. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220623130126.179232-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 28 ---------------------------- io_uring/kbuf.h | 28 +++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8bf47e49ea5bc7..5e00f16e89b866 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -74,34 +74,6 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return; } -void io_kbuf_recycle_ring(struct io_kiocb *req) -{ - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. 
- * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. - */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; -} - unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { unsigned int cflags; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 721465c5d809fc..b3e8c6c5fee143 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -49,7 +49,33 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void io_kbuf_recycle_ring(struct io_kiocb *req); + +static inline void io_kbuf_recycle_ring(struct io_kiocb *req) +{ + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } +} static inline bool io_do_buffer_select(struct io_kiocb *req) { From fe991a7688f894a74a6f6b4933bf6cd5fa086c1b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Jun 2022 14:34:15 -0600 Subject: [PATCH 132/172] io_uring: move POLLFREE handling to separate function We really don't care about this at all in terms of performance. Outside of having it already be marked unlikely(), shove it into a separate __cold function. Signed-off-by: Jens Axboe --- io_uring/poll.c | 50 ++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index eba767594deee5..fa25b88a7b9336 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -325,6 +325,31 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) +static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) +{ + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. 
+ */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -332,29 +357,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. - */ - smp_store_release(&poll->head, NULL); - return 1; - } + if (unlikely(mask & POLLFREE)) + return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) From 37c7bd31b3e9e4b6aee3c5227f789c0b586a33a2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:58 +0100 Subject: [PATCH 133/172] io_uring: improve io_fail_links() io_fail_links() is called with ->completion_lock held and for that reason we'd want to keep it as small as we can. Instead of doing __io_req_complete_post() for each linked request under the lock, fail them in a task_work handler under ->uring_lock. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a2f68708b970a21f4e84ddfa7b3abd67a8fffb27.1656153285.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4af074b8f6b7d7..2f9e5693547931 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,32 +101,44 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) +static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) { - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; + io_tw_lock(link->ctx, locked); while (link) { + struct io_kiocb *nxt = link->link; long res = -ECANCELED; if (link->flags & REQ_F_FAIL) res = link->cqe.res; - - nxt = link->link; link->link = NULL; + io_req_set_res(link, res, 0); + io_req_task_complete(link, locked); + link = nxt; + } +} - trace_io_uring_fail_link(req, link); +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + if (!link) + return; + while (link) { if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; + trace_io_uring_fail_link(req, link); + link = link->link; } + + link = req->link; + link->io_task_work.func = io_req_tw_fail_links; + io_req_task_work_add(link); + req->link = NULL; } static inline void io_remove_next_linked(struct io_kiocb *req) From 3218e5d32dbcf1b9c6dc589eca21deebb14215fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:59 +0100 Subject: [PATCH 134/172] io_uring: fuse fallback_node and normal tw node Now as both normal and fallback paths use llist, just keep one node head in struct io_task_work and kill off ->fallback_node. 
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d04ebde409f7b162fe247b361b4486b193293e46.1656153285.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 5 +----
 io_uring/io_uring.c            | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 918165a200533f..3ca8f363f5046c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -427,10 +427,7 @@ enum {
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
-	union {
-		struct llist_node	node;
-		struct llist_node	fallback_node;
-	};
+	struct llist_node		node;
 	io_req_tw_func_t		func;
 };
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 45538b3c3a764e..86a0b0c6f5bf77 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -233,7 +233,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 	bool locked = false;
 
 	percpu_ref_get(&ctx->refs);
-	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &locked);
 
 	if (locked) {
@@ -1091,13 +1091,12 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
-
 	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
 		node = node->next;
-		if (llist_add(&req->io_task_work.fallback_node,
+		if (llist_add(&req->io_task_work.node,
 			      &req->ctx->fallback_llist))
 			schedule_delayed_work(&req->ctx->fallback_work, 1);
 	}

From ad8b261d837400cb7ccc339e81d7c35ab018acd8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 25 Jun 2022 11:53:00 +0100
Subject: [PATCH 135/172] io_uring: remove extra TIF_NOTIFY_SIGNAL check

io_run_task_work() accounts for TIF_NOTIFY_SIGNAL, so no need to have a
second check in io_run_task_work_sig().

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/52ce41a592ad904511697f432141e5690fd4b968.1656153285.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 86a0b0c6f5bf77..f40526426db813 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2205,8 +2205,6 @@ int io_run_task_work_sig(void)
 {
 	if (io_run_task_work())
 		return 1;
-	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
-		return -ERESTARTSYS;
 	if (task_sigpending(current))
 		return -EINTR;
 	return 0;

From 3273c4407acd4348b1531e1f860fbf1da942893b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 25 Jun 2022 11:53:01 +0100
Subject: [PATCH 136/172] io_uring: don't check file ops of registered rings

Registered rings are by definition io_uring files, so we don't need to
additionally verify them.
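On the user side this pairs with IORING_ENTER_REGISTERED_RING, where the
fd argument of io_uring_enter() is an index into the registered-ring
table rather than a normal descriptor; roughly (editorial sketch,
assuming the ring fd was registered at index 0 beforehand, e.g. with
liburing's io_uring_register_ring_fd()):

	/* first argument is the registered index, not a file descriptor */
	ret = syscall(__NR_io_uring_enter, 0, to_submit, min_complete,
		      flags | IORING_ENTER_REGISTERED_RING, NULL, 0);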
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/425cd64fd885b8e329a46c205ee811987691baaf.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f40526426db813..e1e8dcd17df354 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3036,22 +3036,22 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; - if (!tctx || fd >= IO_RINGFD_REG_MAX) + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; } else { f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (unlikely(!io_is_uring_fops(f.file))) + goto out_fput; } - if (unlikely(!f.file)) - return -EBADF; - - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(f.file))) - goto out_fput; - ret = -ENXIO; ctx = f.file->private_data; if (unlikely(!percpu_ref_tryget(&ctx->refs))) From fbb8bb02911790147ea936f3b1c5ceb1be54bf34 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:53:02 +0100 Subject: [PATCH 137/172] io_uring: remove ctx->refs pinning on enter io_uring_enter() takes ctx->refs, which was previously preventing racing with register quiesce. However, as register now doesn't touch the refs, we can freely kill extra ctx pinning and rely on the fact that we're holding a file reference preventing the ring from being destroyed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a11c57ad33a1be53541fce90669c1b79cf4d8940.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e1e8dcd17df354..070ee9ec9ee721 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3049,14 +3049,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(f.file))) - goto out_fput; + goto out; } - ret = -ENXIO; ctx = f.file->private_data; - if (unlikely(!percpu_ref_tryget(&ctx->refs))) - goto out_fput; - ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; @@ -3141,10 +3137,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, &ctx->check_cq); } } - out: - percpu_ref_put(&ctx->refs); -out_fput: fdput(f); return ret; } @@ -3730,11 +3723,10 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, int ret; /* - * We're inside the ring mutex, if the ref is already dying, then - * someone else killed the ctx or is already going through - * io_uring_register(). + * We don't quiesce the refs for register anymore and so it can't be + * dying as we're holding a file ref here. */ - if (percpu_ref_is_dying(&ctx->refs)) + if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->restricted) { From 8fcf4c48f44bd7b1b75db139f56ff1ad6477379e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Tue, 28 Jun 2022 21:33:20 +0200 Subject: [PATCH 138/172] io_uring: replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 094f706c93e0b9..8fe0275cdaf311 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -495,7 +495,7 @@ struct io_uring_probe { __u8 ops_len; /* length of ops[] array below */ __u16 resv; __u32 resv2[3]; - struct io_uring_probe_op ops[0]; + struct io_uring_probe_op ops[]; }; struct io_uring_restriction { From f110ed8498afa6ff8e9a8c08fb26880e02117616 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:42:56 -0600 Subject: [PATCH 139/172] io_uring: split out fixed file installation and removal Put it with the filetable code, which is where it belongs. While doing so, have the helpers take a ctx rather than an io_kiocb. It doesn't make sense to use a request, as it's not an operation on the request itself. It applies to the ring itself. Signed-off-by: Jens Axboe --- io_uring/filetable.c | 66 ++++++++++++++++++++++++++++++++++---------- io_uring/filetable.h | 3 ++ io_uring/openclose.c | 35 +++-------------------- io_uring/openclose.h | 2 +- io_uring/rsrc.c | 2 +- 5 files changed, 60 insertions(+), 48 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 534e1a3c625d9f..abaa5ba7f6552a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -58,11 +58,10 @@ void io_free_file_tables(struct io_file_table *table) table->bitmap = NULL; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, + u32 slot_index) __must_hold(&req->ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -108,34 +107,71 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. 
- */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; int ret; - io_ring_submit_lock(ctx, issue_flags); - if (alloc_slot) { ret = io_file_bitmap_get(ctx); if (unlikely(ret < 0)) - goto err; + return ret; file_slot = ret; } else { file_slot--; } - ret = io_install_fixed_file(req, file, issue_flags, file_slot); + ret = io_install_fixed_file(ctx, file, file_slot); if (!ret && alloc_slot) ret = file_slot; -err: + return ret; +} +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_fixed_fd_install(ctx, file, file_slot); io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) fput(file); return ret; } + +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) +{ + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + if (unlikely(!ctx->file_data)) + return -ENXIO; + if (offset >= ctx->nr_user_files) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + if (!file_slot->file_ptr) + return -EBADF; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + return ret; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); + io_rsrc_node_switch(ctx, ctx->file_data); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fb5a274c08ffcc..79eb50c1980e84 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -29,6 +29,9 @@ void io_free_file_tables(struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot); +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 099a5ec84dfdb7..d1818ec9169ba2 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -173,42 +173,15 @@ void io_open_cleanup(struct io_kiocb *req) putname(open->filename); } -int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; int ret; io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, 
ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: + ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); + return ret; } @@ -216,7 +189,7 @@ static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req); - return __io_close_fixed(req, issue_flags, close->file_slot - 1); + return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 9f578f3fad870d..4b1c28d3a66c3f 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset); int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1106089551595e..706fa020505b1a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -703,7 +703,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); + __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } From e6130eba8a848a7a6ba6c534bd8f6d60749ae1a9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:47:02 -0600 Subject: [PATCH 140/172] io_uring: add support for passing fixed file descriptors With IORING_OP_MSG_RING, one ring can send a message to another ring. Extend that support to also allow sending a fixed file descriptor to that ring, enabling one ring to pass a registered descriptor to another one. Arguments are extended to pass in: sqe->addr3 fixed file slot in source ring sqe->file_index fixed file slot in destination ring IORING_OP_MSG_RING is extended to take a command argument in sqe->addr. If set to zero (or IORING_MSG_DATA), it sends just a message like before. If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according to the above arguments. Two common use cases for this are: 1) A server needs to be shut down or restarted; pass its file descriptors to another one 2) The backend is split, and one ring accepts connections, while others then get the fd passed and handle the actual connection. Both of those are classic SCM_RIGHTS use cases, and it's not possible to support them with direct descriptors today. By default, this will post a CQE to the target ring, similarly to how IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message is posted to the target ring. The issuer is expected to notify the receiver side separately.
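For illustration, a minimal sketch of preparing such an SQE (not part of the patch; raw SQE fields, assuming liburing's io_uring_get_sqe() and an already set up target ring):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_ring_fd;		/* fd of the receiving ring */
	sqe->addr = IORING_MSG_SEND_FD;		/* command */
	sqe->addr3 = 3;				/* source fixed file slot */
	sqe->file_index = 7 + 1;		/* destination slot 7; as with other
						 * fixed-file installs, file_index is slot + 1 */
	sqe->off = 0xcafe;			/* user_data of the CQE posted to the target */
	sqe->user_data = 1;			/* completion tag on the sending ring */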
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 +++++ io_uring/msg_ring.c | 130 ++++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8fe0275cdaf311..f378eabbff21b6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -51,6 +51,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; + __u32 msg_ring_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -270,6 +271,22 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +/* + * IORING_OP_MSG_RING command types, stored in sqe->addr + */ +enum { + IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ + IORING_MSG_SEND_FD, /* send a registered fd to another ring */ +}; + +/* + * IORING_OP_MSG_RING flags (sqe->msg_ring_flags) + * + * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not + * applicable for IORING_MSG_DATA, obviously. + */ +#define IORING_MSG_RING_CQE_SKIP (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index b02be23496521c..939205b30c8b62 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -3,46 +3,162 @@ #include #include #include +#include #include #include #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" #include "msg_ring.h" struct io_msg { struct file *file; u64 user_data; u32 len; + u32 cmd; + u32 src_fd; + u32 dst_fd; + u32 flags; }; +static int io_msg_ring_data(struct io_kiocb *req) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (msg->src_fd || msg->dst_fd || msg->flags) + return -EINVAL; + + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + return 0; + + return -EOVERFLOW; +} + +static void io_double_unlock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); + mutex_unlock(&octx->uring_lock); +} + +static int io_double_lock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + /* + * To ensure proper ordering between the two ctxs, we can only + * attempt a trylock on the target. If that fails and we already have + * the source ctx lock, punt to io-wq. + */ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + if (!mutex_trylock(&octx->uring_lock)) + return -EAGAIN; + return 0; + } + + /* Always grab smallest value ctx first. We know ctx != octx. 
*/ + if (ctx < octx) { + mutex_lock(&ctx->uring_lock); + mutex_lock(&octx->uring_lock); + } else { + mutex_lock(&octx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + + return 0; +} + +static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long file_ptr; + struct file *src_file; + int ret; + + if (target_ctx == ctx) + return -EINVAL; + + ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); + if (unlikely(ret)) + return ret; + + ret = -EBADF; + if (unlikely(msg->src_fd >= ctx->nr_user_files)) + goto out_unlock; + + msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + src_file = (struct file *) (file_ptr & FFS_MASK); + get_file(src_file); + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); + if (ret < 0) { + fput(src_file); + goto out_unlock; + } + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; + + /* + * If this fails, the target still received the file descriptor but + * wasn't notified of the fact. This means that if this request + * completes with -EOVERFLOW, then the sender must ensure that a + * later IORING_OP_MSG_RING delivers the message. + */ + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + ret = -EOVERFLOW; +out_unlock: + io_double_unlock_ctx(ctx, target_ctx, issue_flags); + return ret; +} + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req); - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) + if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); + msg->cmd = READ_ONCE(sqe->addr); + msg->src_fd = READ_ONCE(sqe->addr3); + msg->dst_fd = READ_ONCE(sqe->file_index); + msg->flags = READ_ONCE(sqe->msg_ring_flags); + if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + return -EINVAL; + return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) - ret = 0; + switch (msg->cmd) { + case IORING_MSG_DATA: + ret = io_msg_ring_data(req); + break; + case IORING_MSG_SEND_FD: + ret = io_msg_send_fd(req, issue_flags); + break; + default: + ret = -EINVAL; + break; + } done: if (ret < 0) From 6e73dffbb93cb8797cd4e42e98d837edf0f1a967 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:55:38 +0100 Subject: [PATCH 141/172] io_uring: allow setting a range for file slot allocation io_uring recently gained an option to allocate a file index for operations registering fixed files. However, it's unusable with mixed approaches where userspace knows better where to place some of the files: allocation may race with explicit placement, and users have no sane way to pick a slot and hope it will not be taken. Let userspace register a range of fixed file slots in which the auto-allocation happens. The use case is splitting the fixed table in two parts, where one of them is used for auto-allocation and the other for slot-specified operations.
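For illustration, a minimal sketch of the new registration (not part of the patch; raw syscall, error handling elided), reserving slots [0, 64) for explicit placement and leaving [64, 192) to the allocator:

	struct io_uring_file_index_range range = {
		.off = 64,	/* first slot the allocator may hand out */
		.len = 128,	/* range is [off, off + len) */
	};

	/* nr_args must be 0 for this opcode */
	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILE_ALLOC_RANGE,
		&range, 0);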
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66ab0394e436f38437cf7c44676e1920d09687ad.1656154403.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 13 +++++++++++++ io_uring/filetable.c | 24 ++++++++++++++++++++---- io_uring/filetable.h | 20 +++++++++++++++++--- io_uring/io_uring.c | 6 ++++++ io_uring/rsrc.c | 2 ++ 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3ca8f363f5046c..26ef11e978d403 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -233,6 +233,9 @@ struct io_ring_ctx { unsigned long check_cq; + unsigned int file_alloc_start; + unsigned int file_alloc_end; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f378eabbff21b6..cf95354198a393 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,9 @@ enum { /* sync cancelation API */ IORING_REGISTER_SYNC_CANCEL = 24, + /* register a range of fixed file slots for automatic slot allocation */ + IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* this goes last */ IORING_REGISTER_LAST }; @@ -595,4 +598,14 @@ struct io_uring_sync_cancel_reg { __u64 pad[4]; }; +/* + * Argument for IORING_REGISTER_FILE_ALLOC_RANGE + * The range is specified as [off, off + len) + */ +struct io_uring_file_index_range { + __u32 off; + __u32 len; + __u64 resv; +}; + #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index abaa5ba7f6552a..7b473259f3f45a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -16,7 +16,7 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; + unsigned long nr = ctx->file_alloc_end; int ret; do { @@ -24,11 +24,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (ret != nr) return ret; - if (!table->alloc_hint) + if (table->alloc_hint == ctx->file_alloc_start) break; - nr = table->alloc_hint; - table->alloc_hint = 0; + table->alloc_hint = ctx->file_alloc_start; } while (1); return -ENFILE; @@ -175,3 +174,20 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) io_rsrc_node_switch(ctx, ctx->file_data); return 0; } + +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg) +{ + struct io_uring_file_index_range range; + u32 end; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (check_add_overflow(range.off, range.len, &end)) + return -EOVERFLOW; + if (range.resv || end > ctx->nr_user_files) + return -EINVAL; + + io_file_table_set_alloc_range(ctx, range.off, range.len); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 79eb50c1980e84..ff3a712e11bf33 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,9 +3,7 @@ #define IOU_FILE_TABLE_H #include - -struct io_ring_ctx; -struct io_kiocb; +#include /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -33,6 +31,9 @@ int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg); + unsigned int io_file_get_flags(struct file *file); static inline void 
io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -71,4 +72,17 @@ static inline void io_fixed_file_set(struct io_fixed_file *file_slot, file_slot->file_ptr = file_ptr; } +static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) +{ + ctx->file_table.alloc_hint = ctx->file_alloc_start; +} + +static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, + unsigned off, unsigned len) +{ + ctx->file_alloc_start = off; + ctx->file_alloc_end = off + len; + io_reset_alloc_hint(ctx); +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 070ee9ec9ee721..745264938a489f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3866,6 +3866,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sync_cancel(ctx, arg); break; + case IORING_REGISTER_FILE_ALLOC_RANGE: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_file_alloc_range(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 706fa020505b1a..d2e589c703d063 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1012,6 +1012,8 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_bitmap_set(&ctx->file_table, i); } + /* default it to the whole table */ + io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); io_rsrc_node_switch(ctx, NULL); return 0; fail: From b8c015598c8ef9195b8a2a5089e275c4f64ca999 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:20 -0700 Subject: [PATCH 142/172] io_uring: allow 0 length for buffer select If user gives 0 for length, we can set it from the available buffer size. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 5e00f16e89b866..e538fa7cb727cf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -115,7 +115,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); - if (*len > kbuf->len) + if (*len == 0 || *len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; @@ -145,7 +145,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, buf = page_address(bl->buf_pages[index]); buf += off; } - if (*len > buf->len) + if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING; req->buf_list = bl; From 32f3c434b14238a2eee0c726a1918de58c07faf6 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:21 -0700 Subject: [PATCH 143/172] io_uring: restore bgid in io_put_kbuf Attempt to restore bgid. This is needed when recycling unused buffers as the next time around it will want the correct bgid. 
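In effect, the helper has to compute the CQE flags from the buffer ID before buf_index is repurposed to hold the group ID, roughly (a sketch of the ordering, not the full diff):

	/* buf_index currently holds the buffer ID (bid) for the CQE */
	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
	/* restore the group ID (bgid) so a later recycle finds the right list */
	req->buf_index = req->kbuf->bgid;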
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b3e8c6c5fee143..d6af208d109ffe 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -103,16 +103,21 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { + unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; req->buf_list->head++; + } req->flags &= ~REQ_F_BUFFER_RING; } else { + req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); req->flags &= ~REQ_F_BUFFER_SELECTED; } - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + return ret; } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) From 5702196e7d9d1232c41769e197eb33ba78a9b463 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:22 -0700 Subject: [PATCH 144/172] io_uring: allow iov_len = 0 for recvmsg and buffer select When using BUFFER_SELECT there is no technical requirement that the user actually provides iov, and this removes one copy_from_user call. So allow iov_len to be 0. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-4-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b77bfbfb081673..06eaef9f97bec5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -300,12 +300,18 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) + if (iov_len == 0) { + sr->len = iomsg->fast_iov[0].iov_len = 0; + iomsg->fast_iov[0].iov_base = NULL; + iomsg->free_iov = NULL; + } else if (iov_len > 1) { return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + } else { + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, From d4e097dae29c24bf33a5056a2a43cff2e45c4978 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:23 -0700 Subject: [PATCH 145/172] io_uring: recycle buffers on error Rather than passing an error back to the user with a buffer attached, recycle the buffer immediately. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 06eaef9f97bec5..e4422dff07048a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -481,10 +481,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; @@ -557,10 +560,13 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; From 2ba69707d9153eeca1ee8ac1bc55376b88978842 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:24 -0700 Subject: [PATCH 146/172] io_uring: clean up io_poll_check_events return values The values returned are a bit confusing, where 0 and 1 have implied meaning, so add some definitions for them. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index fa25b88a7b9336..922a3d1b2e31d1 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -192,13 +192,18 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +enum { + IOU_POLL_DONE = 0, + IOU_POLL_NO_ACTION = 1, +}; + /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. + * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, + * which is either spurious wakeup or multishot CQE is served. + * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. 
*/ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -214,10 +219,11 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) /* tw handler should be the owner, and so have some references */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; + return IOU_POLL_DONE; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; + /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; @@ -226,7 +232,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if ((unlikely(!req->cqe.res))) continue; if (req->apoll_events & EPOLLONESHOT) - return 0; + return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { @@ -238,7 +244,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); - if (ret) + if (ret < 0) return ret; } @@ -248,7 +254,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) */ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - return 1; + return IOU_POLL_NO_ACTION; } static void io_poll_task_func(struct io_kiocb *req, bool *locked) @@ -256,12 +262,11 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == IOU_POLL_NO_ACTION) return; - if (!ret) { + if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); - req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; @@ -280,7 +285,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); From 114eccdf0e368893b3d92e06e9788d9d94876853 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:25 -0700 Subject: [PATCH 147/172] io_uring: add IOU_STOP_MULTISHOT return code For multishot we want a way to signal the caller that multishot has ended, even though this might not be an error return. For example sockets return 0 when closed, which should end a multishot recv, but still have a CQE with result 0. Introduce IOU_STOP_MULTISHOT, which does this and indicates that the return code is stored inside req->cqe. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-7-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 7 +++++++ io_uring/poll.c | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f77e4a5403e4e5..e8da70781fa34b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,6 +15,13 @@ enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, + + /* + * Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT + * are set to indicate to the poll runner that multishot should be + * removed and the result is set on req->cqe.res.
+ */ + IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 922a3d1b2e31d1..64d426d696abc8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -195,6 +195,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, + IOU_POLL_REMOVE_POLL_USE_RES = 2, }; /* @@ -204,6 +205,8 @@ enum { * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, * which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. + * IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result + * is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -244,6 +247,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); + if (ret == IOU_STOP_MULTISHOT) + return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) return ret; } @@ -268,7 +273,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { + } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } @@ -291,7 +296,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); - if (!ret) + if (ret == IOU_POLL_REMOVE_POLL_USE_RES) + io_req_complete_post(req); + else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else io_req_complete_failed(req, ret); From 52120f0fadcbdaaa981c19327f1865a714e85268 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:26 -0700 Subject: [PATCH 148/172] io_uring: add allow_overflow to io_post_aux_cqe Some use cases of io_post_aux_cqe would not want to overflow as is, but might want to change the flags/result. For example multishot receive requires in order CQE, and so if there is an overflow it would need to stop receiving until the overflow is taken care of. 
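Conceptually, callers that need strictly ordered CQEs can now opt out of overflow and handle a full CQ ring themselves, a pattern the following patches adopt (sketch):

	if (!io_post_aux_cqe(ctx, user_data, res, IORING_CQE_F_MORE, false)) {
		/* CQ ring full: stop the multishot rather than reorder via overflow */
	}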
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-8-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++++++---- io_uring/io_uring.h | 3 ++- io_uring/msg_ring.c | 4 ++-- io_uring/net.c | 2 +- io_uring/poll.c | 2 +- io_uring/rsrc.c | 4 ++-- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 745264938a489f..523b6ebad15a30 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -736,7 +736,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -760,16 +761,21 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, } return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + if (allow_overflow) + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + return false; } bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; io_cq_lock(ctx); - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); io_cq_unlock_post(ctx); return filled; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e8da70781fa34b..e022d71c177a46 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,7 +31,8 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 939205b30c8b62..753d16734319a4 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) return 0; return -EOVERFLOW; @@ -113,7 +113,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. 
*/ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/net.c b/io_uring/net.c index e4422dff07048a..601955fdb124f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -658,7 +658,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; } diff --git a/io_uring/poll.c b/io_uring/poll.c index 64d426d696abc8..e8f922a4f6d754 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,7 +243,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE)) + mask, IORING_CQE_F_MORE, true)) return -ECANCELED; } else { ret = io_poll_issue(req, locked); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d2e589c703d063..0250c13ae1cdd0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -175,10 +175,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); } } From a2da676376feb79224eacb9ac1f554bb3232b5de Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:27 -0700 Subject: [PATCH 149/172] io_uring: fix multishot poll on overflow On overflow, multishot poll can still complete with the IORING_CQE_F_MORE flag set. If in the meantime the user clears a CQE and the poll is cancelled, then the poll will post a CQE without IORING_CQE_F_MORE (and likely with result -ECANCELED). However, when processing completions, the application will encounter the non-overflow CQE first, which indicates that there will be no more events posted. Typical userspace applications would free memory associated with the poll in this case. It will then subsequently receive the earlier CQE which has overflowed, which breaks the contract given by the IORING_CQE_F_MORE flag. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index e8f922a4f6d754..57747d92bba427 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,8 +243,10 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, true)) - return -ECANCELED; + mask, IORING_CQE_F_MORE, false)) { + io_req_set_res(req, mask, 0); + return IOU_POLL_REMOVE_POLL_USE_RES; + } } else { ret = io_poll_issue(req, locked); if (ret == IOU_STOP_MULTISHOT) From cbd25748545c709d35734deb9220e0af0a69e5d2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:28 -0700 Subject: [PATCH 150/172] io_uring: fix multishot accept ordering Similar to multishot poll, drop multishot accept when CQE overflow occurs.
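For illustration, the resulting userspace contract for multishot accept (not part of the patch; assumes liburing's multishot accept helper): one SQE produces a CQE per connection until the kernel drops the request, signalled by a CQE without IORING_CQE_F_MORE.

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_submit(&ring);

	/* per completion: */
	if (!(cqe->flags & IORING_CQE_F_MORE)) {
		/* multishot ended (error or dropped on CQ overflow): re-arm */
	}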
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-10-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 601955fdb124f7..e1eaf902f3b201 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -656,11 +656,14 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } - if (ret < 0) - return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (ret >= 0 && + io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false)) goto retry; - return -ECANCELED; + + io_req_set_res(req, ret, 0); + if (req->flags & REQ_F_POLLED) + return IOU_STOP_MULTISHOT; + return IOU_OK; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From b3fdea6ecb55c3ceea866ff66486927e51a982b3 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:29 -0700 Subject: [PATCH 151/172] io_uring: multishot recv Support multishot receive for io_uring. Typical server applications will run a loop where for each recv CQE they requeue another recv/recvmsg. This can be simplified by using the existing multishot functionality combined with io_uring's provided buffers. The API is to add the IORING_RECV_MULTISHOT flag to the SQE. CQEs will then be posted (with the IORING_CQE_F_MORE flag set) when data is available and has been read. Once an error occurs or the socket ends, the multishot will be removed and a completion without IORING_CQE_F_MORE will be posted. The benefit to this is that the recv is much more performant: * Subsequent receives are queued up straight away without requiring the application to finish a processing loop. * If there is more data in the socket (say the provided buffer size is smaller than the socket buffer) then the data is immediately returned, improving batching. * Poll is only armed once and reused, saving CPU cycles
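A minimal sketch of the API described above (illustrative only; raw SQE fields, assuming a provided-buffer group bgid has been registered beforehand):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_recv(sqe, sock_fd, NULL, 0, 0);	/* len 0: sized by the buffer */
	sqe->ioprio |= IORING_RECV_MULTISHOT;
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;

	/* each CQE carrying IORING_CQE_F_MORE holds one chunk of data in the
	 * buffer identified by cqe->flags >> IORING_CQE_BUFFER_SHIFT */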
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +#define IORING_RECV_MULTISHOT (1U << 1) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index e1eaf902f3b201..cb08a4b6284096 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -389,6 +389,8 @@ int io_recvmsg_prep_async(struct io_kiocb *req) return ret; } +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) + int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -399,13 +401,22 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~(RECVMSG_FLAGS)) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -415,6 +426,48 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static inline void io_recv_prep_retry(struct io_kiocb *req) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ +} + +/* + * Finishes io_recv and io_recvmsg. + * + * Returns true if it is actually finished, or false if it should run + * again (for multishot). + */ +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +{ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; + } + + if (*ret > 0) { + if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, + cflags | IORING_CQE_F_MORE, false)) { + io_recv_prep_retry(req); + return false; + } + /* + * Otherwise stop multishot but use the current result. 
+ * Probably will end up going into overflow, but this means + * we cannot trust the ordering anymore + */ + } + + io_req_set_res(req, *ret, cflags); + + if (req->flags & REQ_F_POLLED) + *ret = IOU_STOP_MULTISHOT; + return true; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -424,6 +477,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -442,16 +496,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -463,8 +518,15 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -491,8 +553,11 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -505,6 +570,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) @@ -514,16 +580,17 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; } - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -543,8 +610,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) ret = sock_recvmsg(sock, &msg, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -570,8 +643,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = 
io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 9b26e811e934eebda59362c9a03d082852552574 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:30 -0700 Subject: [PATCH 152/172] io_uring: fix io_uring_cqe_overflow trace format Make the trace format consistent with io_uring_complete for cflags Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-12-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 918e3a43e4b280..95a8cfaad15a03 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -594,7 +594,7 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->ocqe = ocqe; ), - TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " + TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) From e0486f3f7c1b6ab7ba7e912bf48d17f04e71d86d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:31 -0700 Subject: [PATCH 153/172] io_uring: only trace one of complete or overflow On overflow we see a duplicate line in the trace, and in some cases 3 lines (if the initial io_post_aux_cqe fails). Instead, just trace once for each CQE. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-13-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/io_uring.h | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 523b6ebad15a30..caf979cd432729 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,7 +742,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe; ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); /* * If we can't get a cq entry, userspace overflowed the @@ -751,6 +750,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, */ cqe = io_get_cqe(ctx); if (likely(cqe)) { + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e022d71c177a46..868f45d55543bc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -101,10 +101,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -113,6 +109,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, cqe = io_get_cqe(ctx); if (unlikely(!cqe)) return io_req_cqe_overflow(req); + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ?
req->extra2 : 0); + memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { From cf0dd9527eee5d6d183fa724fdee6a128cb17b8d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 4 Jul 2022 07:01:06 -0700 Subject: [PATCH 154/172] io_uring: disable multishot recvmsg recvmsg has semantics that do not make it trivial to extend to multishot. Specifically it has user pointers and returns data in the original parameter. In order to make this API useful these will need to be somehow included with the provided buffers. For now remove multishot for recvmsg as it is not useful. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220704140106.200167-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index cb08a4b6284096..6679069eeef162 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -409,6 +409,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -435,7 +437,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv and io_recvmsg. + * Finishes io_recv * * Returns true if it is actually finished, or false if it should run * again (for multishot). @@ -477,7 +479,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -496,17 +497,16 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); -retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &sr->len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; + kmsg->fast_iov[0].iov_len = sr->len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - len); + sr->len); } flags = sr->msg_flags; @@ -518,15 +518,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg); - if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) { - io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return ret; - } + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -554,10 +547,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!io_recv_finish(req, &ret, cflags)) - goto retry_multishot; - - return ret; + io_req_set_res(req, ret, cflags); + return IOU_OK; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) From 7a121ced6e6430d49fb802067b4f020f6df62362 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:14 +0100 Subject: [PATCH 155/172] io_uring: don't miss setting 
REQ_F_DOUBLE_POLL When adding a second poll entry we should set REQ_F_DOUBLE_POLL unconditionally. We might race with the first entry removal but that doesn't change the rule. Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Reported-and-tested-by: syzbot+49950ba66096b1f0209b@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b680d83ded07424db83e8745585e7a6d72826ef.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 57747d92bba427..3710a0a46a8778 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -401,16 +401,18 @@ static void io_poll_double_prepare(struct io_kiocb *req) /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); - if (head) { - /* - * poll arm may not hold ownership and so race with - * io_poll_wake() by modifying req->flags. There is only one - * poll entry queued, serialise with it by taking its head lock. - */ + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. + */ + if (head) spin_lock_irq(&head->lock); - req->flags |= REQ_F_DOUBLE_POLL; + + req->flags |= REQ_F_DOUBLE_POLL; + + if (head) spin_unlock_irq(&head->lock); - } rcu_read_unlock(); } From ceff501790a9789c748a9b851f30f4f7e2fe4d72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:15 +0100 Subject: [PATCH 156/172] io_uring: don't race double poll setting REQ_F_ASYNC_DATA Just as with io_poll_double_prepare() setting REQ_F_DOUBLE_POLL, we can race with the first poll entry when setting REQ_F_ASYNC_DATA. Move it under io_poll_double_prepare(). Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df6920f509c11115aa2bce8b34dc5fdb0eb98920.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3710a0a46a8778..c1359d45a396f2 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -410,6 +410,8 @@ static void io_poll_double_prepare(struct io_kiocb *req) spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; if (head) spin_unlock_irq(&head->lock); @@ -448,13 +450,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } - io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events, first->wait.func); + io_poll_double_prepare(req); *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; From b21a51e26e9aed66159c2c18651da22d585d2998 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:16 +0100 Subject: [PATCH 157/172] io_uring: clear REQ_F_HASH_LOCKED on hash removal Instead of clearing REQ_F_HASH_LOCKED while arming a poll, unset the bit when we're removing the entry from the table in io_poll_tw_hash_eject(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/02e48bb88d6f1480c94ac2924c43ad1fbd48e92a.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index c1359d45a396f2..77b669b0604609 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -132,6 +132,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) */ io_tw_lock(ctx, locked); hash_del(&req->hash_node); + req->flags &= ~REQ_F_HASH_LOCKED; } else { io_poll_req_delete(req, ctx); } @@ -617,9 +618,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (issue_flags & IO_URING_F_UNLOCKED) - req->flags &= ~REQ_F_HASH_LOCKED; - else + if (!(issue_flags & IO_URING_F_UNLOCKED)) req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) @@ -880,8 +879,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (!(issue_flags & IO_URING_F_UNLOCKED) && (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) req->flags |= REQ_F_HASH_LOCKED; - else - req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { From e8375e43ca2d67f31b2408092eee4f91e1e140f4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:17 +0100 Subject: [PATCH 158/172] io_uring: consolidate hash_locked io-wq handling Don't duplicate code disabling REQ_F_HASH_LOCKED for IO_URING_F_UNLOCKED (i.e. io-wq), move the handling into __io_arm_poll_handler(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ff0ffdfaa65b3d536131535c3dad3c63d9b7bb0.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 77b669b0604609..76592063abe7ba 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -523,8 +523,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; - atomic_set(&req->poll_refs, (int)ipt->owning); + + /* io-wq doesn't hold uring_lock */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { @@ -618,8 +622,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) - req->flags |= REQ_F_HASH_LOCKED; + req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; @@ -876,8 +879,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) * If sqpoll or single issuer, there is no contention for ->uring_lock * and we'll end up holding it in tw handlers anyway. 
*/ - if (!(issue_flags & IO_URING_F_UNLOCKED) && - (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) req->flags |= REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); From 9da7471ed10dab52410062be74896a6c0aa1bf3a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 11:18:33 -0600 Subject: [PATCH 159/172] io_uring: move apoll cache to poll.c This is where it's used, move the flush handler in there. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/poll.c | 12 ++++++++++++ io_uring/poll.h | 2 ++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index caf979cd432729..4d1ce58b015e19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2445,18 +2445,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_flush_apoll_cache(struct io_ring_ctx *ctx) -{ - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 76592063abe7ba..052fcb6472083a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -959,3 +959,15 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ + struct async_poll *apoll; + + while (!list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del(&apoll->poll.wait.entry); + kfree(apoll); + } +} diff --git a/io_uring/poll.h b/io_uring/poll.h index c40673d7da0199..95f192c7babbd5 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -30,3 +30,5 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); + +void io_flush_apoll_cache(struct io_ring_ctx *ctx); From 9b797a37c4bd83b03cedcfbd15852b836f5e562c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:16:20 -0600 Subject: [PATCH 160/172] io_uring: add abstraction around apoll cache In preparation for adding limits, and one more user, abstract out the core bits of the allocation+free cache. 
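A rough sketch of the intended usage, assuming a consumer embeds the entry in a union the way async_poll does below (my_obj and my_cache are illustrative names, not part of the patch):

    struct my_obj {
            union {
                    struct io_cache_entry cache;    /* linkage while parked in the cache */
                    struct io_poll poll;            /* live fields reuse the same storage */
            };
    };

    /* allocation side: prefer a cached entry, fall back to the allocator */
    entry = io_alloc_cache_get(&ctx->my_cache);
    if (entry)
            obj = container_of(entry, struct my_obj, cache);
    else
            obj = kmalloc(sizeof(*obj), GFP_ATOMIC);

    /* free side: park the object in the cache instead of kfree() */
    io_alloc_cache_put(&ctx->my_cache, &obj->cache);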
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++++- io_uring/alloc_cache.h | 41 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.c | 8 +++---- io_uring/poll.c | 18 +++++---------- io_uring/poll.h | 9 ++++++-- 5 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 io_uring/alloc_cache.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 26ef11e978d403..b548da03b563cb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -158,6 +158,10 @@ struct io_ev_fd { struct rcu_head rcu; }; +struct io_alloc_cache { + struct hlist_head list; +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -216,7 +220,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; - struct list_head apoll_cache; + struct io_alloc_cache apoll_cache; struct xarray personalities; u32 pers_next; } ____cacheline_aligned_in_smp; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h new file mode 100644 index 00000000000000..98f2374c37c7bb --- /dev/null +++ b/io_uring/alloc_cache.h @@ -0,0 +1,41 @@ +#ifndef IOU_ALLOC_CACHE_H +#define IOU_ALLOC_CACHE_H + +struct io_cache_entry { + struct hlist_node node; +}; + +static inline void io_alloc_cache_put(struct io_alloc_cache *cache, + struct io_cache_entry *entry) +{ + hlist_add_head(&entry->node, &cache->list); +} + +static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +{ + if (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + return container_of(node, struct io_cache_entry, node); + } + + return NULL; +} + +static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +{ + INIT_HLIST_HEAD(&cache->list); +} + +static inline void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(struct io_cache_entry *)) +{ + while (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + free(container_of(node, struct io_cache_entry, node)); + } +} +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d1ce58b015e19..a360a3d390c6ba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "timeout.h" #include "poll.h" +#include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -295,7 +296,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_LIST_HEAD(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->apoll_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -1180,8 +1181,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); + io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -2467,7 +2467,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rings) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); - io_flush_apoll_cache(ctx); + io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/poll.c b/io_uring/poll.c index 
052fcb6472083a..dadd293749b07b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -590,16 +590,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); + (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) { + apoll = container_of(entry, struct async_poll, cache); } else { apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) @@ -960,14 +959,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void io_flush_apoll_cache(struct io_ring_ctx *ctx) +void io_apoll_cache_free(struct io_cache_entry *entry) { - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } + kfree(container_of(entry, struct async_poll, cache)); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 95f192c7babbd5..5f3bae50fc81a0 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "alloc_cache.h" + enum { IO_APOLL_OK, IO_APOLL_ABORTED, @@ -14,7 +16,10 @@ struct io_poll { }; struct async_poll { - struct io_poll poll; + union { + struct io_poll poll; + struct io_cache_entry cache; + }; struct io_poll *double_poll; }; @@ -31,4 +36,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_flush_apoll_cache(struct io_ring_ctx *ctx); +void io_apoll_cache_free(struct io_cache_entry *entry); From 9731bc9855dc169f27433fef3c4d0ff3496c512d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:20:54 -0600 Subject: [PATCH 161/172] io_uring: impose max limit on apoll cache Caches like this tend to grow to the peak size, and then never get any smaller. Impose a max limit on the size, to prevent it from growing too big. A somewhat randomly chosen 512 is the max size we'll allow the cache to get. If a batch of frees come in and would bring it over that, we simply start kfree'ing the surplus. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/alloc_cache.h | 16 ++++++++++++++-- io_uring/io_uring.c | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b548da03b563cb..bf8f95332edae0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -160,6 +160,7 @@ struct io_ev_fd { struct io_alloc_cache { struct hlist_head list; + unsigned int nr_cached; }; struct io_ring_ctx { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 98f2374c37c7bb..729793ae97127a 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,14 +1,24 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +/* + * Don't allow the cache to grow beyond this size. 
+ */ +#define IO_ALLOC_CACHE_MAX 512 + struct io_cache_entry { struct hlist_node node; }; -static inline void io_alloc_cache_put(struct io_alloc_cache *cache, +static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - hlist_add_head(&entry->node, &cache->list); + if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + cache->nr_cached++; + hlist_add_head(&entry->node, &cache->list); + return true; + } + return false; } static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) @@ -26,6 +36,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void io_alloc_cache_init(struct io_alloc_cache *cache) { INIT_HLIST_HEAD(&cache->list); + cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, @@ -37,5 +48,6 @@ static inline void io_alloc_cache_free(struct io_alloc_cache *cache, hlist_del(node); free(container_of(node, struct io_cache_entry, node)); } + cache->nr_cached = 0; } #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a360a3d390c6ba..c9c23e45976635 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1181,7 +1181,8 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); + if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + kfree(apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) From 43e0bbbd0b0e30d232fd8e9e908125b5c49a9fbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:30:09 -0600 Subject: [PATCH 162/172] io_uring: add netmsg cache For recvmsg/sendmsg, if they don't complete inline, we currently need to allocate a struct io_async_msghdr for each request. This is a somewhat large struct. Hook up sendmsg/recvmsg to use the io_alloc_cache. This reduces the alloc + free overhead considerably, yielding 4-5% of extra performance running netbench. 
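The cache is only safe to touch while holding ->uring_lock, so recycling is skipped for unlocked (io-wq) issue and the normal cleanup path frees the msghdr in that case. The recycle side, condensed from the hunk below:

    static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
    {
            struct io_async_msghdr *hdr = req->async_data;

            /* io-wq issue doesn't hold ->uring_lock, can't touch the cache */
            if (!hdr || issue_flags & IO_URING_F_UNLOCKED)
                    return;

            /* on success, the request no longer owns the async data */
            if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
                    req->async_data = NULL;
                    req->flags &= ~REQ_F_ASYNC_DATA;
            }
    }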
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++-- io_uring/io_uring.c | 3 ++ io_uring/net.c | 63 +++++++++++++++++++++++++++++----- io_uring/net.h | 13 ++++++- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bf8f95332edae0..d54b8b7e074629 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,8 +222,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; - struct xarray personalities; - u32 pers_next; + struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -241,6 +240,9 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; + struct xarray personalities; + u32 pers_next; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c9c23e45976635..f697ca4e8f558c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "kbuf.h" #include "rsrc.h" #include "cancel.h" +#include "net.h" #include "timeout.h" #include "poll.h" @@ -297,6 +298,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); io_alloc_cache_init(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->netmsg_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -2469,6 +2471,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/net.c b/io_uring/net.c index 6679069eeef162..185553174437f3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring.h" #include "kbuf.h" +#include "alloc_cache.h" #include "net.h" #if defined(CONFIG_NET) @@ -97,18 +98,55 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_msghdr *hdr = req->async_data; + + if (!hdr || issue_flags & IO_URING_F_UNLOCKED) + return; + + /* Let normal cleanup path reap it if we fail adding to the cache */ + if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { + struct io_async_msghdr *hdr; + + hdr = container_of(entry, struct io_async_msghdr, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; + } + + if (!io_alloc_async_data(req)) + return req->async_data; + + return NULL; +} + static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_async_msghdr *async_msg = req->async_data; if (async_msg) return 
-EAGAIN; - if (io_alloc_async_data(req)) { + async_msg = io_recvmsg_alloc_async(req, issue_flags); + if (!async_msg) { kfree(kmsg->free_iov); return -ENOMEM; } - async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; memcpy(async_msg, kmsg, sizeof(*kmsg)); async_msg->msg.msg_name = &async_msg->addr; @@ -195,7 +233,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -207,13 +245,13 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } @@ -221,6 +259,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -495,7 +534,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (io_do_buffer_select(req)) { void __user *buf; @@ -519,13 +558,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { @@ -535,6 +574,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); + io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; @@ -848,4 +888,9 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_netmsg_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_async_msghdr, cache)); +} #endif diff --git a/io_uring/net.h b/io_uring/net.h index 81d71d1647704d..178a6d8b76e0a8 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,9 +3,14 @@ #include #include +#include "alloc_cache.h" + #if defined(CONFIG_NET) struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; + union { + struct iovec fast_iov[UIO_FASTIOV]; + struct io_cache_entry cache; + }; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; struct sockaddr __user *uaddr; @@ -40,4 +45,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags); int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, 
const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); + +void io_netmsg_cache_free(struct io_cache_entry *entry); +#else +static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +{ +} #endif From e2df2ccb753e3b598a9045760e79f1847f7b31cd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:35 -0700 Subject: [PATCH 163/172] io_uring: fix multishot ending when not polled If multishot is not actually polling then return IOU_OK rather than the result. If the result was > 0 this will confuse things further up the callstack which expect a return <= 0. Fixes: 1300ebb20286 ("io_uring: multishot recv") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 185553174437f3..eb939899e9c5a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -506,6 +506,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c if (req->flags & REQ_F_POLLED) *ret = IOU_STOP_MULTISHOT; + else + *ret = IOU_OK; return true; } From 6d2f75a0cf3048ef38c48493f66a923353bf664b Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:36 -0700 Subject: [PATCH 164/172] io_uring: support 0 length iov in buffer select in compat Match up work done in "io_uring: allow iov_len = 0 for recvmsg and buffer select", but for compat code path. Fixes: a68caad69ce5 ("io_uring: allow iov_len = 0 for recvmsg and buffer select") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index eb939899e9c5a0..dc9190eafbe703 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,16 +382,21 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) + if (len == 0) { + sr->len = 0; + iomsg->free_iov = NULL; + } else if (len > 1) { return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; + } else { + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, len, From 7fa875b8e53c288d616234b9daf417b0650ce1cc Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:56 -0700 Subject: [PATCH 165/172] net: copy from user before calling __copy_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. 
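The expected calling pattern then becomes (a sketch: the caller copies the header once and passes the kernel copy around, keeping the iov fields visible for its own use):

    struct user_msghdr msg;

    if (copy_from_user(&msg, umsg, sizeof(msg)))
            return -EFAULT;

    ret = __copy_msghdr(kmsg, &msg, save_addr);
    if (ret)
            return ret;

    /* msg.msg_iov and msg.msg_iovlen remain available to the caller */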
Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-2-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/socket.h | 7 +++---- io_uring/net.c | 17 +++++++++-------- net/socket.c | 37 ++++++++++++++++--------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 17311ad9f9af24..be24f1c8568a38 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); -extern int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs); +extern int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *umsg, + struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/io_uring/net.c b/io_uring/net.c index dc9190eafbe703..da7667ed36106d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -329,31 +329,32 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; + struct user_msghdr msg; int ret; - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); + if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) + return -EFAULT; + + ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len == 0) { + if (msg.msg_iovlen == 0) { sr->len = iomsg->fast_iov[0].iov_len = 0; iomsg->fast_iov[0].iov_base = NULL; iomsg->free_iov = NULL; - } else if (iov_len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) return -EFAULT; sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, false); if (ret > 0) diff --git a/net/socket.c b/net/socket.c index 96300cdc06251f..843545c21ec235 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2358,25 +2358,20 @@ struct used_address { unsigned int name_len; }; -int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs) +int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *msg, + struct sockaddr __user **save_addr) { - struct user_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; - kmsg->msg_control_user = msg.msg_control; - kmsg->msg_controllen = msg.msg_controllen; - kmsg->msg_flags = msg.msg_flags; + kmsg->msg_control_user = msg->msg_control; + kmsg->msg_controllen = msg->msg_controllen; + kmsg->msg_flags = msg->msg_flags; - kmsg->msg_namelen = msg.msg_namelen; - if (!msg.msg_name) + kmsg->msg_namelen = msg->msg_namelen; + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, 
kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) - *save_addr = msg.msg_name; + *save_addr = msg->msg_name; - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(msg.msg_name, + err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - *uiov = msg.msg_iov; - *nsegs = msg.msg_iovlen; return 0; } @@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr msg; ssize_t err; - err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov, - &msg.msg_iovlen); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; From 72c531f8ef3052c682d39dc21dcb5576afda208c Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:57 -0700 Subject: [PATCH 166/172] net: copy from user before calling __get_compat_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-3-dylany@fb.com Signed-off-by: Jens Axboe --- include/net/compat.h | 5 ++--- io_uring/net.c | 17 +++++++++-------- net/compat.c | 39 +++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/net/compat.h b/include/net/compat.h index 595fee069b8250..84c163f40f38a5 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -46,9 +46,8 @@ struct compat_rtentry { unsigned short rt_irtt; /* Initial RTT */ }; -int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, compat_uptr_t *ptr, - compat_size_t *len); +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg, + struct sockaddr __user **save_addr); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); int put_cmsg_compat(struct msghdr*, int, int, int, void *); diff --git a/io_uring/net.c b/io_uring/net.c index da7667ed36106d..5bc3440a829012 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -369,24 +369,25 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_msghdr msg; struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; int ret; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); + if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) + return -EFAULT; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); if (ret) return ret; - uiov = compat_ptr(ptr); + uiov = compat_ptr(msg.msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len == 0) { + if (msg.msg_iovlen == 0) { sr->len = 0; iomsg->free_iov = NULL; - } else if (len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { if (!access_ok(uiov, sizeof(*uiov))) @@ -400,7 +401,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, true); if (ret < 0) diff --git a/net/compat.c b/net/compat.c index 210fc3b4d0d833..513aa9a3fc6466 100644 --- a/net/compat.c +++ b/net/compat.c @@ -34,20 +34,15 @@ #include int __get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - compat_uptr_t *ptr, compat_size_t *len) + struct compat_msghdr *msg, + struct sockaddr __user **save_addr) { - struct compat_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - - kmsg->msg_flags = msg.msg_flags; - kmsg->msg_namelen = msg.msg_namelen; + kmsg->msg_flags = msg->msg_flags; + kmsg->msg_namelen = msg->msg_namelen; - if (!msg.msg_name) + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control_is_user = true; - kmsg->msg_control_user = compat_ptr(msg.msg_control); - kmsg->msg_controllen = msg.msg_controllen; + kmsg->msg_control_user = compat_ptr(msg->msg_control); + kmsg->msg_controllen = msg->msg_controllen; if (save_addr) - *save_addr = compat_ptr(msg.msg_name); + *save_addr = compat_ptr(msg->msg_name); - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(compat_ptr(msg.msg_name), + err = move_addr_to_kernel(compat_ptr(msg->msg_name), kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - *ptr = msg.msg_iov; - *len = msg.msg_iovlen; return 0; } @@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg, struct sockaddr __user **save_addr, struct iovec **iov) { - compat_uptr_t ptr; - compat_size_t len; + struct compat_msghdr msg; ssize_t err; - err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __get_compat_msghdr(kmsg, umsg, save_addr); if (err) return err; - err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len, + err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } From 9bb66906f23e50d6db1e11f7498b72dfca1982a2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:58 -0700 Subject: [PATCH 167/172] io_uring: support multishot in recvmsg Similar to multishot recv, this will require provided buffers to be used. However recvmsg is much more complex than recv as it has multiple outputs. Specifically flags, name, and control messages. Support this by introducing a new struct io_uring_recvmsg_out with 4 fields. namelen, controllen and flags match the similar out fields in msghdr from standard recvmsg(2), payloadlen is the length of the payload following the header. This struct is placed at the start of the returned buffer. Based on what the user specifies in struct msghdr, the next bytes of the buffer will be name (the next msg_namelen bytes), and then control (the next msg_controllen bytes). The payload will come at the end. The return value in the CQE is the total used size of the provided buffer. 
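From the consumer side, the returned buffer would be carved up along these lines (a rough sketch; buf is assumed to be the provided buffer picked for the CQE, msg the struct msghdr originally submitted):

    struct io_uring_recvmsg_out *out = (struct io_uring_recvmsg_out *) buf;
    /* name area is sized by the *requested* msg_namelen... */
    void *name = (char *) buf + sizeof(*out);
    /* ...control follows, sized by the requested msg_controllen... */
    void *control = (char *) name + msg.msg_namelen;
    /* ...and the payload comes last, out->payloadlen bytes of it */
    void *payload = (char *) control + msg.msg_controllen;

    /*
     * out->namelen and out->controllen report the lengths actually
     * produced, which may be smaller than the reserved areas.
     */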
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-4-dylany@fb.com [axboe: style fixups, see link] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++ io_uring/net.c | 180 ++++++++++++++++++++++++++++++---- io_uring/net.h | 6 ++ 3 files changed, 174 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 499679134961b2..4c9b11e2e99158 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -613,4 +613,11 @@ struct io_uring_file_index_range { __u64 resv; }; +struct io_uring_recvmsg_out { + __u32 namelen; + __u32 controllen; + __u32 payloadlen; + __u32 flags; +}; + #endif diff --git a/io_uring/net.c b/io_uring/net.c index 5bc3440a829012..616d5f04cc7435 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -325,6 +325,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) +{ + unsigned long hdr; + + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + (unsigned long)iomsg->namelen, &hdr)) + return true; + if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + return true; + if (hdr > INT_MAX) + return true; + + return false; +} + static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { @@ -352,6 +367,13 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, @@ -399,6 +421,13 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = clen; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, @@ -455,8 +484,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { - if (req->opcode == IORING_OP_RECVMSG) - return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -483,12 +510,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv + * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). 
*/ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, + unsigned int cflags, bool mshot_finished) { if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); @@ -496,7 +524,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } - if (*ret > 0) { + if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, false)) { io_recv_prep_retry(req); @@ -518,6 +546,90 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } +static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, + struct io_sr_msg *sr, void __user **buf, + size_t *len) +{ + unsigned long ubuf = (unsigned long) *buf; + unsigned long hdr; + + hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen; + if (*len < hdr) + return -EFAULT; + + if (kmsg->controllen) { + unsigned long control = ubuf + hdr - kmsg->controllen; + + kmsg->msg.msg_control_user = (void *) control; + kmsg->msg.msg_controllen = kmsg->controllen; + } + + sr->buf = *buf; /* stash for later copy */ + *buf = (void *) (ubuf + hdr); + kmsg->payloadlen = *len = *len - hdr; + return 0; +} + +struct io_recvmsg_multishot_hdr { + struct io_uring_recvmsg_out msg; + struct sockaddr_storage addr; +}; + +static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, + struct io_async_msghdr *kmsg, + unsigned int flags, bool *finished) +{ + int err; + int copy_len; + struct io_recvmsg_multishot_hdr hdr; + + if (kmsg->namelen) + kmsg->msg.msg_name = &hdr.addr; + kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); + kmsg->msg.msg_namelen = 0; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + + err = sock_recvmsg(sock, &kmsg->msg, flags); + *finished = err <= 0; + if (err < 0) + return err; + + hdr.msg = (struct io_uring_recvmsg_out) { + .controllen = kmsg->controllen - kmsg->msg.msg_controllen, + .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT + }; + + hdr.msg.payloadlen = err; + if (err > kmsg->payloadlen) + err = kmsg->payloadlen; + + copy_len = sizeof(struct io_uring_recvmsg_out); + if (kmsg->msg.msg_namelen > kmsg->namelen) + copy_len += kmsg->namelen; + else + copy_len += kmsg->msg.msg_namelen; + + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + hdr.msg.namelen = kmsg->msg.msg_namelen; + + /* ensure that there is no gap between hdr and sockaddr_storage */ + BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != + sizeof(struct io_uring_recvmsg_out)); + if (copy_to_user(io->buf, &hdr, copy_len)) { + *finished = true; + return -EFAULT; + } + + return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen + err; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -527,6 +639,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool mshot_finished = true; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -545,16 +658,27 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; + size_t len = sr->len; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + if (ret) { + io_kbuf_recycle(req, issue_flags); + return ret; + } + } + kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -564,10 +688,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (req->flags & REQ_F_APOLL_MULTISHOT) + ret = io_recvmsg_multishot(sock, sr, kmsg, flags, + &mshot_finished); + else + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, + kmsg->uaddr, flags); + if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -580,11 +717,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -596,8 +728,18 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + if (!io_recv_finish(req, &ret, cflags, mshot_finished)) + goto retry_multishot; + + if (mshot_finished) { + io_netmsg_recycle(req, issue_flags); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + } + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -684,7 +826,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if 
(!io_recv_finish(req, &ret, cflags)) + if (!io_recv_finish(req, &ret, cflags, ret <= 0)) goto retry_multishot; return ret; diff --git a/io_uring/net.h b/io_uring/net.h index 178a6d8b76e0a8..db20ce9d6546d4 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -9,6 +9,12 @@ struct io_async_msghdr { union { struct iovec fast_iov[UIO_FASTIOV]; + struct { + struct iovec fast_iov_one; + __kernel_size_t controllen; + int namelen; + __kernel_size_t payloadlen; + }; struct io_cache_entry cache; }; /* points to an allocated iov, if NULL we use fast_iov instead */ From 4ccc6db0900fe337212b61650663a5dcedb69f25 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:33:01 +0200 Subject: [PATCH 168/172] io_uring: Use atomic_long_try_cmpxchg in __io_account_mem Use atomic_long_try_cmpxchg() instead of the atomic_long_cmpxchg(*ptr, old, new) == old pattern in __io_account_mem(). The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, atomic_long_try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0250c13ae1cdd0..7f66b0e2567432 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -56,14 +56,13 @@ static inline int __io_account_mem(struct user_struct *user, /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur_pages = atomic_long_read(&user->locked_vm); do { - cur_pages = atomic_long_read(&user->locked_vm); new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - + } while (!atomic_long_try_cmpxchg(&user->locked_vm, + &cur_pages, new_pages)); return 0; } From 9b0fc3c054ff2eb13753104884f1045b5bb3a627 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 15 Jul 2022 06:02:52 -0700 Subject: [PATCH 169/172] io_uring: fix types in io_recvmsg_multishot_overflow io_recvmsg_multishot_overflow() had incorrect types on non-x64 systems. It also had an unnecessary INT_MAX check, which can be dropped by changing the type of the accumulator to int (also simplifying the casts).
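With an int accumulator, the two check_add_overflow() calls subsume the INT_MAX test. The resulting shape, as in the diff below but with the reasoning spelled out in comments:

    int hdr;

    if (iomsg->namelen < 0)
            return true;    /* a negative name length can never be valid */
    if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
                           iomsg->namelen, &hdr))
            return true;    /* header plus namelen overflowed int */
    if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
            return true;    /* adding controllen overflowed int */
    /* here hdr <= INT_MAX by construction, no explicit check needed */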
Reported-by: Stephen Rothwell Fixes: a8b38c4ce724 ("io_uring: support multishot in recvmsg") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220715130252.610639-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 616d5f04cc7435..6b7d5f33e642a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -327,14 +327,14 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) { - unsigned long hdr; + int hdr; - if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), - (unsigned long)iomsg->namelen, &hdr)) + if (iomsg->namelen < 0) return true; - if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), + iomsg->namelen, &hdr)) return true; - if (hdr > INT_MAX) + if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) return true; return false; From 48904229928d941ce1db181b991948387ab463cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 15 Jul 2022 19:45:01 +0200 Subject: [PATCH 170/172] io_uring: Don't require reinitable percpu_ref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") removed the workflow relying on reinit/resurrection of the percpu_ref, hence requesting that capability at initialization is a relic. This is based on code review; it causes no real bug (and theoretically can't). Technically it's a revert of commit 214828962dea ("io_uring: initialize percpu refcounters using PERCU_REF_ALLOW_REINIT") but since the flag omission is now justified, I'm not making this a revert. Fixes: 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") Signed-off-by: Michal Koutný Acked-by: Roman Gushchin Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f697ca4e8f558c..624535c625652f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->dummy_ubuf->ubuf = -1UL; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; From 4f6a94d337408f23e199eee7d35ed6edc201df0c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Jul 2022 15:54:47 -0600 Subject: [PATCH 171/172] net: fix compat pointer in get_compat_msghdr() A previous change enabled external users to copy the data before calling __get_compat_msghdr(), but didn't modify get_compat_msghdr() or __io_compat_recvmsg_copy_hdr() to take that into account. They are both still passing in the __user pointer rather than the copied version. Ensure we pass in the kernel struct, not the pointer to the user data.
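Before and after, for the io_uring caller (condensed from the diff below):

    /* before: the __user pointer slipped through to __get_compat_msghdr() */
    ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr);

    /* after: the struct already copied from userspace is passed instead */
    if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
            return -EFAULT;
    ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);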
Link: https://lore.kernel.org/all/46439555-644d-08a1-7d66-16f8f9a320f0@samsung.com/ Fixes: 1a3e4e94a1b9 ("net: copy from user before calling __get_compat_msghdr") Reported-by: Marek Szyprowski Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- net/compat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 6b7d5f33e642a0..e61efa31c729c6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -398,7 +398,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) return -EFAULT; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); + ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; diff --git a/net/compat.c b/net/compat.c index 513aa9a3fc6466..ed880729d159bd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -89,7 +89,7 @@ int get_compat_msghdr(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - err = __get_compat_msghdr(kmsg, umsg, save_addr); + err = __get_compat_msghdr(kmsg, &msg, save_addr); if (err) return err; From f6b543fd03d347e8bf245cee4f2d54eb6ffd8fcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Jul 2022 09:06:47 -0600 Subject: [PATCH 172/172] io_uring: ensure REQ_F_ISREG is set async offload If we're offloading requests directly to io-wq because IOSQE_ASYNC was set in the sqe, we can miss hashing writes appropriately because we haven't set REQ_F_ISREG yet. This can cause a performance regression with buffered writes, as io-wq then no longer correctly serializes writes to that file. Ensure that we set the flags in io_prep_async_work(), which will cause the io-wq work item to be hashed appropriately. Fixes: 584b0180f0f4 ("io_uring: move read/write file prep state into actual opcode handler") Link: https://lore.kernel.org/io-uring/20220608080054.GB22428@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Yin Fengwei Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +++ io_uring/io_uring.h | 5 +++++ io_uring/rw.c | 5 ----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 624535c625652f..4b3fd645d0231a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -406,6 +406,9 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; + if (req->file && !io_req_ffs_set(req)) + req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 868f45d55543bc..5db0a60dc04efc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -41,6 +41,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return req->flags & REQ_F_FIXED_FILE; +} + bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); diff --git a/io_uring/rw.c b/io_uring/rw.c index ade3e235f2770c..c50a0d66d67a66 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -647,11 +647,6 @@ static bool need_read_all(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static inline bool io_req_ffs_set(struct io_kiocb 
*req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { struct io_rw *rw = io_kiocb_to_cmd(req);