From 0702e5364f643bc86683d9f585edfe76dbabae39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 16:56:21 -0600 Subject: [PATCH 001/172] io_uring: define a 'prep' and 'issue' handler for each opcode Rather than have two giant switches for doing request preparation and then for doing request issue, add a prep and issue handler for each of them in the io_op_defs[] request definition. Signed-off-by: Jens Axboe --- fs/io_uring.c | 838 ++++++++++++++++++++++---------------------------- 1 file changed, 365 insertions(+), 473 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e8e769be9ed058..63cad0e12d8b37 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1110,231 +1110,13 @@ struct io_op_def { unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; -}; -static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_connect), - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - }, - [IORING_OP_OPENAT] = {}, - [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - 
.audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_MADVISE] = {}, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_OPENAT2] = { - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - }, - [IORING_OP_RENAMEAT] = {}, - [IORING_OP_UNLINKAT] = {}, - [IORING_OP_MKDIRAT] = {}, - [IORING_OP_SYMLINKAT] = {}, - [IORING_OP_LINKAT] = {}, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_SETXATTR] = {}, - [IORING_OP_FGETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_GETXATTR] = {}, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .needs_async_setup = 1, - .async_size = uring_cmd_pdu_size(1), - }, + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); }; +static const struct io_op_def io_op_defs[]; + /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -8039,96 +7821,33 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->cqe.fd); + if (!def->needs_async_setup) + return 0; + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -EAGAIN; + switch (req->opcode) { - case IORING_OP_NOP: - return io_nop_prep(req, sqe); case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: + return io_readv_prep_async(req); case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - return io_prep_rw(req, sqe); - case IORING_OP_POLL_ADD: - return io_poll_add_prep(req, sqe); - case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); - case IORING_OP_FSYNC: - return io_fsync_prep(req, sqe); - case IORING_OP_SYNC_FILE_RANGE: - return io_sfr_prep(req, sqe); + return io_writev_prep_async(req); case IORING_OP_SENDMSG: - case IORING_OP_SEND: - return io_sendmsg_prep(req, sqe); + return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: - case IORING_OP_RECV: - return 
io_recvmsg_prep(req, sqe); + return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: - return io_connect_prep(req, sqe); - case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe); - case IORING_OP_TIMEOUT_REMOVE: - return io_timeout_remove_prep(req, sqe); - case IORING_OP_ASYNC_CANCEL: - return io_async_cancel_prep(req, sqe); - case IORING_OP_LINK_TIMEOUT: - return io_link_timeout_prep(req, sqe); - case IORING_OP_ACCEPT: - return io_accept_prep(req, sqe); - case IORING_OP_FALLOCATE: - return io_fallocate_prep(req, sqe); - case IORING_OP_OPENAT: - return io_openat_prep(req, sqe); - case IORING_OP_CLOSE: - return io_close_prep(req, sqe); - case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); - case IORING_OP_STATX: - return io_statx_prep(req, sqe); - case IORING_OP_FADVISE: - return io_fadvise_prep(req, sqe); - case IORING_OP_MADVISE: - return io_madvise_prep(req, sqe); - case IORING_OP_OPENAT2: - return io_openat2_prep(req, sqe); - case IORING_OP_EPOLL_CTL: - return io_epoll_ctl_prep(req, sqe); - case IORING_OP_SPLICE: - return io_splice_prep(req, sqe); - case IORING_OP_PROVIDE_BUFFERS: - return io_provide_buffers_prep(req, sqe); - case IORING_OP_REMOVE_BUFFERS: - return io_remove_buffers_prep(req, sqe); - case IORING_OP_TEE: - return io_tee_prep(req, sqe); - case IORING_OP_SHUTDOWN: - return io_shutdown_prep(req, sqe); - case IORING_OP_RENAMEAT: - return io_renameat_prep(req, sqe); - case IORING_OP_UNLINKAT: - return io_unlinkat_prep(req, sqe); - case IORING_OP_MKDIRAT: - return io_mkdirat_prep(req, sqe); - case IORING_OP_SYMLINKAT: - return io_symlinkat_prep(req, sqe); - case IORING_OP_LINKAT: - return io_linkat_prep(req, sqe); - case IORING_OP_MSG_RING: - return io_msg_ring_prep(req, sqe); - case IORING_OP_FSETXATTR: - return io_fsetxattr_prep(req, sqe); - case IORING_OP_SETXATTR: - return io_setxattr_prep(req, sqe); - case IORING_OP_FGETXATTR: - return io_fgetxattr_prep(req, sqe); - case IORING_OP_GETXATTR: - return io_getxattr_prep(req, sqe); - case IORING_OP_SOCKET: - return io_socket_prep(req, sqe); + return io_connect_prep_async(req); case IORING_OP_URING_CMD: - return io_uring_cmd_prep(req, sqe); + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -8136,39 +7855,6 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } -static int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", - req->opcode); - return -EFAULT; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -8335,141 +8021,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) 
audit_uring_entry(req->opcode); - switch (req->opcode) { - case IORING_OP_NOP: - ret = io_nop(req, issue_flags); - break; - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - ret = io_read(req, issue_flags); - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - ret = io_write(req, issue_flags); - break; - case IORING_OP_FSYNC: - ret = io_fsync(req, issue_flags); - break; - case IORING_OP_POLL_ADD: - ret = io_poll_add(req, issue_flags); - break; - case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); - break; - case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, issue_flags); - break; - case IORING_OP_SENDMSG: - ret = io_sendmsg(req, issue_flags); - break; - case IORING_OP_SEND: - ret = io_send(req, issue_flags); - break; - case IORING_OP_RECVMSG: - ret = io_recvmsg(req, issue_flags); - break; - case IORING_OP_RECV: - ret = io_recv(req, issue_flags); - break; - case IORING_OP_TIMEOUT: - ret = io_timeout(req, issue_flags); - break; - case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, issue_flags); - break; - case IORING_OP_ACCEPT: - ret = io_accept(req, issue_flags); - break; - case IORING_OP_CONNECT: - ret = io_connect(req, issue_flags); - break; - case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, issue_flags); - break; - case IORING_OP_FALLOCATE: - ret = io_fallocate(req, issue_flags); - break; - case IORING_OP_OPENAT: - ret = io_openat(req, issue_flags); - break; - case IORING_OP_CLOSE: - ret = io_close(req, issue_flags); - break; - case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, issue_flags); - break; - case IORING_OP_STATX: - ret = io_statx(req, issue_flags); - break; - case IORING_OP_FADVISE: - ret = io_fadvise(req, issue_flags); - break; - case IORING_OP_MADVISE: - ret = io_madvise(req, issue_flags); - break; - case IORING_OP_OPENAT2: - ret = io_openat2(req, issue_flags); - break; - case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, issue_flags); - break; - case IORING_OP_SPLICE: - ret = io_splice(req, issue_flags); - break; - case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, issue_flags); - break; - case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, issue_flags); - break; - case IORING_OP_TEE: - ret = io_tee(req, issue_flags); - break; - case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, issue_flags); - break; - case IORING_OP_RENAMEAT: - ret = io_renameat(req, issue_flags); - break; - case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, issue_flags); - break; - case IORING_OP_MKDIRAT: - ret = io_mkdirat(req, issue_flags); - break; - case IORING_OP_SYMLINKAT: - ret = io_symlinkat(req, issue_flags); - break; - case IORING_OP_LINKAT: - ret = io_linkat(req, issue_flags); - break; - case IORING_OP_MSG_RING: - ret = io_msg_ring(req, issue_flags); - break; - case IORING_OP_FSETXATTR: - ret = io_fsetxattr(req, issue_flags); - break; - case IORING_OP_SETXATTR: - ret = io_setxattr(req, issue_flags); - break; - case IORING_OP_FGETXATTR: - ret = io_fgetxattr(req, issue_flags); - break; - case IORING_OP_GETXATTR: - ret = io_getxattr(req, issue_flags); - break; - case IORING_OP_SOCKET: - ret = io_socket(req, issue_flags); - break; - case IORING_OP_URING_CMD: - ret = io_uring_cmd(req, issue_flags); - break; - default: - ret = -EINVAL; - break; - } + ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); @@ -8898,7 +8450,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags |= REQ_F_CREDS; } - 
return io_req_prep(req, sqe); + return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, @@ -13200,8 +12752,343 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + }, + [IORING_OP_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .prep = io_accept_prep, + .issue = io_accept, + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .async_size = sizeof(struct io_async_connect), + .prep 
= io_connect_prep, + .issue = io_connect, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .prep = io_openat_prep, + .issue = io_openat, + }, + [IORING_OP_CLOSE] = { + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .prep = io_statx_prep, + .issue = io_statx, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_sendmsg_prep, + .issue = io_send, + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_recvmsg_prep, + .issue = io_recv, + }, + [IORING_OP_OPENAT2] = { + .prep = io_openat2_prep, + .issue = io_openat2, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_epoll_ctl_prep, + .issue = io_epoll_ctl, + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .prep = io_shutdown_prep, + .issue = io_shutdown, + }, + [IORING_OP_RENAMEAT] = { + .prep = io_renameat_prep, + .issue = io_renameat, + }, + [IORING_OP_UNLINKAT] = { + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + }, + [IORING_OP_MKDIRAT] = { + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + }, + [IORING_OP_SYMLINKAT] = { + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + }, + [IORING_OP_LINKAT] = { + .prep = io_linkat_prep, + .issue = io_linkat, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + }, + [IORING_OP_SETXATTR] = { + .prep = io_setxattr_prep, + .issue = io_setxattr, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + }, + [IORING_OP_GETXATTR] = { + .prep = io_getxattr_prep, + .issue = io_getxattr, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .prep = io_socket_prep, + .issue = io_socket, + }, + 
[IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + }, +}; + static int __init io_uring_init(void) { + int i; + #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -13266,6 +13153,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + BUG_ON(!io_op_defs[i].issue); + } + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; From ed29b0b4fd835b058ddd151c49d021e28d631ee6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:05:03 -0600 Subject: [PATCH 002/172] io_uring: move to separate directory In preparation for splitting io_uring up a bit, move it into its own top level directory. It didn't really belong in fs/ anyway, as it's not a file system only API. This adds io_uring/ and moves the core files in there, and updates the MAINTAINERS file for the new location. Signed-off-by: Jens Axboe --- MAINTAINERS | 7 +------ Makefile | 1 + fs/Makefile | 2 -- io_uring/Makefile | 6 ++++++ {fs => io_uring}/io-wq.c | 0 {fs => io_uring}/io-wq.h | 0 {fs => io_uring}/io_uring.c | 2 +- kernel/sched/core.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 io_uring/Makefile rename {fs => io_uring}/io-wq.c (100%) rename {fs => io_uring}/io-wq.h (100%) rename {fs => io_uring}/io_uring.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 64379c699903bc..08620b9a44fc77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7773,9 +7773,6 @@ F: include/linux/fs.h F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h -X: fs/io-wq.c -X: fs/io-wq.h -X: fs/io_uring.c FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio @@ -10476,9 +10473,7 @@ L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing -F: fs/io-wq.c -F: fs/io-wq.h -F: fs/io_uring.c +F: io_uring/ F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h F: tools/io_uring/ diff --git a/Makefile b/Makefile index b79c1c18149d38..e231f70dc78a5b 100644 --- a/Makefile +++ b/Makefile @@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ +core-$(CONFIG_IO_URING) += io_uring/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/fs/Makefile b/fs/Makefile index 208a74e0b00e12..93b80529f8e827 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/io_uring/Makefile b/io_uring/Makefile new file mode 100644 index 00000000000000..3680425df9478b --- /dev/null +++ b/io_uring/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for io_uring + +obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/fs/io-wq.c b/io_uring/io-wq.c similarity index 100% 
rename from fs/io-wq.c rename to io_uring/io-wq.c diff --git a/fs/io-wq.h b/io_uring/io-wq.h similarity index 100% rename from fs/io-wq.h rename to io_uring/io-wq.h diff --git a/fs/io_uring.c b/io_uring/io_uring.c similarity index 99% rename from fs/io_uring.c rename to io_uring/io_uring.c index 63cad0e12d8b37..f429b68d1fc295 100644 --- a/fs/io_uring.c +++ b/io_uring/io_uring.c @@ -87,7 +87,7 @@ #include -#include "internal.h" +#include "../fs/internal.h" #include "io-wq.h" #define IORING_MAX_ENTRIES 32768 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecdcf..f35674e89621b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,7 @@ #include "stats.h" #include "../workqueue_internal.h" -#include "../../fs/io-wq.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" /* From dc919caff6b68dab1ae56cd341d96f50c2438aea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:30:37 -0600 Subject: [PATCH 003/172] io_uring: move req async preparation into opcode handler Define an io_op_def->prep_async() handler and push the async preparation to there. Since we now have that, we can drop ->needs_async_setup, as they mean the same thing. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f429b68d1fc295..f353822436c4ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1098,8 +1098,6 @@ struct io_op_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* do prep async if is going to be punted */ - unsigned needs_async_setup : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; /* skip auditing */ @@ -1113,6 +1111,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3916,7 +3915,7 @@ static inline bool io_alloc_async_data(struct io_kiocb *req) static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].needs_async_setup) + if (!force && !io_op_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; @@ -7828,31 +7827,14 @@ static int io_req_prep_async(struct io_kiocb *req) /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) + if (!def->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - return -EINVAL; + return def->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -12770,7 +12752,6 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - 
.needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12778,13 +12759,13 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_read, + .prep_async = io_readv_prep_async, }, [IORING_OP_WRITEV] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12792,6 +12773,7 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_write, + .prep_async = io_writev_prep_async, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12846,22 +12828,22 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12899,10 +12881,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, + .prep_async = io_connect_prep_async, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -13078,10 +13060,10 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .needs_async_setup = 1, .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, }, }; From f49eca21563b6d919d49828aaed9eab5d8090361 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 08:32:05 -0600 Subject: [PATCH 004/172] io_uring: add generic command payload type to struct io_kiocb Each opcode generally has a command structure in io_kiocb which it can use to store data associated with that request. In preparation for having the core layer not know about what's inside these fields, add a generic io_cmd_data type and put in the union as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f353822436c4ed..c596ffd92a0a00 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -954,14 +954,27 @@ enum { }; /* - * NOTE! Each of the iocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) + struct io_kiocb { union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct. 
+ */ struct file *file; + struct io_cmd_data cmd; struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; From 3c306fb2f9463fd0e0e896dde9e3e53097d0e41e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 06:57:44 -0600 Subject: [PATCH 005/172] io_uring: convert read/write path to use io_cmd_type Remove struct io_rw from io_kiocb, and convert the read/write path to use the io_cmd_type approach instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 194 ++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c596ffd92a0a00..db2880ccb22778 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -964,6 +964,7 @@ struct io_cmd_data { }; #define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) struct io_kiocb { union { @@ -975,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; @@ -3032,7 +3032,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); int ret; /* @@ -3043,7 +3043,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) @@ -3188,11 +3188,11 @@ static void kiocb_end_write(struct io_kiocb *req) #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; if (!req_has_async_data(req)) return !io_req_prep_async(req); - iov_iter_restore(&rw->s.iter, &rw->s.iter_state); + iov_iter_restore(&io->s.iter, &io->s.iter_state); return true; } @@ -3234,7 +3234,9 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) { + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); fsnotify_modify(req->file); } else { @@ -3276,7 +3278,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, static void io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (__io_complete_rw_common(req, res)) return; @@ -3287,7 +3290,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -3418,11 +3422,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); unsigned ioprio; int ret; - kiocb->ki_pos = 
READ_ONCE(sqe->off); + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); @@ -3444,14 +3448,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) return ret; - kiocb->ki_ioprio = ioprio; + rw->kiocb.ki_ioprio = ioprio; } else { - kiocb->ki_ioprio = get_current_ioprio(); + rw->kiocb.ki_ioprio = get_current_ioprio(); } - req->rw.addr = READ_ONCE(sqe->addr); - req->rw.len = READ_ONCE(sqe->len); - req->rw.flags = READ_ONCE(sqe->rw_flags); + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); return 0; } @@ -3478,18 +3482,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); - if (kiocb->ki_pos != -1) - return &kiocb->ki_pos; + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; if (!(req->file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; } - kiocb->ki_pos = 0; + rw->kiocb.ki_pos = 0; return NULL; } @@ -3497,6 +3501,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); /* add previously done IO, if any */ if (req_has_async_data(req) && io->bytes_done > 0) { @@ -3507,11 +3512,11 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = req->rw.kiocb.ki_pos; - if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) __io_complete_rw(req, ret, issue_flags); else - io_rw_done(&req->rw.kiocb, ret); + io_rw_done(&rw->kiocb, ret); if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; @@ -3522,11 +3527,12 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } } -static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - struct io_mapped_ubuf *imu) +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) { - size_t len = req->rw.len; - u64 buf_end, buf_addr = req->rw.addr; + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; size_t offset; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) @@ -3540,7 +3546,7 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter * and advance us to the beginning. 
*/ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* @@ -3682,12 +3688,13 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; size_t len; - uiov = u64_to_user_ptr(req->rw.addr); + uiov = u64_to_user_ptr(rw->addr); if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len)) @@ -3699,9 +3706,9 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; - req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = (compat_size_t) len; + rw->len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3709,7 +3716,8 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); void __user *buf; ssize_t len; @@ -3722,21 +3730,23 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; - req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = len; + rw->len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(req->rw.addr); - iov[0].iov_len = req->rw.len; + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; return 0; } - if (req->rw.len != 1) + if (rw->len != 1) return -EINVAL; #ifdef CONFIG_COMPAT @@ -3754,10 +3764,11 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; struct iovec *iovec; @@ -3766,25 +3777,25 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter, issue_flags); + ret = io_import_fixed(req, ddir, iter, issue_flags); if (ret) return ERR_PTR(ret); return NULL; } - buf = u64_to_user_ptr(req->rw.addr); - sqe_len = req->rw.len; + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (!buf) return ERR_PTR(-ENOBUFS); - req->rw.addr = (unsigned long) buf; - req->rw.len = sqe_len; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, 
s->fast_iov, iter); + ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); if (ret) return ERR_PTR(ret); return NULL; @@ -3795,11 +3806,11 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ret = io_iov_buffer_select(req, iovec, issue_flags); if (ret) return ERR_PTR(ret); - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); return NULL; } - ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); if (unlikely(ret < 0)) return ERR_PTR(ret); @@ -3827,10 +3838,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ -static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { - struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; ssize_t ret = 0; loff_t *ppos; @@ -3854,11 +3865,11 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); - iovec.iov_len = req->rw.len; + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; } - if (rw == READ) { + if (ddir == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, ppos); } else { @@ -3875,9 +3886,9 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.addr += nr; - req->rw.len -= nr; - if (!req->rw.len) + rw->addr += nr; + rw->len -= nr; + if (!rw->len) break; } if (nr != iovec.iov_len) @@ -3890,24 +3901,24 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - memcpy(&rw->s.iter, iter, sizeof(*iter)); - rw->free_iovec = iovec; - rw->bytes_done = 0; + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iov_iter_is_bvec(iter)) return; if (!iovec) { unsigned iov_off = 0; - rw->s.iter.iov = rw->s.fast_iov; + io->s.iter.iov = io->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->s.iter.iov += iov_off; + io->s.iter.iov += iov_off; } - if (rw->s.fast_iov != fast_iov) - memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; @@ -3989,6 +4000,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); struct wait_page_key *key = arg; wpq = container_of(wait, struct wait_page_queue, wait); @@ -3996,7 +4008,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; - req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; + rw->kiocb.ki_flags &= ~IOCB_WAITQ; 
list_del_init(&wait->entry); io_req_task_queue(req); return 1; @@ -4016,9 +4028,10 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, */ static bool io_rw_should_retry(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; - struct wait_page_queue *wait = &rw->wpq; - struct kiocb *kiocb = &req->rw.kiocb; + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; /* never retry for NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) @@ -4045,12 +4058,14 @@ static bool io_rw_should_retry(struct io_kiocb *req) return true; } -static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) { - if (likely(req->file->f_op->read_iter)) - return call_read_iter(req->file, &req->rw.kiocb, iter); - else if (req->file->f_op->read) - return loop_rw_iter(READ, req, iter); + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); else return -EINVAL; } @@ -4063,7 +4078,8 @@ static bool need_read_all(struct io_kiocb *req) static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; @@ -4075,7 +4091,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags); if (unlikely(ret)) return ret; @@ -4107,11 +4123,12 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *rw; + struct io_async_rw *io; ssize_t ret, ret2; loff_t *ppos; @@ -4120,8 +4137,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - rw = req->async_data; - s = &rw->s; + io = req->async_data; + s = &io->s; /* * Safe and required to re-import if we're using provided @@ -4168,7 +4185,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -4202,8 +4219,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret2; iovec = NULL; - rw = req->async_data; - s = &rw->s; + io = req->async_data; + s = &io->s; /* * Now use our persistent iterator and state, if we aren't already. * We've restored and mapped the iter to match. 
@@ -4218,7 +4235,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_advance(&s->iter, ret); if (!iov_iter_count(&s->iter)) break; - rw->bytes_done += ret; + io->bytes_done += ret; iov_iter_save_state(&s->iter, &s->iter_state); /* if we can retry, do so with the callbacks armed */ @@ -4233,7 +4250,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. */ @@ -4251,9 +4268,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) static int io_write(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; @@ -4263,9 +4281,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - s = &rw->s; + s = &io->s; iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } @@ -4315,7 +4333,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (likely(req->file->f_op->write_iter)) ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &s->iter); else ret2 = -EINVAL; From 8d4388d1166faa5e16fd100157c9cb3c9e982397 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:13:46 -0600 Subject: [PATCH 006/172] io_uring: convert poll path to use io_cmd_type Remove struct io_poll_iocb from io_kiocb, and convert the poll path to use the io_cmd_type approach instead. While at it, rename io_poll_iocb to io_poll which is consistent with the other request type private structures. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 ++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db2880ccb22778..7459a535060c33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -556,7 +556,7 @@ struct io_uring_task { * First field must be the file pointer in all the * iocb unions! 
See also 'struct kiocb' in */ -struct io_poll_iocb { +struct io_poll { struct file *file; struct wait_queue_head *head; __poll_t events; @@ -919,8 +919,8 @@ enum { }; struct async_poll { - struct io_poll_iocb poll; - struct io_poll_iocb *double_poll; + struct io_poll poll; + struct io_poll *double_poll; }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; @@ -6579,7 +6578,7 @@ static void io_poll_mark_cancelled(struct io_kiocb *req) atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } -static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) +static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) @@ -6587,10 +6586,10 @@ static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) return req->apoll->double_poll; } -static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) - return &req->poll; + return io_kiocb_to_cmd(req); return &req->apoll->poll; } @@ -6603,7 +6602,7 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { poll->head = NULL; @@ -6614,7 +6613,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, init_waitqueue_func_entry(&poll->wait, wake_func); } -static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); @@ -6740,7 +6739,9 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) return; if (!ret) { - req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; req_set_fail(req); @@ -6816,8 +6817,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) { @@ -6863,20 +6863,20 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, - struct io_poll_iocb **poll_ptr) + struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll_iocb + * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. 
*/ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *first = poll; + struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) @@ -6918,13 +6918,14 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); - __io_queue_proc(&pt->req->poll, pt, head, - (struct io_poll_iocb **) &pt->req->async_data); + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); } static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, + struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) { struct io_ring_ctx *ctx = req->ctx; @@ -7207,7 +7208,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) @@ -7225,13 +7226,13 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; - ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (!ret && ipt.error) req_set_fail(req); ret = ret ?: ipt.error; @@ -7260,9 +7261,11 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) if (req->poll_update.update_events || req->poll_update.update_user_data) { /* only mask one event flags, keep behavior flags */ if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= req->poll_update.events & 0xffff; + poll->events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) preq->cqe.user_data = req->poll_update.new_user_data; From c24b154967b6d335189eac25be59913538e2cdb4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:16:40 -0600 Subject: [PATCH 007/172] io_uring: convert poll_update path to use io_cmd_type Remove struct io_poll_update from io_kiocb, and convert the poll path to use the io_cmd_type approach instead. 
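
The overlay is only sound because of the invariant stated when io_cmd_data
was introduced: every per-opcode struct keeps the file pointer as its first
member and must not exceed io_cmd_data in size. A sketch of the compile-time
guards that invariant implies (hypothetical, not added by this patch):

    /* private struct must fit in the generic 64-byte payload and
     * keep the file pointer at offset 0 so 'req->file' still works */
    BUILD_BUG_ON(sizeof(struct io_poll_update) > sizeof(struct io_cmd_data));
    BUILD_BUG_ON(offsetof(struct io_poll_update, file) != 0);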
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7459a535060c33..9114d4a42f2bca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -7178,7 +7177,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, static int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_update *upd = &req->poll_update; + struct io_poll_update *upd = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) @@ -7243,7 +7242,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; @@ -7258,17 +7258,17 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } spin_unlock(&ctx->completion_lock); - if (req->poll_update.update_events || req->poll_update.update_user_data) { + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { + if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq); poll->events &= ~0xffff; - poll->events |= req->poll_update.events & 0xffff; + poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } - if (req->poll_update.update_user_data) - preq->cqe.user_data = req->poll_update.new_user_data; + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ From bd8587e4997a88635a37fac546223346e512a30b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:24:42 -0600 Subject: [PATCH 008/172] io_uring: remove recvmsg knowledge from io_arm_poll_handler() There's a special case for recvmsg with MSG_ERRQUEUE set. This is problematic as it means the core needs to know about this special request type. For now, just add a generic flag for it. 
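
The decoupling can be shown in isolation. Here is a small self-contained C sketch of the pattern this patch applies, with invented names throughout (the bit value of REQ_F_CLEAR_POLLIN and the _DEMO constants are stand-ins, not the real definitions): prep-time code, which is allowed to be opcode-specific, translates its special case into a generic request flag, and the shared poll-arming code then tests only that flag.

#include <stdio.h>

enum { REQ_F_CLEAR_POLLIN = 1u << 0 };    /* stand-in bit value */

#define MSG_ERRQUEUE_DEMO 0x2000          /* stand-in for MSG_ERRQUEUE */
#define EPOLLIN_DEMO      0x001           /* stand-in for EPOLLIN */

struct request {
    unsigned flags;
    unsigned msg_flags;
};

/* prep: the opcode-specific knowledge lives here... */
static void recvmsg_prep(struct request *req)
{
    if (req->msg_flags & MSG_ERRQUEUE_DEMO)
        req->flags |= REQ_F_CLEAR_POLLIN;
}

/* ...so the generic arming code never needs to ask "is this recvmsg?" */
static unsigned arm_poll_mask(const struct request *req, unsigned mask)
{
    if (req->flags & REQ_F_CLEAR_POLLIN)
        mask &= ~EPOLLIN_DEMO;
    return mask;
}

int main(void)
{
    struct request req = { .msg_flags = MSG_ERRQUEUE_DEMO };

    recvmsg_prep(&req);
    printf("mask=%#x\n", arm_poll_mask(&req, EPOLLIN_DEMO));
    return 0;
}
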
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9114d4a42f2bca..2c061424797d7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -846,6 +846,7 @@ enum { REQ_F_PARTIAL_IO_BIT, REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -916,6 +917,8 @@ enum { REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), /* ->extra1 and ->extra2 are initialised */ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), }; struct async_poll { @@ -6145,6 +6148,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -7023,8 +7028,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; From 8ff86d85b74d547a4e5f09acc884dd4cc173d087 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:27:38 -0600 Subject: [PATCH 009/172] io_uring: convert net related opcodes to use io_cmd_type This converts accept, connect, send/recv, sendmsg/recvmsg, shutdown, and socket to use io_cmd_type. 
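
All of these conversions follow the same prep/issue shape, which a compact userspace sketch can capture (struct sqe is heavily trimmed, and req_cmd(), op_shutdown and the field names are illustrative stand-ins, not the kernel definitions): prep validates the SQE and decodes it into the opcode's view of the payload area, and issue later reads only that decoded view.

#include <stdio.h>
#include <string.h>

struct sqe { int fd; unsigned len; unsigned rw_flags; };    /* trimmed */

struct request {
    unsigned char cmd[64];    /* generic payload, cf. io_cmd_data */
};

struct op_shutdown { int how; };    /* per-opcode view, cf. io_shutdown */

#define req_cmd(req) ((void *)(req)->cmd)

/* prep: reject SQE fields this opcode doesn't use, then decode */
static int shutdown_prep(struct request *req, const struct sqe *sqe)
{
    struct op_shutdown *sd = req_cmd(req);

    if (sqe->rw_flags)
        return -1;
    sd->how = (int)sqe->len;
    return 0;
}

/* issue: consume only the decoded view, never the raw SQE */
static int shutdown_issue(struct request *req)
{
    struct op_shutdown *sd = req_cmd(req);

    printf("shutdown(how=%d)\n", sd->how);
    return 0;
}

int main(void)
{
    struct request req;
    struct sqe sqe = { .fd = 3, .len = 2 };    /* 2 ~ SHUT_RDWR */

    memset(&req, 0, sizeof(req));
    if (shutdown_prep(&req, &sqe) == 0)
        shutdown_issue(&req);
    return 0;
}
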
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2c061424797d7c..6798859c601d65 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,13 +979,10 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_accept accept; struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; - struct io_connect connect; - struct io_sr_msg sr_msg; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -995,7 +992,6 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; - struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; struct io_mkdir mkdir; @@ -1003,7 +999,6 @@ struct io_kiocb { struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; - struct io_socket sock; struct io_uring_cmd uring_cmd; }; @@ -5824,16 +5819,19 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->shutdown.how = READ_ONCE(sqe->len); + shutdown->how = READ_ONCE(sqe->len); return 0; } static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); struct socket *sock; int ret; @@ -5844,7 +5842,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = __sys_shutdown_sock(sock, req->shutdown.how); + ret = __sys_shutdown_sock(sock, shutdown->how); io_req_complete(req, ret); return 0; } @@ -5881,10 +5879,12 @@ static int io_setup_async_msg(struct io_kiocb *req, static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, - req->sr_msg.msg_flags, &iomsg->free_iov); + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); } static int io_sendmsg_prep_async(struct io_kiocb *req) @@ -5899,7 +5899,7 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -5923,8 +5923,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5981,7 +5981,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_send(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct iovec iov; struct socket *sock; @@ -6039,7 +6039,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) static int __io_recvmsg_copy_hdr(struct 
io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct iovec __user *uiov; size_t iov_len; int ret; @@ -6072,7 +6072,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct compat_iovec __user *uiov; compat_uptr_t ptr; compat_size_t len; @@ -6135,7 +6135,7 @@ static int io_recvmsg_prep_async(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -6161,8 +6161,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned int cflags; unsigned flags; @@ -6238,7 +6238,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct socket *sock; struct iovec iov; @@ -6314,7 +6314,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); unsigned flags; if (sqe->len || sqe->buf_index) @@ -6348,7 +6348,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; bool fixed = !!accept->file_slot; @@ -6413,7 +6413,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; @@ -6434,7 +6434,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_socket(struct io_kiocb *req, unsigned int issue_flags) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); bool fixed = !!sock->file_slot; struct file *file; int ret, fd; @@ -6468,14 +6468,14 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) static int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -6487,6 +6487,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_connect(struct io_kiocb *req, unsigned int issue_flags) { + struct io_connect *connect = io_kiocb_to_cmd(req); struct io_async_connect __io, *io; unsigned file_flags; int ret; @@ -6495,8 +6496,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { io = req->async_data; } else { - ret = move_addr_to_kernel(req->connect.addr, - req->connect.addr_len, + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, &__io.address); if (ret) goto out; @@ -6506,7 +6507,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->address, - req->connect.addr_len, file_flags); + connect->addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { if (req_has_async_data(req)) return -EAGAIN; From e4a71006eace14a6699d66a514b6310cd8b57f7d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:30:45 -0600 Subject: [PATCH 010/172] io_uring: convert the sync and fallocate paths to use io_cmd_type They all share the same struct io_sync, convert them to use the io_cmd_type approach instead. 
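
A detail worth calling out: nothing requires a 1:1 mapping between opcodes and payload views. Several opcodes can share one view as long as each request only ever interprets the area one way, which is why fsync, sync_file_range and fallocate can all ride on struct io_sync. A rough stand-alone sketch of that sharing, with op_sync, req_cmd() and the opcode names invented for illustration:

#include <stdio.h>

/* One view shared by three "opcodes"; each uses a subset of the
 * fields, mirroring how struct io_sync is shared above. */
struct op_sync {
    long long off;
    long long len;
    int flags;    /* fsync / sync_file_range */
    int mode;     /* fallocate only */
};

struct request {
    unsigned opcode;
    unsigned char cmd[64];
};

enum { OP_FSYNC, OP_SFR, OP_FALLOCATE };

#define req_cmd(req) ((void *)(req)->cmd)

static void issue(struct request *req)
{
    struct op_sync *sync = req_cmd(req);

    switch (req->opcode) {
    case OP_FSYNC:
        printf("fsync [%lld, %lld)\n", sync->off, sync->off + sync->len);
        break;
    case OP_SFR:
        printf("sync_file_range flags=%#x\n", sync->flags);
        break;
    case OP_FALLOCATE:
        printf("fallocate mode=%d len=%lld\n", sync->mode, sync->len);
        break;
    }
}

int main(void)
{
    struct request req = { .opcode = OP_FALLOCATE };
    struct op_sync *sync = req_cmd(&req);

    sync->off = 0;
    sync->len = 4096;
    sync->mode = 0;
    issue(&req);
    return 0;
}
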
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6798859c601d65..b59edae29956f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; @@ -5085,30 +5084,32 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); return 0; } static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { - loff_t end = req->sync.off + req->sync.len; + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = sync->off + sync->len; int ret; /* fsync always requires a blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fsync_range(req->file, req->sync.off, - end > 0 ? end : LLONG_MAX, - req->sync.flags & IORING_FSYNC_DATASYNC); + ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); io_req_complete(req, ret); return 0; } @@ -5116,24 +5117,26 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->addr); - req->sync.mode = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); return 0; } static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* fallocate always requiring blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); @@ -5792,25 +5795,27 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); - req->sync.flags = READ_ONCE(sqe->sync_range_flags); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); return 0; } static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* sync_file_range always requires a blocking context */ if (issue_flags & 
IO_URING_F_NONBLOCK) return -EAGAIN; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, - req->sync.flags); + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_complete(req, ret); return 0; } From f38987f09a062c17e9c5119e23b41abc6326cd4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:33:01 -0600 Subject: [PATCH 011/172] io_uring: convert cancel path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b59edae29956f2..ef7b5430bb14bb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; struct io_open open; @@ -7699,19 +7698,21 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; - req->cancel.addr = READ_ONCE(sqe->addr); - req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~CANCEL_FLAGS) + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { - if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - req->cancel.fd = READ_ONCE(sqe->fd); + cancel->fd = READ_ONCE(sqe->fd); } return 0; @@ -7753,20 +7754,21 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .ctx = req->ctx, - .data = req->cancel.addr, - .flags = req->cancel.flags, + .data = cancel->addr, + .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cancel.fd, + req->file = io_file_get_fixed(req, cancel->fd, issue_flags); else - req->file = io_file_get_normal(req, req->cancel.fd); + req->file = io_file_get_normal(req, cancel->fd); if (!req->file) { ret = -EBADF; goto done; From a43714ace50d85b4cb19af46152f624affeef2d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:45:22 -0600 Subject: [PATCH 012/172] io_uring: convert timeout path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 117 +++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ef7b5430bb14bb..d0daa65f7138cc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_timeout timeout; - struct io_timeout_rem timeout_rem; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -1652,7 +1650,9 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) static inline bool io_is_timeout_noseq(struct io_kiocb *req) { - return !req->timeout.off; + 
struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; } static __cold void io_fallback_req_func(struct work_struct *work) @@ -1898,11 +1898,13 @@ static void io_kill_timeout(struct io_kiocb *req, int status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + if (status) req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); } } @@ -1925,10 +1927,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; if (io_is_timeout_noseq(req)) @@ -1941,7 +1944,7 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) * these subtractions won't have wrapped, so we can check if * target is in [last_seq, current_seq] by comparing the two. */ - events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; events_got = seq - ctx->cq_last_tm_flush; if (events_got < events_needed) break; @@ -2546,11 +2549,12 @@ static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); io_remove_next_linked(req); - link->timeout.head = NULL; + timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&link->timeout.list); + list_del(&timeout->list); return link; } } @@ -7302,11 +7306,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -7324,29 +7329,32 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { + struct io_timeout *timeout; struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != req->cqe.user_data) + cd->data != tmp->cqe.user_data) continue; if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) + if (cd->seq == tmp->work.cancel_seq) continue; - req->work.cancel_seq = cd->seq; + tmp->work.cancel_seq = cd->seq; } - found = true; + req = tmp; break; } - if (!found) + if (!req) return ERR_PTR(-ENOENT); io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return 
ERR_PTR(-EALREADY); - list_del_init(&req->timeout.list); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); return req; } @@ -7386,15 +7394,18 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->cqe.user_data; - if (found) + if (user_data == tmp->cqe.user_data) { + req = tmp; break; + } } - if (!found) + if (!req) return -ENOENT; io = req->async_data; @@ -7412,14 +7423,15 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, { struct io_cancel_data cd = { .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; if (IS_ERR(req)) return PTR_ERR(req); - req->timeout.off = 0; /* noseq */ + timeout->off = 0; /* noseq */ data = req->async_data; - list_add_tail(&req->timeout.list, &ctx->timeout_list); + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); @@ -7429,7 +7441,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; @@ -7469,11 +7481,11 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) */ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; int ret; - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { struct io_cancel_data cd = { .data = tr->addr, }; spin_lock(&ctx->completion_lock); @@ -7500,6 +7512,7 @@ static int __io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); @@ -7516,8 +7529,8 @@ static int __io_timeout_prep(struct io_kiocb *req, if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); - req->timeout.off = off; + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -7536,7 +7549,7 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); + INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -7547,7 +7560,7 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; - req->timeout.head = link->last; + timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; @@ 
-7567,10 +7580,11 @@ static int io_link_timeout_prep(struct io_kiocb *req, static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data = req->async_data; struct list_head *entry; - u32 tail, off = req->timeout.off; + u32 tail, off = timeout->off; spin_lock_irq(&ctx->timeout_lock); @@ -7585,7 +7599,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - req->timeout.target_seq = tail + off; + timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. * This is safe because ->completion_lock is held, and submissions @@ -7598,17 +7612,17 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, - timeout.list); + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nxt->timeout.target_seq - tail) + if (off >= nextt->target_seq - tail) break; } add: - list_add(&req->timeout.list, entry); + list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); @@ -8198,7 +8212,8 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { - struct io_kiocb *prev = req->timeout.prev; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; if (prev) { @@ -8222,12 +8237,13 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = req->timeout.head; - req->timeout.head = NULL; + prev = timeout->head; + timeout->head = NULL; /* * We don't expect the list to be empty, that will only happen if we @@ -8238,8 +8254,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - list_del(&req->timeout.list); - req->timeout.prev = prev; + list_del(&timeout->list); + timeout->prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; @@ -8249,6 +8265,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -8256,13 +8273,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer */ - if (req->timeout.head) { + if (timeout->head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - list_add_tail(&req->timeout.list, &ctx->ltimeout_list); + 
list_add_tail(&timeout->list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -10930,12 +10947,14 @@ static __cold void io_ring_exit_work(struct work_struct *work) static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; int canceled = 0; spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; From dd752582e398d22fe6798f065953f4266bf4f5db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:49:25 -0600 Subject: [PATCH 013/172] io_uring: convert open/close path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 72 +++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0daa65f7138cc..65eb41a60d741d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_open open; - struct io_close close; struct io_rsrc_update rsrc_update; struct io_fadvise fadvise; struct io_madvise madvise; @@ -5148,6 +5146,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); const char __user *fname; int ret; @@ -5157,38 +5156,40 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EBADF; /* open.how should be already initialised */ - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; - req->open.dfd = READ_ONCE(sqe->fd); + open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; return ret; } - req->open.file_slot = READ_ONCE(sqe->file_index); - if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + open->file_slot = READ_ONCE(sqe->file_index); + if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; - req->open.nofile = rlimit(RLIMIT_NOFILE); + open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); - req->open.how = build_open_how(flags, mode); + open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_how __user *how; size_t len; int ret; @@ -5198,8 +5199,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; - ret = 
copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, - len); + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; @@ -5261,35 +5261,36 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_flags op; struct file *file; bool resolve_nonblock, nonblock_set; - bool fixed = !!req->open.file_slot; + bool fixed = !!open->file_slot; int ret; - ret = build_open_flags(&req->open.how, &op); + ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always -EAGAIN */ - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) return -EAGAIN; op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } if (!fixed) { - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } - file = do_filp_open(req->open.dfd, req->open.filename, &op); + file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like @@ -5315,9 +5316,9 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, - req->open.file_slot); + open->file_slot); err: - putname(req->open.filename); + putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); @@ -5737,14 +5738,16 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_close *close = io_kiocb_to_cmd(req); + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->close.fd = READ_ONCE(sqe->fd); - req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) return -EINVAL; return 0; @@ -5753,12 +5756,12 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; - struct io_close *close = &req->close; + struct io_close *close = io_kiocb_to_cmd(req); struct fdtable *fdt; struct file *file; int ret = -EBADF; - if (req->close.file_slot) { + if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } @@ -7982,10 +7985,13 @@ static void io_clean_op(struct io_kiocb *req) break; } case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - if (req->open.filename) - putname(req->open.filename); + case IORING_OP_OPENAT2: { + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); break; + } case IORING_OP_RENAMEAT: putname(req->rename.oldpath); putname(req->rename.newpath); @@ -9831,7 +9837,9 @@ static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, static inline int io_close_fixed(struct io_kiocb *req, unsigned 
int issue_flags) { - return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); } static int __io_sqe_files_update(struct io_ring_ctx *ctx, From 37d4842f11c5a8d4eeb8e85bf2120a167d174456 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:51:05 -0600 Subject: [PATCH 014/172] io_uring: convert madvise/fadvise to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65eb41a60d741d..60d462d7c8479b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_fadvise fadvise; - struct io_madvise madvise; struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; @@ -5629,12 +5627,14 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; - req->madvise.addr = READ_ONCE(sqe->addr); - req->madvise.len = READ_ONCE(sqe->len); - req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + ma->addr = READ_ONCE(sqe->addr); + ma->len = READ_ONCE(sqe->len); + ma->advice = READ_ONCE(sqe->fadvise_advice); return 0; #else return -EOPNOTSUPP; @@ -5644,7 +5644,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = &req->madvise; + struct io_madvise *ma = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5660,18 +5660,20 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_fadvise *fa = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; - req->fadvise.offset = READ_ONCE(sqe->off); - req->fadvise.len = READ_ONCE(sqe->len); - req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + fa->offset = READ_ONCE(sqe->off); + fa->len = READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); return 0; } static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { - struct io_fadvise *fa = &req->fadvise; + struct io_fadvise *fa = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) { From bb040a21fd0579ec7575a0ede7185d2504c021b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:59:28 -0600 Subject: [PATCH 015/172] io_uring: convert file system request types to use io_cmd_type This converts statx, rename, unlink, mkdir, symlink, and hardlink to use io_cmd_type. 
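
These opcodes are also the interesting case for io_clean_op(), since they stash getname() results in their payload view and must release them on the same path. A self-contained sketch of that ownership pattern, with strdup()/free() standing in for getname()/putname() and all other names invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct op_rename { char *oldpath; char *newpath; };

struct request {
    unsigned char cmd[64];
    unsigned needs_cleanup;    /* cf. REQ_F_NEED_CLEANUP */
};

#define req_cmd(req) ((void *)(req)->cmd)

static int rename_prep(struct request *req, const char *o, const char *n)
{
    struct op_rename *ren = req_cmd(req);

    ren->oldpath = strdup(o);
    ren->newpath = strdup(n);
    req->needs_cleanup = 1;
    return (ren->oldpath && ren->newpath) ? 0 : -1;
}

/* Mirrors the io_clean_op() hunks above: cleanup also reads through
 * the per-opcode view, so io_kiocb needs no named members for it. */
static void clean_op(struct request *req)
{
    struct op_rename *ren = req_cmd(req);

    if (!req->needs_cleanup)
        return;
    free(ren->oldpath);    /* free(NULL) is a no-op */
    free(ren->newpath);
    req->needs_cleanup = 0;
}

int main(void)
{
    struct request req = { 0 };

    if (rename_prep(&req, "/tmp/a", "/tmp/b") == 0) {
        struct op_rename *ren = req_cmd(&req);

        printf("rename %s -> %s\n", ren->oldpath, ren->newpath);
    }
    clean_op(&req);
    return 0;
}
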
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 102 +++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 60d462d7c8479b..38c34da088b5ac 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -983,12 +983,6 @@ struct io_kiocb { struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; - struct io_statx statx; - struct io_rename rename; - struct io_unlink unlink; - struct io_mkdir mkdir; - struct io_symlink symlink; - struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; @@ -4367,7 +4361,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) static int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->buf_index || sqe->splice_fd_in) @@ -4397,7 +4391,7 @@ static int io_renameat_prep(struct io_kiocb *req, static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4654,7 +4648,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) @@ -4679,7 +4673,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4698,7 +4692,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4720,7 +4714,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4736,7 +4730,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4764,7 +4758,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4780,7 +4774,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 
@@ -4810,7 +4804,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5696,6 +5690,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_statx *sx = io_kiocb_to_cmd(req); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) @@ -5703,20 +5698,20 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->statx.dfd = READ_ONCE(sqe->fd); - req->statx.mask = READ_ONCE(sqe->len); + sx->dfd = READ_ONCE(sqe->fd); + sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->statx.flags = READ_ONCE(sqe->statx_flags); + sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + sx->flags = READ_ONCE(sqe->statx_flags); - req->statx.filename = getname_flags(path, - getname_statx_lookup_flags(req->statx.flags), - NULL); + sx->filename = getname_flags(path, + getname_statx_lookup_flags(sx->flags), + NULL); - if (IS_ERR(req->statx.filename)) { - int ret = PTR_ERR(req->statx.filename); + if (IS_ERR(sx->filename)) { + int ret = PTR_ERR(sx->filename); - req->statx.filename = NULL; + sx->filename = NULL; return ret; } @@ -5726,14 +5721,13 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_statx(struct io_kiocb *req, unsigned int issue_flags) { - struct io_statx *ctx = &req->statx; + struct io_statx *sx = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, - ctx->buffer); + ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_complete(req, ret); return 0; } @@ -7994,28 +7988,46 @@ static void io_clean_op(struct io_kiocb *req) putname(open->filename); break; } - case IORING_OP_RENAMEAT: - putname(req->rename.oldpath); - putname(req->rename.newpath); + case IORING_OP_RENAMEAT: { + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); break; - case IORING_OP_UNLINKAT: - putname(req->unlink.filename); + } + case IORING_OP_UNLINKAT: { + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); break; - case IORING_OP_MKDIRAT: - putname(req->mkdir.filename); + } + case IORING_OP_MKDIRAT: { + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); break; - case IORING_OP_SYMLINKAT: - putname(req->symlink.oldpath); - putname(req->symlink.newpath); + } + case IORING_OP_SYMLINKAT: { + struct io_symlink *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); break; - case IORING_OP_LINKAT: - putname(req->hardlink.oldpath); - putname(req->hardlink.newpath); + } + case IORING_OP_LINKAT: { + struct io_hardlink *hl = io_kiocb_to_cmd(req); + + putname(hl->oldpath); + putname(hl->newpath); break; - case IORING_OP_STATX: - if (req->statx.filename) - putname(req->statx.filename); + } + case IORING_OP_STATX: { + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); break; + } case IORING_OP_SETXATTR: case IORING_OP_FSETXATTR: case IORING_OP_GETXATTR: From 3e93a3571a17022d8bde08249e468349da6b7305 Mon Sep 17 00:00:00 2001 From: Jens 
Axboe Date: Tue, 24 May 2022 10:01:09 -0600 Subject: [PATCH 016/172] io_uring: convert epoll to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 38c34da088b5ac..bcdc6ed7f46bb7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; @@ -5577,18 +5576,20 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) + struct io_epoll *epoll = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; - req->epoll.epfd = READ_ONCE(sqe->fd); - req->epoll.op = READ_ONCE(sqe->len); - req->epoll.fd = READ_ONCE(sqe->off); + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); - if (ep_op_has_event(req->epoll.op)) { + if (ep_op_has_event(epoll->op)) { struct epoll_event __user *ev; ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) return -EFAULT; } @@ -5601,7 +5602,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_EPOLL) - struct io_epoll *ie = &req->epoll; + struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; From 2511d3030c5eaea37baee030cb36e70426ad4e6a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:01:47 -0600 Subject: [PATCH 017/172] io_uring: convert splice to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcdc6ed7f46bb7..d0251d0744494d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; struct io_xattr xattr; @@ -4918,7 +4917,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; sp->len = READ_ONCE(sqe->len); @@ -4939,7 +4938,7 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; struct file *in; @@ -4971,7 +4970,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); sp->off_in = READ_ONCE(sqe->splice_off_in); sp->off_out = READ_ONCE(sqe->off); @@ -4980,7 +4979,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = 
&req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; From c1ee5595015502dedd2d602ecc65bff9907fb14b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:03:49 -0600 Subject: [PATCH 018/172] io_uring: convert msg and nop to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0251d0744494d..41253ef58ada7e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_provide_buf pbuf; - struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -5030,19 +5028,21 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_msg *msg = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; - req->msg.user_data = READ_ONCE(sqe->off); - req->msg.len = READ_ONCE(sqe->len); + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); return 0; } static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { + struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - struct io_msg *msg = &req->msg; bool filled; int ret; @@ -5324,7 +5324,7 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || @@ -5381,7 +5381,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; @@ -5409,7 +5409,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->splice_fd_in) @@ -5528,7 +5528,7 @@ static __cold int io_init_bl_list(struct io_ring_ctx *ctx) static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; From ea5af87d29cfe7323f9a401539c526bdd43416a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:05:49 -0600 Subject: [PATCH 019/172] io_uring: convert rsrc_update to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 41253ef58ada7e..be8da26f70d6d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rsrc_update rsrc_update; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -7800,23 +7799,26 @@ 
static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->rsrc_update.offset = READ_ONCE(sqe->off); - req->rsrc_update.nr_args = READ_ONCE(sqe->len); - if (!req->rsrc_update.nr_args) + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) return -EINVAL; - req->rsrc_update.arg = READ_ONCE(sqe->addr); + up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { - __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; @@ -7824,7 +7826,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, if (!req->ctx->file_data) return -ENXIO; - for (done = 0; done < req->rsrc_update.nr_args; done++) { + for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; @@ -7853,23 +7855,24 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up; + struct io_uring_rsrc_update2 up2; int ret; - up.offset = req->rsrc_update.offset; - up.data = req->rsrc_update.arg; - up.nr = 0; - up.tags = 0; - up.resv = 0; - up.resv2 = 0; + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; - if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); + &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } From ceb452e1b4ba4ab207dfe119c47bf61d4519dc2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:06:46 -0600 Subject: [PATCH 020/172] io_uring: convert xattr to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index be8da26f70d6d6..0bb3c63f38696c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -4402,7 +4401,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) static inline void __io_xattr_finish(struct io_kiocb *req) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); if (ix->filename) putname(ix->filename); @@ -4422,7 +4421,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) static int __io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4465,7 +4464,7 @@ static int io_fgetxattr_prep(struct io_kiocb *req, static int io_getxattr_prep(struct io_kiocb *req, 
const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4486,7 +4485,7 @@ static int io_getxattr_prep(struct io_kiocb *req, static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4502,7 +4501,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; @@ -4531,7 +4530,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) static int __io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4562,7 +4561,7 @@ static int __io_setxattr_prep(struct io_kiocb *req, static int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4590,7 +4589,7 @@ static int io_fsetxattr_prep(struct io_kiocb *req, static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, struct path *path) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; ret = mnt_want_write(path->mnt); @@ -4617,7 +4616,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; From 9a3a11f977f9972b812cc8666d1ffdb93699bd92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:09:32 -0600 Subject: [PATCH 021/172] io_uring: convert iouring_cmd to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0bb3c63f38696c..21246d2e6221ff 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -4814,15 +4813,17 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { - req->uring_cmd.task_work_cb(&req->uring_cmd); + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); } void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - req->uring_cmd.task_work_cb = task_work_cb; + ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; io_req_task_work_add(req); } @@ -4842,7 +4843,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, */ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); if (ret < 0) req_set_fail(req); @@ 
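/*
 * A hedged aside on cmd_to_io_kiocb(), used just above: it can be a
 * plain cast rather than a container_of() only because the cmd union is
 * the first member of struct io_kiocb, so the per-opcode data and the
 * request share an address. A sketch of the round trip (the demo
 * function is illustrative, not part of the series):
 */
static void io_demo_round_trip(struct io_kiocb *req)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);

	/* holds by construction, since &req->cmd == (void *)req */
	WARN_ON_ONCE(cmd_to_io_kiocb(ioucmd) != req);
}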
-4855,18 +4856,19 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done); static int io_uring_cmd_prep_async(struct io_kiocb *req) { + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); size_t cmd_size; cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + memcpy(req->async_data, ioucmd->cmd, cmd_size); return 0; } static int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); if (sqe->rw_flags || sqe->__pad1) return -EINVAL; @@ -4877,7 +4879,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req, static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; From 890968dc03361b92957e183fe8f692371e4ef18b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:19:47 -0600 Subject: [PATCH 022/172] io_uring: unify struct io_symlink and io_hardlink They are really just a subset of each other, just use the one type. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 21246d2e6221ff..808acb854b660d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -761,14 +761,7 @@ struct io_mkdir { struct filename *filename; }; -struct io_symlink { - struct file *file; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; -}; - -struct io_hardlink { +struct io_link { struct file *file; int old_dfd; int new_dfd; @@ -4723,7 +4716,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4751,7 +4744,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4767,7 +4760,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4797,7 +4790,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -8011,15 +8004,9 @@ static void io_clean_op(struct io_kiocb *req) putname(md->filename); break; } - case IORING_OP_SYMLINKAT: { - struct io_symlink *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); - break; - } + case IORING_OP_SYMLINKAT: case IORING_OP_LINKAT: { - struct io_hardlink *hl = io_kiocb_to_cmd(req); + struct io_link *hl = io_kiocb_to_cmd(req); putname(hl->oldpath); putname(hl->newpath); From 
4d4c9cff4f702d10f65473a6b4994ce1a13e64ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:26:28 -0600 Subject: [PATCH 023/172] io_uring: define a request type cleanup handler This can move request type specific cleanup into a private handler, removing the need for the core io_uring parts to know what types they are dealing with. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 155 ++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 69 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 808acb854b660d..75d8c31a59d50e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1093,6 +1093,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3433,6 +3434,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) { switch (ret) { @@ -4391,7 +4399,15 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static inline void __io_xattr_finish(struct io_kiocb *req) +static void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +static inline void io_xattr_cleanup(struct io_kiocb *req) { struct io_xattr *ix = io_kiocb_to_cmd(req); @@ -4406,7 +4422,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) { req->flags &= ~REQ_F_NEED_CLEANUP; - __io_xattr_finish(req); + io_xattr_cleanup(req); io_req_complete(req, ret); } @@ -4675,6 +4691,13 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); +} + static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4713,6 +4736,13 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4804,6 +4834,14 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} + static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); @@ -5314,6 +5352,14 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) return io_openat2(req, issue_flags); } +static void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5725,6 +5771,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_statx_cleanup(struct io_kiocb *req) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); +} + 
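/*
 * A hedged aside on where these handlers land: once every
 * resource-owning opcode supplies a cleanup callback like the ones
 * added above, the core's io_clean_op() (see the hunk further below)
 * reduces to
 *
 *	if (def->cleanup)
 *		def->cleanup(req);
 *
 * and a new opcode only has to fill in its io_op_defs[] slot, e.g.
 * (IORING_OP_DEMO and the io_demo_* handlers are hypothetical):
 *
 *	[IORING_OP_DEMO] = {
 *		.prep		= io_demo_prep,
 *		.issue		= io_demo,
 *		.cleanup	= io_demo_cleanup,
 *	},
 */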
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req); @@ -5897,6 +5951,13 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) return ret; } +static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -7958,74 +8019,10 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); - break; - } - case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: { - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); - break; - } - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: { - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); - break; - } - case IORING_OP_RENAMEAT: { - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); - break; - } - case IORING_OP_UNLINKAT: { - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); - break; - } - case IORING_OP_MKDIRAT: { - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); - break; - } - case IORING_OP_SYMLINKAT: - case IORING_OP_LINKAT: { - struct io_link *hl = io_kiocb_to_cmd(req); - - putname(hl->oldpath); - putname(hl->newpath); - break; - } - case IORING_OP_STATX: { - struct io_statx *sx = io_kiocb_to_cmd(req); + const struct io_op_def *def = &io_op_defs[req->opcode]; - if (sx->filename) - putname(sx->filename); - break; - } - case IORING_OP_SETXATTR: - case IORING_OP_FSETXATTR: - case IORING_OP_GETXATTR: - case IORING_OP_FGETXATTR: - __io_xattr_finish(req); - break; - } + if (def->cleanup) + def->cleanup(req); } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); @@ -12838,6 +12835,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -12852,6 +12850,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12911,6 +12910,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = io_sendmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -12922,6 +12924,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12972,6 +12977,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT] = { .prep = io_openat_prep, .issue = io_openat, + .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, @@ -12987,6 +12993,7 @@ static const struct io_op_def io_op_defs[] = { .audit_skip = 1, .prep = io_statx_prep, .issue = io_statx, + .cleanup = io_statx_cleanup, }, 
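/*
 * A brief aside on the #if defined(CONFIG_NET) guards around the
 * SENDMSG and RECVMSG .cleanup assignments above: presumably
 * io_sendmsg_recvmsg_cleanup() is only compiled when networking support
 * is built in, so those table entries must be guarded the same way,
 * while handlers that always exist (io_statx_cleanup() and friends)
 * need no guard.
 */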
[IORING_OP_READ] = { .needs_file = 1, @@ -13046,6 +13053,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, .issue = io_openat2, + .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -13089,22 +13097,27 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, .issue = io_renameat, + .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { .prep = io_unlinkat_prep, .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { .prep = io_mkdirat_prep, .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { .prep = io_symlinkat_prep, .issue = io_symlinkat, + .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { .prep = io_linkat_prep, .issue = io_linkat, + .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, @@ -13116,19 +13129,23 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .prep = io_fsetxattr_prep, .issue = io_fsetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { .prep = io_setxattr_prep, .issue = io_setxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, .prep = io_fgetxattr_prep, .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { .prep = io_getxattr_prep, .issue = io_getxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, From e27f928ee1cb068ac6c18ea244351a4c90a61139 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:56:14 -0600 Subject: [PATCH 024/172] io_uring: add io_uring_types.h This adds definitions of structs that both the core and the various opcode handlers need to know about. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 491 +------------------------------------ io_uring/io_uring_types.h | 496 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 499 insertions(+), 488 deletions(-) create mode 100644 io_uring/io_uring_types.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75d8c31a59d50e..ff7886c35490ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,8 @@ #include "../fs/internal.h" #include "io-wq.h" +#include "io_uring_types.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -122,89 +124,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. - */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. 
get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -252,12 +171,6 @@ struct io_rsrc_put { }; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_rsrc_node { struct percpu_ref refs; struct list_head node; @@ -310,14 +223,6 @@ struct io_buffer { __u16 bgid; }; -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - enum { IO_SQ_THREAD_SHOULD_STOP = 0, IO_SQ_THREAD_SHOULD_PARK, @@ -347,186 +252,7 @@ struct io_sq_data { #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. 
- */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. 
- */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - - /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - u32 iowq_limits[2]; - bool iowq_limits_set; - }; -}; +#define BGID_ARRAY 64 /* * Arbitrary limit, can be raised if need be @@ -808,232 +534,21 @@ struct io_xattr { struct filename *filename; }; -enum { - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, - - /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_BIT = 8, - REQ_F_INFLIGHT_BIT, - REQ_F_CUR_POS_BIT, - REQ_F_NOWAIT_BIT, - REQ_F_LINK_TIMEOUT_BIT, - REQ_F_NEED_CLEANUP_BIT, - REQ_F_POLLED_BIT, - REQ_F_BUFFER_SELECTED_BIT, - REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, - REQ_F_REISSUE_BIT, - REQ_F_CREDS_BIT, - REQ_F_REFCOUNT_BIT, - REQ_F_ARM_LTIMEOUT_BIT, - REQ_F_ASYNC_DATA_BIT, - REQ_F_SKIP_LINK_CQES_BIT, - REQ_F_SINGLE_POLL_BIT, - REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, - REQ_F_APOLL_MULTISHOT_BIT, - REQ_F_CLEAR_POLLIN_BIT, - /* keep async read/write and isreg together and in order */ - REQ_F_SUPPORT_NOWAIT_BIT, - REQ_F_ISREG_BIT, - - /* not a real bit, just to check we're not overflowing the space */ - __REQ_F_LAST_BIT, -}; - -enum { - /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), - /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), - /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), - /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), - /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), - /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), - - /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), - /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), - /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), - /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* already went through poll handler */ - 
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), - /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), - /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), - /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), - /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), - /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), - /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), - /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), - /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), - /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), - /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); - -struct io_task_work { - union { - struct io_wq_work_node node; - struct llist_node fallback_node; - }; - io_req_tw_func_t func; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; -struct io_cqe { - __u64 user_data; - __s32 res; - /* fd initially, then cflags for completion */ - union { - __u32 flags; - int fd; - }; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; -/* - * Each request type overlays its private data structure on top of this one. - * They must not exceed this one in size. - */ -struct io_cmd_data { - struct file *file; - /* each command gets 56 bytes of data */ - __u8 data[56]; -}; - -#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) - -struct io_kiocb { - union { - /* - * NOTE! Each of the io_kiocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. - */ - struct file *file; - struct io_cmd_data cmd; - }; - - u8 opcode; - /* polled IO has completed */ - u8 iopoll_completed; - /* - * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. - */ - u16 buf_index; - unsigned int flags; - - struct io_cqe cqe; - - struct io_ring_ctx *ctx; - struct task_struct *task; - - struct io_rsrc_node *rsrc_node; - - union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; - - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. 
- */ - struct io_buffer_list *buf_list; - }; - - union { - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; - /* cache ->apoll->events */ - __poll_t apoll_events; - }; - atomic_t refs; - atomic_t poll_refs; - struct io_task_work io_task_work; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; - /* internal polling, see IORING_FEAT_FAST_POLL */ - struct async_poll *apoll; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ - struct io_kiocb *link; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; - struct io_wq_work work; -}; - struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h new file mode 100644 index 00000000000000..1a0f592ff6fc35 --- /dev/null +++ b/io_uring/io_uring_types.h @@ -0,0 +1,496 @@ +#ifndef IO_URING_TYPES_H +#define IO_URING_TYPES_H + +#include +#include + +#include "io-wq.h" + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +/* + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ +struct io_rings { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. + */ + struct io_uring sq, cq; + /* + * Bitmasks to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ + u32 sq_dropped; + /* + * Runtime SQ flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ + atomic_t sq_flags; + /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. + */ + u32 cq_flags; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending than there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ + u32 cq_overflow; + /* + * Ring buffer of completion events. 
+ * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; +}; + +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + +struct io_submit_state { + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; + struct io_submit_link link; + + bool plug_started; + bool need_plug; + bool flush_cqes; + unsigned short submit_nr; + struct blk_plug plug; +}; + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +struct io_ring_ctx { + /* const or read-mostly hot data */ + struct { + struct percpu_ref refs; + + struct io_rings *rings; + unsigned int flags; + enum task_work_notify_mode notify_method; + unsigned int compat: 1; + unsigned int drain_next: 1; + unsigned int restricted: 1; + unsigned int off_timeout_used: 1; + unsigned int drain_active: 1; + unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; + } ____cacheline_aligned_in_smp; + + /* submission data */ + struct { + struct mutex uring_lock; + + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. 
+ */ + u32 *sq_array; + struct io_uring_sqe *sq_sqes; + unsigned cached_sq_head; + unsigned sq_entries; + struct list_head defer_list; + + /* + * Fixed resources fast path, should be accessed only under + * uring_lock, and updated through io_uring_register(2) + */ + struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; + atomic_t cancel_seq; + struct io_file_table file_table; + unsigned nr_user_files; + unsigned nr_user_bufs; + struct io_mapped_ubuf **user_bufs; + + struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + + struct list_head timeout_list; + struct list_head ltimeout_list; + struct list_head cq_overflow_list; + struct list_head apoll_cache; + struct xarray personalities; + u32 pers_next; + unsigned sq_thread_idle; + } ____cacheline_aligned_in_smp; + + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned long check_cq; + + struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + + unsigned cached_cq_tail; + unsigned cq_entries; + struct io_ev_fd __rcu *io_ev_fd; + struct wait_queue_head cq_wait; + unsigned cq_extra; + atomic_t cq_timeouts; + unsigned cq_last_tm_flush; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + + spinlock_t timeout_lock; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. 
+ */ + struct io_wq_work_list iopoll_list; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; + bool poll_multi_queue; + + struct list_head io_buffers_comp; + } ____cacheline_aligned_in_smp; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct { + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + }; + + /* Keep this last, we don't need it for the fast path */ + struct { + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; + }; +}; + +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, + + /* first byte is taken by user flags, shift it to not overlap */ + REQ_F_FAIL_BIT = 8, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_NEED_CLEANUP_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, + REQ_F_COMPLETE_INLINE_BIT, + REQ_F_REISSUE_BIT, + REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, + REQ_F_CQE32_INIT_BIT, + REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_SUPPORT_NOWAIT_BIT, + REQ_F_ISREG_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + + /* fail rest of links */ + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + /* on inflight list, should be cancelled and waited on exit reliably */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* has or had linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* needs cleanup */ + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer 
selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + /* completion is deferred through io_comp_state */ + REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), + /* caller should reissue async */ + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* has creds assigned */ + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + /* ->extra1 and ->extra2 are initialised */ + REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), +}; + +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); + +struct io_task_work { + union { + struct io_wq_work_node node; + struct llist_node fallback_node; + }; + io_req_tw_func_t func; +}; + +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +/* + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. + */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +struct io_kiocb { + union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct. + */ + struct file *file; + struct io_cmd_data cmd; + }; + + u8 opcode; + /* polled IO has completed */ + u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ + u16 buf_index; + unsigned int flags; + + struct io_cqe cqe; + + struct io_ring_ctx *ctx; + struct task_struct *task; + + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; + + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + __poll_t apoll_events; + }; + atomic_t refs; + atomic_t poll_refs; + struct io_task_work io_task_work; + /* for polled requests, i.e. 
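/*
 * A hedged aside on the two helper macros above: io_kiocb_to_cmd()
 * returns void * so that each handler can assign the result straight to
 * its own command type, e.g.
 *
 *	struct io_xattr *ix = io_kiocb_to_cmd(req);
 *
 * with no cast at the call site. Type safety rests entirely on the
 * opcode dispatch pairing a request with the handler whose struct was
 * overlaid on the 56-byte cmd area.
 */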
IORING_OP_POLL_ADD and async armed poll */ + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; + /* internal polling, see IORING_FEAT_FAST_POLL */ + struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ + const struct cred *creds; + struct io_wq_work work; +}; + +#endif From de23077eda61f549dbdadc4b6aa95f6e7b251552 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 12:45:38 -0600 Subject: [PATCH 025/172] io_uring: set completion results upfront Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++------------ io_uring/io_uring.h | 13 +++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-) create mode 100644 io_uring/io_uring.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ff7886c35490ab..92e51bbb769c96 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io-wq.h" #include "io_uring_types.h" +#include "io_uring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -1876,21 +1877,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) io_cqring_ev_posted(ctx); } -static inline void io_req_complete_state(struct io_kiocb *req, s32 res, - u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, s32 res, u32 cflags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_complete_state(req, res, cflags); - else + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_set_res(req, res, cflags); + req->flags |= REQ_F_COMPLETE_INLINE; + } else { io_req_complete_post(req, res, cflags); + } } static inline void io_req_complete(struct io_kiocb *req, s32 res) @@ -2749,7 +2744,8 @@ static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) int res = req->cqe.res; if (*locked) { - io_req_complete_state(req, res, io_put_kbuf(req, 0)); + io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { io_req_complete_post(req, res, @@ -4394,6 +4390,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) if (ret < 0) req_set_fail(req); + io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); io_req_complete(req, ret); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h new file mode 100644 index 00000000000000..522e652197572e --- /dev/null +++ b/io_uring/io_uring.h @@ -0,0 +1,13 @@ +#ifndef IOU_CORE_H +#define IOU_CORE_H + +#include +#include "io_uring_types.h" + +static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; +} + +#endif From 97b388d70b53fd7d286ac1b81e5a88bd6af98209 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 15:21:00 -0600 Subject: [PATCH 026/172] io_uring: handle completions in the core Normally request handlers complete requests themselves, if they don't return an error. For the latter case, the core will complete it for them. This is unhandy for pushing opcode handlers further out, as we don't want a bunch of inline completion code and we don't want to make the completion path slower than it is now. 
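For illustration, a hedged sketch of the resulting handler contract (io_demo and do_demo_work are hypothetical; the IOU_* return codes are the ones this patch introduces): a handler records its result with io_req_set_res(req, res, cflags) and returns IOU_OK so the core posts the CQE, or returns IOU_ISSUE_SKIP_COMPLETE when the completion happens elsewhere.

	static int io_demo(struct io_kiocb *req, unsigned int issue_flags)
	{
		int ret = do_demo_work(req);	/* hypothetical helper */

		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;	/* completed later */

		io_req_set_res(req, ret, 0);
		return IOU_OK;	/* core posts the completion */
	}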
Let the core handle any completion, unless the handler explicitly asks us not to. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 271 ++++++++++++++++++++++---------------------- io_uring/io_uring.h | 5 + 2 files changed, 142 insertions(+), 134 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92e51bbb769c96..7b8f1c9b7b48ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -625,7 +625,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1126,7 +1125,7 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->cqe.res = res; + io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) @@ -1855,50 +1854,37 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE_SKIP)) { - req->cqe.res = res; - req->cqe.flags = cflags; + if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); - } __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) +static void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_req_complete_post(req, res, cflags); + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - s32 res, u32 cflags) +static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) { - io_req_set_res(req, res, cflags); + if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; - } else { - io_req_complete_post(req, res, cflags); - } -} - -static inline void io_req_complete(struct io_kiocb *req, s32 res) -{ - if (res < 0) - req_set_fail(req); - __io_req_complete(req, 0, res, 0); + else + io_req_complete_post(req); } static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_complete_post(req); } /* @@ -2071,7 +2057,8 @@ static void io_fail_links(struct io_kiocb *req) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - __io_req_complete_post(link, res, 0); + io_req_set_res(link, res, 0); + __io_req_complete_post(link); link = nxt; } } @@ -2185,11 +2172,12 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (unlikely(!*uring_locked)) spin_lock(&(*ctx)->completion_lock); } - if (likely(*uring_locked)) + if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); - else - __io_req_complete_post(req, req->cqe.res, - io_put_kbuf_comp(req)); + } else { + req->cqe.flags = io_put_kbuf_comp(req); + __io_req_complete_post(req); + } node = next; } while (node); @@ -2317,13 +2305,12 @@ 
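/*
 * A short aside on the handle_prev_tw_list() hunk above, which shows
 * the payoff of recording results upfront (patch 025): because res is
 * already stored in req->cqe at completion time, this deferred
 * task-work path only fills in the buffer flags before posting,
 *
 *	req->cqe.flags = io_put_kbuf_comp(req);
 *	__io_req_complete_post(req);
 *
 * and no result value has to be threaded through the call chain.
 */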
static void io_req_task_prio_work_add(struct io_kiocb *req) static void io_req_tw_post(struct io_kiocb *req, bool *locked) { - io_req_complete_post(req, req->cqe.res, req->cqe.flags); + io_req_complete_post(req); } static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { - req->cqe.res = res; - req->cqe.flags = cflags; + io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; io_req_task_work_add(req); } @@ -2347,7 +2334,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->cqe.res = ret; + io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } @@ -2741,15 +2728,13 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->cqe.res; - if (*locked) { - io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->cqe.flags |= io_put_kbuf(req, 0); req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, - io_put_kbuf(req, IO_URING_F_UNLOCKED)); + req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + io_req_complete_post(req); } } @@ -2758,8 +2743,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->cqe.res, - io_put_kbuf(req, issue_flags)); + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2769,7 +2754,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->cqe.res = res; + io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; io_req_task_prio_work_add(req); } @@ -3745,7 +3730,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) */ ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) - return 0; + return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. 
*/ kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&s->iter, &s->iter_state); @@ -3756,7 +3741,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -3850,6 +3835,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto copy_iov; done: kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); @@ -3906,8 +3892,8 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_renameat_cleanup(struct io_kiocb *req) @@ -3934,7 +3920,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) req->flags &= ~REQ_F_NEED_CLEANUP; io_xattr_cleanup(req); - io_req_complete(req, ret); + io_req_set_res(req, ret, 0); } static int __io_getxattr_prep(struct io_kiocb *req, @@ -4015,7 +4001,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) &ix->ctx); io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4043,7 +4029,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -4129,8 +4115,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = __io_setxattr(req, issue_flags, &req->file->f_path); io_xattr_finish(req, ret); - - return 0; + return IOU_OK; } static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4155,7 +4140,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int io_unlinkat_prep(struct io_kiocb *req, @@ -4198,8 +4183,8 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_unlinkat_cleanup(struct io_kiocb *req) @@ -4243,8 +4228,8 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_mkdirat_cleanup(struct io_kiocb *req) @@ -4294,8 +4279,8 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_linkat_prep(struct io_kiocb *req, @@ -4341,8 +4326,8 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_link_cleanup(struct io_kiocb *req) @@ -4393,7 +4378,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 
0); - io_req_complete(req, ret); + __io_req_complete(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); @@ -4450,9 +4435,12 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - if (ret != -EIOCBQUEUED) + if (ret != -EIOCBQUEUED) { io_uring_cmd_done(ioucmd, ret, 0); - return 0; + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_splice_prep(struct io_kiocb *req, @@ -4505,8 +4493,8 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4550,8 +4538,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4564,8 +4552,8 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - __io_req_complete(req, issue_flags, 0, 0); - return 0; + io_req_set_res(req, 0, 0); + return IOU_OK; } static int io_msg_ring_prep(struct io_kiocb *req, @@ -4609,11 +4597,11 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); /* put file to avoid an attempt to IOPOLL the req */ io_put_file(req->file); req->file = NULL; - return 0; + return IOU_OK; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4644,8 +4632,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_fallocate_prep(struct io_kiocb *req, @@ -4673,8 +4661,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4855,8 +4843,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -4951,9 +4939,10 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_provide_buffers_prep(struct io_kiocb *req, @@ -5117,9 +5106,10 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -5162,8 +5152,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5196,8 +5186,8 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5235,8 +5225,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5279,8 +5269,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_statx_cleanup(struct io_kiocb *req) @@ -5350,8 +5340,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) err: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5377,8 +5367,8 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - 
io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #if defined(CONFIG_NET) @@ -5409,8 +5399,8 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static bool io_net_retry(struct socket *sock, int flags) @@ -5548,8 +5538,8 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_send(struct io_kiocb *req, unsigned int issue_flags) @@ -5605,8 +5595,8 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_recvmsg_copy_hdr(struct io_kiocb *req, @@ -5805,8 +5795,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -5881,8 +5871,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5948,7 +5938,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) */ if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) - ret = 0; + ret = IOU_ISSUE_SKIP_COMPLETE; return ret; } if (ret == -ERESTARTSYS) @@ -5963,8 +5953,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) } if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } if (ret >= 0) { bool filled; @@ -6034,8 +6024,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) ret = io_fixed_fd_install(req, issue_flags, file, sock->file_slot); } - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_connect_prep_async(struct io_kiocb *req) @@ -6096,8 +6086,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #else /* !CONFIG_NET */ #define IO_NETOP_FN(op) \ @@ -6328,7 +6318,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->cqe.res, 0); + req->cqe.flags = 0; + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -6357,7 +6348,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t __maybe_unused events) { - req->cqe.res = mask; + io_req_set_res(req, 
mask, 0); /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -6810,12 +6801,16 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (!ret && ipt.error) + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { req_set_fail(req); - ret = ret ?: ipt.error; - if (ret) - __io_req_complete(req, issue_flags, ret, 0); - return 0; + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) @@ -6850,20 +6845,22 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ - if (!ret2) + if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); - preq->cqe.res = -ECANCELED; + io_req_set_res(preq, -ECANCELED, 0); locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: - if (ret < 0) + if (ret < 0) { req_set_fail(req); + return ret; + } /* complete update request, we're done with it */ - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -6884,7 +6881,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->cqe.res = -ETIME; + io_req_set_res(req, -ETIME, 0); req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; @@ -7069,8 +7066,8 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_timeout_prep(struct io_kiocb *req, @@ -7191,7 +7188,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static bool io_cancel_cb(struct io_wq_work *work, void *data) @@ -7359,8 +7356,8 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_files_update_prep(struct io_kiocb *req, @@ -7445,8 +7442,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_req_prep_async(struct io_kiocb *req) @@ -7590,8 +7587,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret) + + if (ret == IOU_OK) + __io_req_complete(req, issue_flags); + else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; + /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) io_iopoll_req_issued(req, issue_flags); @@ -7668,7 +7669,7 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret) + if (ret < 0) 
io_req_task_queue_fail(req, ret); } @@ -7745,10 +7746,12 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req, &cd); } - io_req_complete_post(req, ret ?: -ETIME, 0); + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); io_put_req(prev); } else { - io_req_complete_post(req, -ETIME, 0); + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 522e652197572e..73943dbe884e5a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,11 @@ #include <linux/errno.h> #include "io_uring_types.h" +enum { + IOU_OK = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, +}; + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; From 5e2a18d93fec514fc74f58a6061b74a79764af69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:46:43 -0600 Subject: [PATCH 027/172] io_uring: move xattr related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 248 +----------------------------------------- io_uring/xattr.c | 259 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/xattr.h | 15 +++ 4 files changed, 277 insertions(+), 247 deletions(-) create mode 100644 io_uring/xattr.c create mode 100644 io_uring/xattr.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 3680425df9478b..479b6957b85ff3 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7b8f1c9b7b48ea..1af97e4a78b7f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> -#include <linux/xattr.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -93,6 +92,8 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "xattr.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -529,12 +530,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_xattr { - struct file *file; - struct xattr_ctx ctx; - struct filename *filename; -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; @@ -3904,245 +3899,6 @@ static void io_renameat_cleanup(struct io_kiocb *req) putname(ren->newpath); } -static inline void io_xattr_cleanup(struct io_kiocb *req) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - - if (ix->filename) - putname(ix->filename); - - kfree(ix->ctx.kname); - kvfree(ix->ctx.kvalue); -} - -static void io_xattr_finish(struct io_kiocb *req, int ret) -{ - req->flags &= ~REQ_F_NEED_CLEANUP; - - io_xattr_cleanup(req); - io_req_set_res(req, ret, 0); -} - -static int __io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - ix->ctx.kvalue = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - if (ix->ctx.flags) - return -EINVAL; - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret
== sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_fgetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_getxattr_prep(req, sqe); -} - -static int io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_getxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_user_ns(path.mnt), - path.dentry, - &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int __io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.kvalue = NULL; - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = setxattr_copy(name, &ix->ctx); - if (ret) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_setxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fsetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_setxattr_prep(req, sqe); -} - -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - struct path *path) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - -static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = __io_setxattr(req, issue_flags, 
&req->file->f_path); - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/xattr.c b/io_uring/xattr.c new file mode 100644 index 00000000000000..79adf4efba0184 --- /dev/null +++ b/io_uring/xattr.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/xattr.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "xattr.h" + +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + +void io_xattr_cleanup(struct io_kiocb *req) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_xattr_cleanup(req); + io_req_set_res(req, ret, 0); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags =
LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} diff --git a/io_uring/xattr.h b/io_uring/xattr.h new file mode 100644 index 00000000000000..9b459d2ae90c5a --- /dev/null +++ b/io_uring/xattr.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +void io_xattr_cleanup(struct io_kiocb *req); + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fgetxattr(struct 
io_kiocb *req, unsigned int issue_flags); + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags); From e28683bdfc2f2cb0dab042f9079cda89dbf589d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:56:42 -0600 Subject: [PATCH 028/172] io_uring: move nop into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 15 +-------------- io_uring/nop.c | 26 ++++++++++++++++++++++++++ io_uring/nop.h | 4 ++++ 4 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 io_uring/nop.c create mode 100644 io_uring/nop.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 479b6957b85ff3..32c02a38f83b42 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1af97e4a78b7f0..3e74925bd4b454 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "io_uring.h" #include "xattr.h" +#include "nop.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -4298,20 +4299,6 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return 0; -} - -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ -static int io_nop(struct io_kiocb *req, unsigned int issue_flags) -{ - io_req_set_res(req, 0, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/nop.c b/io_uring/nop.c new file mode 100644 index 00000000000000..d363d8ce70a3b7 --- /dev/null +++ b/io_uring/nop.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "nop.h" + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return 0; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +int io_nop(struct io_kiocb *req, unsigned int issue_flags) +{ + io_req_set_res(req, 0, 0); + return IOU_OK; +} diff --git a/io_uring/nop.h b/io_uring/nop.h new file mode 100644 index 00000000000000..97f1535c9dec42 --- /dev/null +++ b/io_uring/nop.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_nop(struct io_kiocb *req, unsigned int issue_flags); From 11aeb71406ddd0ef526ad1df48b54aae628aad3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:13:00 -0600 Subject: [PATCH 029/172] io_uring: split out filesystem related operations This splits out renameat, unlinkat, mkdirat, symlinkat, and linkat.
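For context, the handlers moved here back IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, and IORING_OP_LINKAT, which userspace normally drives through liburing. A minimal sketch of exercising the renameat opcode follows (editorial illustration, not part of this patch; assumes liburing is installed, and the file names rename_demo.c, old.txt, and new.txt are placeholders):

    /* build: cc rename_demo.c -o rename_demo -luring */
    #include <liburing.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        /* queue an async rename of old.txt to new.txt, relative to cwd */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_renameat(sqe, AT_FDCWD, "old.txt",
                               AT_FDCWD, "new.txt", 0);
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
            /* cqe->res is 0 on success, -errno on failure */
            printf("renameat: %d\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
    }

On the kernel side the moved code keeps the prep/issue split used throughout this series: io_renameat_prep() decodes the SQE and takes the path references, io_renameat() performs the blocking do_renameat2() call, and io_renameat_cleanup() drops the references afterwards.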
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/fs.c | 294 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/fs.h | 20 +++ io_uring/io_uring.c | 283 +----------------------------------------- 4 files changed, 316 insertions(+), 283 deletions(-) create mode 100644 io_uring/fs.c create mode 100644 io_uring/fs.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 32c02a38f83b42..50e68c9a4d1bb6 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/fs.c b/io_uring/fs.c new file mode 100644 index 00000000000000..aac1bc5255b07f --- /dev/null +++ b/io_uring/fs.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "fs.h" + +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + +struct io_link { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_renameat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ +
struct io_unlink *un = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); +} + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + const char __user *oldpath, *newpath; + + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + sl->new_dfd = READ_ONCE(sqe->fd); + oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + + sl->oldpath = getname(oldpath); + if (IS_ERR(sl->oldpath)) + return PTR_ERR(sl->oldpath); + + sl->newpath = getname(newpath); + if (IS_ERR(sl->newpath)) { + putname(sl->oldpath); + return PTR_ERR(sl->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + lnk->old_dfd = READ_ONCE(sqe->fd); + lnk->new_dfd = READ_ONCE(sqe->len); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + lnk->flags = READ_ONCE(sqe->hardlink_flags); + + lnk->oldpath = getname(oldf); + if (IS_ERR(lnk->oldpath)) + return PTR_ERR(lnk->oldpath); + + lnk->newpath = getname(newf); + if (IS_ERR(lnk->newpath)) { + putname(lnk->oldpath); + return PTR_ERR(lnk->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_linkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + 
return -EAGAIN; + + ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, + lnk->newpath, lnk->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} diff --git a/io_uring/fs.h b/io_uring/fs.h new file mode 100644 index 00000000000000..0bb5efe3d6bbdf --- /dev/null +++ b/io_uring/fs.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_renameat(struct io_kiocb *req, unsigned int issue_flags); +void io_renameat_cleanup(struct io_kiocb *req); + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags); +void io_unlinkat_cleanup(struct io_kiocb *req); + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags); +void io_mkdirat_cleanup(struct io_kiocb *req); + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags); + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_linkat(struct io_kiocb *req, unsigned int issue_flags); +void io_link_cleanup(struct io_kiocb *req); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3e74925bd4b454..5b1e67ff0faadf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -94,6 +94,7 @@ #include "xattr.h" #include "nop.h" +#include "fs.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -467,38 +468,6 @@ struct io_shutdown { int how; }; -struct io_rename { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_unlink { - struct file *file; - int dfd; - int flags; - struct filename *filename; -}; - -struct io_mkdir { - struct file *file; - int dfd; - umode_t mode; - struct filename *filename; -}; - -struct io_link { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - struct io_msg { struct file *file; u64 user_data; @@ -3845,256 +3814,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_renameat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ren->new_dfd = READ_ONCE(sqe->len); - ren->flags = READ_ONCE(sqe->rename_flags); - - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); - - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); - - req->flags &= 
~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_renameat_cleanup(struct io_kiocb *req) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); -} - -static int io_unlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - un->dfd = READ_ONCE(sqe->fd); - - un->flags = READ_ONCE(sqe->unlink_flags); - if (un->flags & ~AT_REMOVEDIR) - return -EINVAL; - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); - else - ret = do_unlinkat(un->dfd, un->filename); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_unlinkat_cleanup(struct io_kiocb *req) -{ - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); -} - -static int io_mkdirat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - mkd->dfd = READ_ONCE(sqe->fd); - mkd->mode = READ_ONCE(sqe->len); - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_mkdirat_cleanup(struct io_kiocb *req) -{ - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); -} - -static int io_symlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - const char __user *oldpath, *newpath; - - if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); - - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); - - req->flags &= ~REQ_F_NEED_CLEANUP; - 
io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_linkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - lnk->old_dfd = READ_ONCE(sqe->fd); - lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - lnk->flags = READ_ONCE(sqe->hardlink_flags); - - lnk->oldpath = getname(oldf); - if (IS_ERR(lnk->oldpath)) - return PTR_ERR(lnk->oldpath); - - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_link_cleanup(struct io_kiocb *req) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); -} - static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); From 531113bbd5bfc93e8de45440752af11c751e4aaf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:19:47 -0600 Subject: [PATCH 030/172] io_uring: split out splice related operations This splits out splice and tee support. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 133 ++------------------------------------------ io_uring/io_uring.h | 19 +++++++ io_uring/splice.c | 123 ++++++++++++++++++++++++++++++++++++++++ io_uring/splice.h | 7 +++ 5 files changed, 154 insertions(+), 130 deletions(-) create mode 100644 io_uring/splice.c create mode 100644 io_uring/splice.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 50e68c9a4d1bb6..c6aca2af6f4ab9 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5b1e67ff0faadf..43d4044d3bb958 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "xattr.h" #include "nop.h" #include "fs.h" +#include "splice.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -436,15 +437,6 @@ struct io_epoll { struct epoll_event event; }; -struct io_splice { - struct file *file_out; - loff_t off_out; - loff_t off_in; - u64 len; - int splice_fd_in; - unsigned int flags; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -596,9 +588,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); -static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1078,15 +1067,6 @@ static 
inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void req_set_fail(struct io_kiocb *req) -{ - req->flags |= REQ_F_FAIL; - if (req->flags & REQ_F_CQE_SKIP) { - req->flags &= ~REQ_F_CQE_SKIP; - req->flags |= REQ_F_SKIP_LINK_CQES; - } -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1941,12 +1921,6 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return container_of(node, struct io_kiocb, comp_list); } -static inline void io_put_file(struct file *file) -{ - if (file) - fput(file); -} - static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; @@ -3919,105 +3893,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_splice_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - - sp->len = READ_ONCE(sqe->len); - sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) - return -EINVAL; - sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); - return 0; -} - -static int io_tee_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) - return -EINVAL; - return __io_splice_prep(req, sqe); -} - -static int io_tee(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - if (sp->len) - ret = do_tee(in, out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); - return __io_splice_prep(req, sqe); -} - -static int io_splice(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - loff_t *poff_in, *poff_out; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; - poff_out = (sp->off_out == -1) ? 
NULL : &sp->off_out; - - if (sp->len) - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -7157,8 +7032,8 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned int issue_flags) +inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; @@ -7181,7 +7056,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, return file; } -static struct file *io_file_get_normal(struct io_kiocb *req, int fd) +struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 73943dbe884e5a..02c00122b97a31 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -9,10 +9,29 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +static inline void req_set_fail(struct io_kiocb *req) +{ + req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } +} + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; req->cqe.flags = cflags; } +static inline void io_put_file(struct file *file) +{ + if (file) + fput(file); +} + +struct file *io_file_get_normal(struct io_kiocb *req, int fd); +struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); + #endif diff --git a/io_uring/splice.c b/io_uring/splice.c new file mode 100644 index 00000000000000..0e19d63303452a --- /dev/null +++ b/io_uring/splice.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/splice.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "splice.h" + +struct io_splice { + struct file *file_out; + loff_t off_out; + loff_t off_in; + u64 len; + int splice_fd_in; + unsigned int flags; +}; + +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + return 0; +} + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +int io_tee(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+ io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + +int io_splice(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/splice.h b/io_uring/splice.h new file mode 100644 index 00000000000000..542f94168ad3a1 --- /dev/null +++ b/io_uring/splice.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_tee(struct io_kiocb *req, unsigned int issue_flags); + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_splice(struct io_kiocb *req, unsigned int issue_flags); From 0d58472740370108f8ece9076d79f736f53b5b77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:25:19 -0600 Subject: [PATCH 031/172] io_uring: split out fs related sync/fallocate functions This splits out sync_file_range, fsync, and fallocate. 
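These map to IORING_OP_SYNC_FILE_RANGE, IORING_OP_FSYNC, and IORING_OP_FALLOCATE. All three always need a blocking context, so the issue handlers return -EAGAIN when invoked with IO_URING_F_NONBLOCK and the request is retried from io-wq. A minimal userspace sketch of driving one of them through the ring follows (editorial illustration, not part of this patch; assumes liburing is installed, and fsync_demo.c and data.log are placeholder names):

    /* build: cc fsync_demo.c -o fsync_demo -luring */
    #include <liburing.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int fd;

        fd = open("data.log", O_WRONLY | O_CREAT, 0644);
        if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
            return 1;

        /* queue the equivalent of fdatasync(fd) on the ring */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
            /* cqe->res is 0 on success, -errno on failure */
            printf("fsync: %d\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        close(fd);
        return 0;
    }
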
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 97 +------------------------------------- io_uring/sync.c | 111 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/sync.h | 10 ++++ 4 files changed, 124 insertions(+), 97 deletions(-) create mode 100644 io_uring/sync.c create mode 100644 io_uring/sync.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c6aca2af6f4ab9..7285c6aef8da0f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,6 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ + sync.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43d4044d3bb958..7bcf8da7dd4063 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -96,6 +96,7 @@ #include "nop.h" #include "fs.h" #include "splice.h" +#include "sync.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -336,14 +337,6 @@ struct io_socket { unsigned long nofile; }; -struct io_sync { - struct file *file; - loff_t len; - loff_t off; - int flags; - int mode; -}; - struct io_cancel { struct file *file; u64 addr; @@ -3941,67 +3934,6 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - loff_t end = sync->off + sync->len; - int ret; - - /* fsync always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, - sync->flags & IORING_FSYNC_DATASYNC); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_fallocate_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->addr); - sync->mode = READ_ONCE(sqe->len); - return 0; -} - -static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* fallocate always requiring blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); - if (ret >= 0) - fsnotify_modify(req->file); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req); @@ -4681,33 +4613,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - sync->flags = READ_ONCE(sqe->sync_range_flags); - return 0; -} - -static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/sync.c b/io_uring/sync.c new file mode 100644 index 00000000000000..9ee8ff865521f3 --- /dev/null +++ b/io_uring/sync.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> +#include <linux/fsnotify.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "sync.h" + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; + int mode; +}; + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); + return 0; +} + +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* sync_file_range always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + return
0; +} + +int io_fsync(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = sync->off + sync->len; + int ret; + + /* fsync always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); + return 0; +} + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* fallocate always requiring blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); + if (ret >= 0) + fsnotify_modify(req->file); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/sync.h b/io_uring/sync.h new file mode 100644 index 00000000000000..e873c888da79b1 --- /dev/null +++ b/io_uring/sync.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags); + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsync(struct io_kiocb *req, unsigned int issue_flags); + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags); +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From f4c163dd7d4b1031772317cd3cd58dd6711ee51e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:28:33 -0600 Subject: [PATCH 032/172] io_uring: split out fadvise/madvise operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/advise.c | 100 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/advise.h | 7 ++++ io_uring/io_uring.c | 85 +------------------------------------ 4 files changed, 109 insertions(+), 85 deletions(-) create mode 100644 io_uring/advise.c create mode 100644 io_uring/advise.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 7285c6aef8da0f..4492aa24397efe 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o + sync.o advise.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/advise.c b/io_uring/advise.c new file mode 100644 index 00000000000000..8870fdf66ffbbd --- /dev/null +++ b/io_uring/advise.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/fadvise.h> +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "advise.h" + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + return -EINVAL; + + ma->addr = READ_ONCE(sqe->addr); + ma->len = READ_ONCE(sqe->len); +
ma->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int io_madvise(struct io_kiocb *req, unsigned int issue_flags) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); + io_req_set_res(req, ret, 0); + return IOU_OK; +#else + return -EOPNOTSUPP; +#endif +} + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + return -EINVAL; + + fa->offset = READ_ONCE(sqe->off); + fa->len = READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) { + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + break; + default: + return -EAGAIN; + } + } + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/advise.h b/io_uring/advise.h new file mode 100644 index 00000000000000..5ece2a045185ff --- /dev/null +++ b/io_uring/advise.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_madvise(struct io_kiocb *req, unsigned int issue_flags); + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bcf8da7dd4063..c2041fb10aa2c4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "fs.h" #include "splice.h" #include "sync.h" +#include "advise.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -408,20 +409,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_fadvise { - struct file *file; - u64 offset; - u32 len; - u32 advice; -}; - -struct io_madvise { - struct file *file; - u64 addr; - u32 len; - u32 advice; -}; - struct io_epoll { struct file *file; int epfd; @@ -4428,76 +4415,6 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) #endif } -static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - - ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); - ma->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_set_res(req, ret, 0); - return IOU_OK; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->addr || 
sqe->splice_fd_in) - return -EINVAL; - - fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); - fa->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -} - -static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } - - ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From 453b329be5eacfc48dd43035af82bc7f28ecfedf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:43:10 -0600 Subject: [PATCH 033/172] io_uring: separate out file table handling code Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/filetable.c | 57 ++++++++++++++++++++++++++ io_uring/filetable.h | 58 ++++++++++++++++++++++++++ io_uring/io_uring.c | 86 --------------------------------------- io_uring/io_uring_types.h | 7 +--- 5 files changed, 117 insertions(+), 93 deletions(-) create mode 100644 io_uring/filetable.c create mode 100644 io_uring/filetable.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 4492aa24397efe..5efc4fe565a172 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o + sync.o advise.o filetable.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/filetable.c b/io_uring/filetable.c new file mode 100644 index 00000000000000..560629a93c04b0 --- /dev/null +++ b/io_uring/filetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" + +int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) + return ret; + + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +{ + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; +} + +void io_free_file_tables(struct io_file_table *table) +{ + kvfree(table->files); + bitmap_free(table->bitmap); + table->files = NULL; + table->bitmap = NULL; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h new file mode 100644 index 00000000000000..fe1ec581958dda --- /dev/null +++ b/io_uring/filetable.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_FILE_TABLE_H +#define IOU_FILE_TABLE_H + +struct io_ring_ctx; + +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done.
If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); +void io_free_file_tables(struct io_file_table *table); +int io_file_bitmap_get(struct io_ring_ctx *ctx); + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + table->alloc_hint = bit + 1; +} + +static inline struct io_fixed_file * +io_fixed_file_slot(struct io_file_table *table, unsigned i) +{ + return &table->files[i]; +} + +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2041fb10aa2c4..4b4d6fd509d13f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -146,28 +146,6 @@ struct io_overflow_cqe { struct io_uring_cqe cqe; }; -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -3983,27 +3961,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int io_file_bitmap_get(struct io_ring_ctx *ctx) -{ - struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; - int ret; - - do { - ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) - return ret; - - if (!table->alloc_hint) - break; - - nr = table->alloc_hint; - table->alloc_hint = 0; - } while (1); - - return -ENFILE; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
 */
@@ -6832,12 +6789,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) -{ - return &table->files[i]; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -7934,43 +7885,6 @@ static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_pu return ret; } -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) - return false; - - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; -} - -static void io_free_file_tables(struct io_file_table *table) -{ - kvfree(table->files); - bitmap_free(table->bitmap); - table->files = NULL; - table->bitmap = NULL; -} - -static inline void io_file_bitmap_set(struct io_file_table *table, int bit) -{ - WARN_ON_ONCE(test_bit(bit, table->bitmap)); - __set_bit(bit, table->bitmap); - table->alloc_hint = bit + 1; -} - -static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) -{ - __clear_bit(bit, table->bitmap); - table->alloc_hint = bit; -} - static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { #if !defined(IO_URING_SCM_ALL) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1a0f592ff6fc35..dba72113c59d67 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -5,6 +5,7 @@ #include #include "io-wq.h" +#include "filetable.h" struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -122,12 +123,6 @@ struct io_ev_fd { struct rcu_head rcu; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_ring_ctx { /* const or read-mostly hot data */ struct { From cd40cae29ef815de6f7e72207b677c78f43f4688 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:54:43 -0600 Subject: [PATCH 034/172] io_uring: split out open/close operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 311 ++----------------------------------------- io_uring/io_uring.h | 32 +++++ io_uring/openclose.c | 283 +++++++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 14 ++ 5 files changed, 345 insertions(+), 298 deletions(-) create mode 100644 io_uring/openclose.c create mode 100644 io_uring/openclose.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 5efc4fe565a172..e60def39ca2c6a 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,6 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o + sync.o advise.o filetable.o \ + openclose.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4b4d6fd509d13f..a79186ba8c44ec 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -98,6 +98,7 @@ #include "splice.h" #include "sync.h" #include "advise.h" +#include "openclose.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -283,12 +284,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_close { - struct file *file; - int fd; - u32 file_slot; -}; - struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -371,15 +366,6 @@ struct io_sr_msg { unsigned int flags; }; -struct io_open { - 
struct file *file; - int dfd; - u32 file_slot; - struct filename *filename; - struct open_how how; - unsigned long nofile; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -555,9 +541,6 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset); -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -670,10 +653,15 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { struct io_ring_ctx *ctx = file->private_data; return ctx->ring_sock->sk; @@ -699,26 +687,6 @@ static inline bool io_file_need_scm(struct file *filp) } #endif -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - lockdep_assert_held(&ctx->uring_lock); -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -3899,74 +3867,12 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - const char __user *fname; - int ret; - - if (unlikely(sqe->buf_index)) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - /* open.how should be already initialised */ - if (!(open->how.flags & O_PATH) && force_o_largefile()) - open->how.flags |= O_LARGEFILE; - - open->dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - open->filename = getname(fname); - if (IS_ERR(open->filename)) { - ret = PTR_ERR(open->filename); - open->filename = NULL; - return ret; - } - - open->file_slot = READ_ONCE(sqe->file_index); - if (open->file_slot && (open->how.flags & O_CLOEXEC)) - return -EINVAL; - - open->nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - u64 mode = READ_ONCE(sqe->len); - u64 flags = READ_ONCE(sqe->open_flags); - - open->how = build_open_how(flags, mode); - return __io_openat_prep(req, sqe); -} - -static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_how __user *how; - size_t len; - int ret; - - how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) - return -EINVAL; - - ret = 
copy_struct_from_user(&open->how, sizeof(open->how), how, len); - if (ret) - return ret; - - return __io_openat_prep(req, sqe); -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. */ -static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; struct io_ring_ctx *ctx = req->ctx; @@ -3993,86 +3899,6 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_flags op; - struct file *file; - bool resolve_nonblock, nonblock_set; - bool fixed = !!open->file_slot; - int ret; - - ret = build_open_flags(&open->how, &op); - if (ret) - goto err; - nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = open->how.resolve & RESOLVE_CACHED; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; - op.lookup_flags |= LOOKUP_CACHED; - op.open_flag |= O_NONBLOCK; - } - - if (!fixed) { - ret = __get_unused_fd_flags(open->how.flags, open->nofile); - if (ret < 0) - goto err; - } - - file = do_filp_open(open->dfd, open->filename, &op); - if (IS_ERR(file)) { - /* - * We could hang on to this 'fd' on retrying, but seems like - * marginal gain for something that is now known to be a slower - * path. So just put it, and we'll get a new one when we retry. 
- */ - if (!fixed) - put_unused_fd(ret); - - ret = PTR_ERR(file); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; - goto err; - } - - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - - if (!fixed) - fd_install(ret, file); - else - ret = io_fixed_fd_install(req, issue_flags, file, - open->file_slot); -err: - putname(open->filename); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_openat(struct io_kiocb *req, unsigned int issue_flags) -{ - return io_openat2(req, issue_flags); -} - -static void io_open_cleanup(struct io_kiocb *req) -{ - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); -} - static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4424,69 +4250,6 @@ static void io_statx_cleanup(struct io_kiocb *req) putname(sx->filename); } -static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - close->fd = READ_ONCE(sqe->fd); - close->file_slot = READ_ONCE(sqe->file_index); - if (close->file_slot && close->fd) - return -EINVAL; - - return 0; -} - -static int io_close(struct io_kiocb *req, unsigned int issue_flags) -{ - struct files_struct *files = current->files; - struct io_close *close = io_kiocb_to_cmd(req); - struct fdtable *fdt; - struct file *file; - int ret = -EBADF; - - if (close->file_slot) { - ret = io_close_fixed(req, issue_flags); - goto err; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); - if (!file || file->f_op == &io_uring_fops) { - spin_unlock(&files->file_lock); - goto err; - } - - /* if the file has a flush method, be safe and punt to async */ - if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { - spin_unlock(&files->file_lock); - return -EAGAIN; - } - - file = __close_fd_get_file(close->fd); - spin_unlock(&files->file_lock); - if (!file) - goto err; - - /* No ->flush() or already async, safely close from here */ - ret = filp_close(file, current->files); -err: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -7744,8 +7507,8 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) return ref_node; } -static void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); @@ -7772,7 +7535,7 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, } } -static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; @@ -8319,8 +8082,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_queue_rsrc_removal(struct 
io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; @@ -8386,52 +8149,6 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: - io_ring_submit_unlock(ctx, issue_flags); - return ret; -} - -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - return __io_close_fixed(req, issue_flags, close->file_slot - 1); -} - static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 02c00122b97a31..ebb225e8501251 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -2,6 +2,7 @@ #define IOU_CORE_H #include <linux/errno.h> +#include <linux/lockdep.h> #include "io_uring_types.h" enum { @@ -30,8 +31,39 @@ static inline void io_put_file(struct file *file) fput(file); } +static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case.
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); +bool io_is_uring_fops(struct file *file); #endif diff --git a/io_uring/openclose.c b/io_uring/openclose.c new file mode 100644 index 00000000000000..fa35bd56a33086 --- /dev/null +++ b/io_uring/openclose.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/fsnotify.h> +#include <linux/namei.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" + +struct io_open { + struct file *file; + int dfd; + u32 file_slot; + struct filename *filename; + struct open_how how; + unsigned long nofile; +}; + +struct io_close { + struct file *file; + int fd; + u32 file_slot; +}; + +static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + const char __user *fname; + int ret; + + if (unlikely(sqe->buf_index)) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + /* open.how should be already initialised */ + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; + + open->dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; + return ret; + } + + open->file_slot = READ_ONCE(sqe->file_index); + if (open->file_slot && (open->how.flags & O_CLOEXEC)) + return -EINVAL; + + open->nofile = rlimit(RLIMIT_NOFILE); + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); + + open->how = build_open_how(flags, mode); + return __io_openat_prep(req, sqe); +} + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_how __user *how; + size_t len; + int ret; + + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); + if (ret) + return ret; + + return __io_openat_prep(req, sqe); +} + +int io_openat2(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_flags op; + struct file *file; + bool resolve_nonblock, nonblock_set; + bool fixed = !!open->file_slot; + int ret; + + ret = build_open_flags(&open->how, &op); + if (ret) + goto err; + nonblock_set = op.open_flag & O_NONBLOCK; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; + if (issue_flags & IO_URING_F_NONBLOCK) { + /* + * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN + */ + if (open->how.flags & (O_TRUNC | O_CREAT |
O_TMPFILE)) + return -EAGAIN; + op.lookup_flags |= LOOKUP_CACHED; + op.open_flag |= O_NONBLOCK; + } + + if (!fixed) { + ret = __get_unused_fd_flags(open->how.flags, open->nofile); + if (ret < 0) + goto err; + } + + file = do_filp_open(open->dfd, open->filename, &op); + if (IS_ERR(file)) { + /* + * We could hang on to this 'fd' on retrying, but seems like + * marginal gain for something that is now known to be a slower + * path. So just put it, and we'll get a new one when we retry. + */ + if (!fixed) + put_unused_fd(ret); + + ret = PTR_ERR(file); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if (ret == -EAGAIN && + (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) + return -EAGAIN; + goto err; + } + + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; + fsnotify_open(file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_fixed_fd_install(req, issue_flags, file, + open->file_slot); +err: + putname(open->filename); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_openat(struct io_kiocb *req, unsigned int issue_flags) +{ + return io_openat2(req, issue_flags); +} + +void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = -ENXIO; + if (unlikely(!ctx->file_data)) + goto out; + ret = -EINVAL; + if (offset >= ctx->nr_user_files) + goto out; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto out; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + ret = -EBADF; + if (!file_slot->file_ptr) + goto out; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + goto out; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); + io_rsrc_node_switch(ctx, ctx->file_data); + ret = 0; +out: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); +} + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) + return -EBADF; + + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) + return -EINVAL; + + return 0; +} + +int io_close(struct io_kiocb *req, unsigned int issue_flags) +{ + struct files_struct *files = current->files; + struct io_close *close = io_kiocb_to_cmd(req); + struct fdtable *fdt; + struct file *file; + int ret = -EBADF; + + if (close->file_slot) { + ret = io_close_fixed(req, issue_flags); + goto err; + } + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (close->fd >= fdt->max_fds) { + spin_unlock(&files->file_lock); + goto err; + } + file = rcu_dereference_protected(fdt->fd[close->fd], + 
lockdep_is_held(&files->file_lock)); + if (!file || io_is_uring_fops(file)) { + spin_unlock(&files->file_lock); + goto err; + } + + /* if the file has a flush method, be safe and punt to async */ + if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { + spin_unlock(&files->file_lock); + return -EAGAIN; + } + + file = __close_fd_get_file(close->fd); + spin_unlock(&files->file_lock); + if (!file) + goto err; + + /* No ->flush() or already async, safely close from here */ + ret = filp_close(file, current->files); +err: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h new file mode 100644 index 00000000000000..9f578f3fad870d --- /dev/null +++ b/io_uring/openclose.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat(struct io_kiocb *req, unsigned int issue_flags); +void io_open_cleanup(struct io_kiocb *req); + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat2(struct io_kiocb *req, unsigned int issue_flags); + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_close(struct io_kiocb *req, unsigned int issue_flags); From 99f15d8d61364299ae780cc739c74068a6d2538d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 05:59:19 -0600 Subject: [PATCH 035/172] io_uring: move uring_cmd handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 127 ++----------------------------------------- io_uring/io_uring.h | 9 +++ io_uring/uring_cmd.c | 115 +++++++++++++++++++++++++++++++++++++++ io_uring/uring_cmd.h | 13 +++++ 5 files changed, 142 insertions(+), 124 deletions(-) create mode 100644 io_uring/uring_cmd.c create mode 100644 io_uring/uring_cmd.h diff --git a/io_uring/Makefile b/io_uring/Makefile index e60def39ca2c6a..2e2cbeb272a890 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o + openclose.o uring_cmd.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a79186ba8c44ec..469d89f3cf0cd1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -99,6 +99,7 @@ #include "sync.h" #include "advise.h" #include "openclose.h" +#include "uring_cmd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -473,14 +474,6 @@ struct io_cancel_data { int seq; }; -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. 
- */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -988,11 +981,6 @@ static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, return matched; } -static inline bool req_has_async_data(struct io_kiocb *req) -{ - return req->flags & REQ_F_ASYNC_DATA; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1743,7 +1731,7 @@ static void io_req_complete_post(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; @@ -2151,7 +2139,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -static void io_req_task_work_add(struct io_kiocb *req) +void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -3268,7 +3256,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } -static inline bool io_alloc_async_data(struct io_kiocb *req) +bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); @@ -3714,111 +3702,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - ioucmd->task_work_cb(ioucmd); -} - -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) -{ - struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); - -static inline void io_req_set_cqe32_extra(struct io_kiocb *req, - u64 extra1, u64 extra2) -{ - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; -} - -/* - * Called by consumers of io_uring_cmd, if they originally returned - * -EIOCBQUEUED upon receiving the command. 
- */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) -{ - struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - - if (ret < 0) - req_set_fail(req); - - io_req_set_res(req, 0, ret); - if (req->ctx->flags & IORING_SETUP_CQE32) - io_req_set_cqe32_extra(req, res2, 0); - __io_req_complete(req, 0); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_async(struct io_kiocb *req) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - size_t cmd_size; - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, ioucmd->cmd, cmd_size); - return 0; -} - -static int io_uring_cmd_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - if (sqe->rw_flags || sqe->__pad1) - return -EINVAL; - ioucmd->cmd = sqe->cmd; - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; -} - -static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (!req->file->f_op->uring_cmd) - return -EOPNOTSUPP; - - if (ctx->flags & IORING_SETUP_SQE128) - issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) - issue_flags |= IO_URING_F_CQE32; - if (ctx->flags & IORING_SETUP_IOPOLL) - issue_flags |= IO_URING_F_IOPOLL; - - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - - ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) { - io_uring_cmd_done(ioucmd, ret, 0); - return IOU_OK; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11533,8 +11416,6 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); BUG_ON(!io_op_defs[i].issue); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ebb225e8501251..6a07e902120ab3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -25,6 +25,11 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void io_put_file(struct file *file) { if (file) @@ -53,6 +58,8 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -65,5 +72,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); +bool io_alloc_async_data(struct io_kiocb *req); +void io_req_task_work_add(struct io_kiocb *req); #endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c new file mode 100644 index 00000000000000..abf78918a0995f --- /dev/null +++ b/io_uring/uring_cmd.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/io_uring.h> +
+#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "uring_cmd.h" + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + ioucmd->task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +static inline void io_req_set_cqe32_extra(struct io_kiocb *req, + u64 extra1, u64 extra2) +{ + req->extra1 = extra1; + req->extra2 = extra2; + req->flags |= REQ_F_CQE32_INIT; +} + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, 0, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + io_req_set_cqe32_extra(req, res2, 0); + __io_req_complete(req, 0); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, ioucmd->cmd, cmd_size); + return 0; +} + +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + if (sqe->rw_flags || sqe->__pad1) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) { + io_uring_cmd_done(ioucmd, ret, 0); + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h new file mode 100644 index 00000000000000..7c6697d13cb2e4 --- /dev/null +++ b/io_uring/uring_cmd.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_uring_cmd_prep_async(struct io_kiocb *req); + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used.
+ */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) From 4cf90495281b43f5f597ef4a9abcc83a63973571 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:04:14 -0600 Subject: [PATCH 036/172] io_uring: add a dummy -EOPNOTSUPP prep handler Add it and use it for the epoll handling, if epoll isn't configured. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 469d89f3cf0cd1..15d0377b7d9d86 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4034,10 +4034,16 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +#if defined(CONFIG_EPOLL) static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_EPOLL) struct io_epoll *epoll = io_kiocb_to_cmd(req); if (sqe->buf_index || sqe->splice_fd_in) @@ -4056,14 +4062,10 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, } return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { -#if defined(CONFIG_EPOLL) struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -4076,10 +4078,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; -#else - return -EOPNOTSUPP; -#endif } +#endif static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11246,8 +11246,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, +#if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_SPLICE] = { .needs_file = 1, @@ -11418,7 +11422,8 @@ static int __init io_uring_init(void) for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); - BUG_ON(!io_op_defs[i].issue); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); } req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | From a9c210cebe13c36487a239ae7f4671a389fed127 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:09:18 -0600 Subject: [PATCH 037/172] io_uring: move epoll handler to its own file Would be nice to sort out Kconfig for this and don't even compile epoll.c if we don't have epoll configured. 
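One possible shape for that, sketched here purely as an assumption about what the Kconfig-aware Makefile could look like rather than anything this patch does, is to make epoll.o conditional on CONFIG_EPOLL:

  # hypothetical io_uring/Makefile fragment, not part of this series:
  # only build the epoll opcode handling when epoll itself is enabled
  ifeq ($(CONFIG_EPOLL),y)
  obj-$(CONFIG_IO_URING) += epoll.o
  endif

With something like that in place, the #if defined(CONFIG_EPOLL) guards inside epoll.c could go away, and io_op_defs[] would simply fall back to the dummy -EOPNOTSUPP prep handler added in the previous patch.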
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/epoll.c | 62 +++++++++++++++++++++++++++++++++++++++ io_uring/epoll.h | 6 +++++ io_uring/io_uring.c | 50 +----------------------------------- 4 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 io_uring/epoll.c create mode 100644 io_uring/epoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e2cbeb272a890..59e70f2d8c56d4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o + openclose.o uring_cmd.o epoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/epoll.c b/io_uring/epoll.c new file mode 100644 index 00000000000000..acbb32498127ad --- /dev/null +++ b/io_uring/epoll.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/io_uring.h> +#include <linux/eventpoll.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "epoll.h" + +#if defined(CONFIG_EPOLL) +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll *epoll = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(epoll->op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +} + +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll *ie = io_kiocb_to_cmd(req); + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/epoll.h b/io_uring/epoll.h new file mode 100644 index 00000000000000..870cce11ba982d --- /dev/null +++ b/io_uring/epoll.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +#if defined(CONFIG_EPOLL) +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 15d0377b7d9d86..78828e294e53b7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -100,6 +100,7 @@ #include "advise.h" #include "openclose.h" #include "uring_cmd.h" +#include "epoll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -374,14 +375,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_epoll { - struct file *file; - int epfd; - int op; - int fd; - struct epoll_event event; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -4040,47 +4033,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_EPOLL) -static int io_epoll_ctl_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_epoll *epoll = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - - epoll->epfd = READ_ONCE(sqe->fd); - epoll->op = READ_ONCE(sqe->len); - epoll->fd = READ_ONCE(sqe->off); - - if (ep_op_has_event(epoll->op)) { - struct
epoll_event __user *ev; - - ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&epoll->event, ev, sizeof(*ev))) - return -EFAULT; - } - - return 0; -} - -static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_epoll *ie = io_kiocb_to_cmd(req); - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#endif - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From e0da14def1ee0a9cd9c88893321e9a3d900f8e23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:12:18 -0600 Subject: [PATCH 038/172] io_uring: move statx handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 62 +------------------------------------ io_uring/statx.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/statx.h | 5 +++ 4 files changed, 82 insertions(+), 62 deletions(-) create mode 100644 io_uring/statx.c create mode 100644 io_uring/statx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 59e70f2d8c56d4..de953c022c6ebd 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o + openclose.o uring_cmd.o epoll.o \ + statx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 78828e294e53b7..eb01d1aadeb486 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -101,6 +101,7 @@ #include "openclose.h" #include "uring_cmd.h" #include "epoll.h" +#include "statx.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -384,15 +385,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_statx { - struct file *file; - int dfd; - unsigned int mask; - unsigned int flags; - struct filename *filename; - struct statx __user *buffer; -}; - struct io_shutdown { struct file *file; int how; @@ -4033,58 +4025,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - const char __user *path; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - sx->dfd = READ_ONCE(sqe->fd); - sx->mask = READ_ONCE(sqe->len); - path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sx->flags = READ_ONCE(sqe->statx_flags); - - sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); - - if (IS_ERR(sx->filename)) { - int ret = PTR_ERR(sx->filename); - - sx->filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_statx(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_statx_cleanup(struct io_kiocb *req) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - - if (sx->filename) - putname(sx->filename); -} 
-
 #if defined(CONFIG_NET)
 static int io_shutdown_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
diff --git a/io_uring/statx.c b/io_uring/statx.c
new file mode 100644
index 00000000000000..83b15687e9c5ee
--- /dev/null
+++ b/io_uring/statx.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+
+#include
+
+#include "../fs/internal.h"
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "statx.h"
+
+struct io_statx {
+	struct file			*file;
+	int				dfd;
+	unsigned int			mask;
+	unsigned int			flags;
+	struct filename			*filename;
+	struct statx __user		*buffer;
+};
+
+int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+	const char __user *path;
+
+	if (sqe->buf_index || sqe->splice_fd_in)
+		return -EINVAL;
+	if (req->flags & REQ_F_FIXED_FILE)
+		return -EBADF;
+
+	sx->dfd = READ_ONCE(sqe->fd);
+	sx->mask = READ_ONCE(sqe->len);
+	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	sx->flags = READ_ONCE(sqe->statx_flags);
+
+	sx->filename = getname_flags(path,
+				     getname_statx_lookup_flags(sx->flags),
+				     NULL);
+
+	if (IS_ERR(sx->filename)) {
+		int ret = PTR_ERR(sx->filename);
+
+		sx->filename = NULL;
+		return ret;
+	}
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+int io_statx(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+	int ret;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
+void io_statx_cleanup(struct io_kiocb *req)
+{
+	struct io_statx *sx = io_kiocb_to_cmd(req);
+
+	if (sx->filename)
+		putname(sx->filename);
+}
diff --git a/io_uring/statx.h b/io_uring/statx.h
new file mode 100644
index 00000000000000..9a17f4d45a7d5f
--- /dev/null
+++ b/io_uring/statx.h
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_statx(struct io_kiocb *req, unsigned int issue_flags);
+void io_statx_cleanup(struct io_kiocb *req);

From f9ead18c10589a351f395ac5aa107360f2f6ce53 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 25 May 2022 06:25:13 -0600
Subject: [PATCH 039/172] io_uring: split network-related opcodes into their
 own file

While at it, convert the handlers to just use io_eopnotsupp_prep() if
CONFIG_NET isn't set.
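[Editor's illustration, not part of the commit: the user-visible effect of
the conversion. On a CONFIG_NET=y kernel the opcodes behave as before; on
a kernel without CONFIG_NET the stub prep completes the SQE with
-EOPNOTSUPP, the same result the old IO_NETOP_* stubs produced. A minimal
userspace check using liburing (assuming liburing >= 2.0 for
io_uring_prep_shutdown(); error handling elided):

	#include <liburing.h>
	#include <sys/socket.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		int fds[2];

		socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
		io_uring_queue_init(8, &ring, 0);

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_shutdown(sqe, fds[0], SHUT_WR);
		io_uring_submit(&ring);

		io_uring_wait_cqe(&ring, &cqe);
		/* cqe->res: 0 on success, -EOPNOTSUPP without CONFIG_NET */
		printf("shutdown res = %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
		io_uring_queue_exit(&ring);
		return 0;
	}
]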
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 873 ++------------------------------------------ io_uring/io_uring.h | 22 ++ io_uring/net.c | 779 +++++++++++++++++++++++++++++++++++++++ io_uring/net.h | 43 +++ 5 files changed, 884 insertions(+), 835 deletions(-) create mode 100644 io_uring/net.c create mode 100644 io_uring/net.h diff --git a/io_uring/Makefile b/io_uring/Makefile index de953c022c6ebd..c9ec1bbabfbd36 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o + statx.o net.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eb01d1aadeb486..cbc20985cd6f34 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -102,6 +102,7 @@ #include "uring_cmd.h" #include "epoll.h" #include "statx.h" +#include "net.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -131,8 +132,6 @@ #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_mapped_ubuf { @@ -295,25 +294,6 @@ struct io_timeout_data { u32 flags; }; -struct io_accept { - struct file *file; - struct sockaddr __user *addr; - int __user *addr_len; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_socket { - struct file *file; - int domain; - int type; - int protocol; - int flags; - u32 file_slot; - unsigned long nofile; -}; - struct io_cancel { struct file *file; u64 addr; @@ -350,25 +330,6 @@ struct io_rw { rwf_t flags; }; -struct io_connect { - struct file *file; - struct sockaddr __user *addr; - int addr_len; -}; - -struct io_sr_msg { - struct file *file; - union { - struct compat_msghdr __user *umsg_compat; - struct user_msghdr __user *umsg; - void __user *buf; - }; - int msg_flags; - size_t len; - size_t done_io; - unsigned int flags; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -385,30 +346,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_shutdown { - struct file *file; - int how; -}; - struct io_msg { struct file *file; u64 user_data; u32 len; }; -struct io_async_connect { - struct sockaddr_storage address; -}; - -struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -517,9 +460,6 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); @@ -808,8 +748,7 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { 
unsigned int cflags; @@ -1291,12 +1230,6 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1418,7 +1351,7 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * 1:1 relationship between how many times this function is called (and * hence the eventfd count) and number of CQEs posted to the CQ ring. */ -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -1639,8 +1572,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, } } -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { struct io_uring_cqe *cqe; @@ -2980,8 +2913,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return u64_to_user_ptr(buf->addr); } -static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -3073,13 +3006,6 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -4025,755 +3951,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_NET) -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - shutdown->how = READ_ONCE(sqe->len); - return 0; -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) -{ - struct io_async_msghdr *async_msg = req->async_data; - - if (async_msg) - return -EAGAIN; - if (io_alloc_async_data(req)) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - async_msg = req->async_data; - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - async_msg->msg.msg_name = &async_msg->addr; - /* if were using fast_iov, set it to the new one */ - if 
(!async_msg->free_iov) - async_msg->msg.msg_iter.iov = async_msg->fast_iov; - - return -EAGAIN; -} - -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, - &iomsg->free_iov); -} - -static int io_sendmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_sendmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} - -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if (unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct iovec iov; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags 
& MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; - int ret; - - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) - return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; - int ret; - - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); - if (ret) - return ret; - - uiov = compat_ptr(ptr); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) - return ret; - } - - return 0; -} -#endif - -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif - - return __io_recvmsg_copy_hdr(req, iomsg); -} - -static int io_recvmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_recvmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if (unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - if (sr->msg_flags & MSG_ERRQUEUE) - req->flags |= REQ_F_CLEAR_POLLIN; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = 
io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); - } - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { - req_set_fail(req); - } - - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct socket *sock; - struct iovec iov; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - } - - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - ret = sock_recvmsg(sock, &msg, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { -out_free: - req_set_fail(req); - } - - if (ret >= 0) - ret += sr->done_io; - else if 
(sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_accept *accept = io_kiocb_to_cmd(req); - unsigned flags; - - if (sqe->len || sqe->buf_index) - return -EINVAL; - - accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - accept->flags = READ_ONCE(sqe->accept_flags); - accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) - return -EINVAL; - - accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot) { - if (accept->flags & SOCK_CLOEXEC) - return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && - accept->file_slot != IORING_FILE_INDEX_ALLOC) - return -EINVAL; - } - if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) - accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) - req->flags |= REQ_F_APOLL_MULTISHOT; - return 0; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = io_kiocb_to_cmd(req); - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - bool fixed = !!accept->file_slot; - struct file *file; - int ret, fd; - -retry: - if (!fixed) { - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, - accept->flags); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if ((req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) - ret = IOU_ISSUE_SKIP_COMPLETE; - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - accept->file_slot); - } - - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - - return ret; -} - -static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_socket *sock = io_kiocb_to_cmd(req); - - if (sqe->addr || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - - sock->domain = READ_ONCE(sqe->fd); - sock->type = READ_ONCE(sqe->off); - sock->protocol = READ_ONCE(sqe->len); - sock->file_slot = READ_ONCE(sqe->file_index); - sock->nofile = rlimit(RLIMIT_NOFILE); - - sock->flags = sock->type & ~SOCK_TYPE_MASK; - if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) - return -EINVAL; - if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - return 0; -} - -static int io_socket(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_socket *sock = 
io_kiocb_to_cmd(req); - bool fixed = !!sock->file_slot; - struct file *file; - int ret, fd; - - if (!fixed) { - fd = __get_unused_fd_flags(sock->flags, sock->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = __sys_socket_file(sock->domain, sock->type, sock->protocol); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - sock->file_slot); - } - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_connect *conn = io_kiocb_to_cmd(req); - - if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - conn->addr_len = READ_ONCE(sqe->addr2); - return 0; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_connect *connect = io_kiocb_to_cmd(req); - struct io_async_connect __io, *io; - unsigned file_flags; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - - file_flags = force_nonblock ? O_NONBLOCK : 0; - - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); - if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; -out: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#else /* !CONFIG_NET */ -#define IO_NETOP_FN(op) \ -static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ -{ \ - return -EOPNOTSUPP; \ -} - -#define IO_NETOP_PREP(op) \ -IO_NETOP_FN(op) \ -static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ -{ \ - return -EOPNOTSUPP; \ -} \ - -#define IO_NETOP_PREP_ASYNC(op) \ -IO_NETOP_PREP(op) \ -static int io_##op##_prep_async(struct io_kiocb *req) \ -{ \ - return -EOPNOTSUPP; \ -} - -IO_NETOP_PREP_ASYNC(sendmsg); -IO_NETOP_PREP_ASYNC(recvmsg); -IO_NETOP_PREP_ASYNC(connect); -IO_NETOP_PREP(accept); -IO_NETOP_PREP(socket); -IO_NETOP_PREP(shutdown); -IO_NETOP_FN(send); -IO_NETOP_FN(recv); -#endif /* CONFIG_NET */ - struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; @@ -7874,8 +7051,8 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, return 0; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) __must_hold(&req->ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; @@ -10986,12 +10163,14 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, 
.ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = io_sendmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECVMSG] = { @@ -11000,12 +10179,14 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { @@ -11026,8 +10207,12 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ +#if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -11044,10 +10229,14 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -11117,8 +10306,12 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_RECV] = { .needs_file = 1, @@ -11127,8 +10320,12 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, @@ -11175,8 +10372,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, +#if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, @@ -11233,8 +10434,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SOCKET] = { .audit_skip = 1, +#if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_URING_CMD] = { .needs_file = 1, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6a07e902120ab3..4b46385720c585 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -58,13 +58,35 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + /* order cqe stores with ring update */ + smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); +} + void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags); +void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + struct file 
*io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, diff --git a/io_uring/net.c b/io_uring/net.c new file mode 100644 index 00000000000000..2434548d0c1fb3 --- /dev/null +++ b/io_uring/net.c @@ -0,0 +1,779 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "net.h" + +#if defined(CONFIG_NET) +struct io_shutdown { + struct file *file; + int how; +}; + +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + +struct io_sr_msg { + struct file *file; + union { + struct compat_msghdr __user *umsg_compat; + struct user_msghdr __user *umsg; + void __user *buf; + }; + int msg_flags; + size_t len; + size_t done_io; + unsigned int flags; +}; + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + shutdown->how = READ_ONCE(sqe->len); + return 0; +} + +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + struct socket *sock; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, shutdown->how); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + struct io_async_msghdr *async_msg = req->async_data; + + if (async_msg) + return -EAGAIN; + if (io_alloc_async_data(req)) { + kfree(kmsg->free_iov); + return -ENOMEM; + } + async_msg = req->async_data; + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(async_msg, kmsg, sizeof(*kmsg)); + async_msg->msg.msg_name = &async_msg->addr; + /* if were using fast_iov, set it to the new one */ + if (!async_msg->free_iov) + async_msg->msg.msg_iter.iov = async_msg->fast_iov; + + return -EAGAIN; +} + +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + iomsg->msg.msg_name = &iomsg->addr; + iomsg->free_iov = iomsg->fast_iov; + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); +} + +int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= 
REQ_F_NEED_CLEANUP; + return ret; +} + +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_send(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + 
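+/*
+ * (Editor's annotation, not part of the original patch.)
+ *
+ * Retry bookkeeping used by io_send() above and by io_sendmsg(): when a
+ * short transfer is resumable according to io_net_retry() (MSG_WAITALL
+ * on a stream or seqpacket socket), the bytes already sent are banked
+ * in sr->done_io, REQ_F_PARTIAL_IO is set, and -EAGAIN requeues the
+ * request so the next attempt resumes where this one stopped. At final
+ * completion, done_io is added back in (or returned outright if the
+ * last attempt failed), so userspace always sees the cumulative count.
+ */
+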
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + return -EINVAL; + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->free_iov, &iomsg->msg.msg_iter, + false); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->free_iov, + &iomsg->msg.msg_iter, true); + if (ret < 0) + return ret; + } + + return 0; +} +#endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->msg.msg_name = &iomsg->addr; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, iomsg); +#endif + + return __io_recvmsg_copy_hdr(req, iomsg); +} + +int io_recvmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + 
void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); + } + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } + + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_recv(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct socket *sock; + struct iovec iov; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + } + + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; + msg.msg_controllen = 0; + msg.msg_iocb = NULL; + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + ret = sock_recvmsg(sock, &msg, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +out_free: + req_set_fail(req); + } + + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_accept *accept = io_kiocb_to_cmd(req); + unsigned flags; + + if (sqe->len || sqe->buf_index) + return -EINVAL; + + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); + flags = 
READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; + return 0; +} + +int io_accept(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_accept *accept = io_kiocb_to_cmd(req); + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; + +retry: + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = IOU_ISSUE_SKIP_COMPLETE; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + + if (sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + sock->file_slot); 
+ } + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = io_kiocb_to_cmd(req); + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_connect *conn = io_kiocb_to_cmd(req); + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + return 0; +} + +int io_connect(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_connect *connect = io_kiocb_to_cmd(req); + struct io_async_connect __io, *io; + unsigned file_flags; + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (req_has_async_data(req)) { + io = req->async_data; + } else { + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, + &__io.address); + if (ret) + goto out; + io = &__io; + } + + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->address, + connect->addr_len, file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + if (req_has_async_data(req)) + return -EAGAIN; + if (io_alloc_async_data(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(req->async_data, &__io, sizeof(__io)); + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; +out: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/net.h b/io_uring/net.h new file mode 100644 index 00000000000000..81d71d1647704d --- /dev/null +++ b/io_uring/net.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#if defined(CONFIG_NET) +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + /* points to an allocated iov, if NULL we use fast_iov instead */ + struct iovec *free_iov; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); + +int io_sendmsg_prep_async(struct io_kiocb *req); +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); +int io_send(struct io_kiocb *req, unsigned int issue_flags); + +int io_recvmsg_prep_async(struct io_kiocb *req); +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); +int io_recv(struct io_kiocb *req, unsigned int issue_flags); + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_accept(struct io_kiocb *req, unsigned int issue_flags); + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_socket(struct io_kiocb *req, unsigned int issue_flags); + +int io_connect_prep_async(struct io_kiocb *req); +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_connect(struct io_kiocb *req, unsigned int issue_flags); +#endif From 36404b09aa609e00f8f0108356830c22b99b3cbf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:42:08 -0600 Subject: [PATCH 040/172] io_uring: move msg_ring into its own file 
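[Editor's illustration, not part of the commit: IORING_OP_MSG_RING posts a
raw CQE into another ring's completion queue, typically to wake or signal
the sibling thread that owns that ring. A sketch via liburing (assuming
liburing >= 2.2 for io_uring_prep_msg_ring(); target_fd, the other ring's
file descriptor, is illustrative):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* receiver observes an ordinary CQE: user_data == 0xcafe, res == 0 */
	io_uring_prep_msg_ring(sqe, target_fd, 0, 0xcafe, 0);
	io_uring_submit(&ring);
]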
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 55 +------------------------------------- io_uring/msg_ring.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/msg_ring.h | 4 +++ 4 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 io_uring/msg_ring.c create mode 100644 io_uring/msg_ring.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c9ec1bbabfbd36..d7cf992c841a38 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o + statx.o net.o msg_ring.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cbc20985cd6f34..a0173ab5178c95 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -103,6 +103,7 @@ #include "epoll.h" #include "statx.h" #include "net.h" +#include "msg_ring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -346,12 +347,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_msg { - struct file *file; - u64 user_data; - u32 len; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -3613,54 +3608,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_msg_ring_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) - return -EINVAL; - - msg->user_data = READ_ONCE(sqe->off); - msg->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; - bool filled; - int ret; - - ret = -EBADFD; - if (req->file->f_op != &io_uring_fops) - goto done; - - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); - ret = 0; - } - -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - /* put file to avoid an attempt to IOPOLL the req */ - io_put_file(req->file); - req->file = NULL; - return IOU_OK; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c new file mode 100644 index 00000000000000..3b89f9a0a0b459 --- /dev/null +++ b/io_uring/msg_ring.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "msg_ring.h" + +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) + return -EINVAL; + + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); + return 0; +} + +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *target_ctx; + bool filled; + int ret; + + ret = -EBADFD; + if (!io_is_uring_fops(req->file)) + goto done; + + ret = -EOVERFLOW; + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; + return IOU_OK; +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h new file mode 100644 index 00000000000000..fb9601f202d07d --- /dev/null +++ b/io_uring/msg_ring.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); From e418bbc97bffda868934acfdf8a1173ab044be69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:56:52 -0600 Subject: [PATCH 041/172] io_uring: move our reference counting into a header Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 +--------------------------------------- io_uring/refs.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 io_uring/refs.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0173ab5178c95..eea5282b1ca21b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "refs.h" #include "xattr.h" #include "nop.h" @@ -611,54 +612,12 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. 
- */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_REFCOUNT))) - return true; - - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) __io_submit_flush_completions(ctx); } -static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) -{ - if (!(req->flags & REQ_F_REFCOUNT)) { - req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, nr); - } -} - -static inline void io_req_set_refcount(struct io_kiocb *req) -{ - __io_req_set_refcount(req, 1); -} - #define IO_RSRC_REF_BATCH 100 static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) diff --git a/io_uring/refs.h b/io_uring/refs.h new file mode 100644 index 00000000000000..334c5ead4c43d9 --- /dev/null +++ b/io_uring/refs.h @@ -0,0 +1,48 @@ +#ifndef IOU_REQ_REF_H +#define IOU_REQ_REF_H + +#include +#include "io_uring_types.h" + +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, nr); + } +} + +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} +#endif From 59915143e89fb8dc7b5bd9dcaf628d8181fd54ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:57:27 -0600 Subject: [PATCH 042/172] io_uring: move timeout opcodes and handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 667 +------------------------------------- io_uring/io_uring.h | 13 +- io_uring/io_uring_types.h | 10 + io_uring/timeout.c | 634 ++++++++++++++++++++++++++++++++++++ io_uring/timeout.h | 35 ++ 6 files changed, 701 insertions(+), 660 deletions(-) create mode 100644 io_uring/timeout.c create mode 100644 io_uring/timeout.h diff --git a/io_uring/Makefile b/io_uring/Makefile index d7cf992c841a38..6ae4e45a15dbfc 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o + statx.o 
net.o msg_ring.o timeout.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eea5282b1ca21b..3fc59a22d54e89 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -105,6 +105,7 @@ #include "statx.h" #include "net.h" #include "msg_ring.h" +#include "timeout.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -288,14 +289,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_timeout_data { - struct io_kiocb *req; - struct hrtimer timer; - struct timespec64 ts; - enum hrtimer_mode mode; - u32 flags; -}; - struct io_cancel { struct file *file; u64 addr; @@ -303,27 +296,6 @@ struct io_cancel { s32 fd; }; -struct io_timeout { - struct file *file; - u32 off; - u32 target_seq; - struct list_head list; - /* head of the link, used by linked timeouts only */ - struct io_kiocb *head; - /* for linked completions */ - struct io_kiocb *prev; -}; - -struct io_timeout_rem { - struct file *file; - u64 addr; - - /* timeout update */ - struct timespec64 ts; - u32 flags; - bool ltimeout; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -388,16 +360,6 @@ struct io_defer_entry { u32 seq; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -436,7 +398,6 @@ static const struct io_op_def io_op_defs[]; #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, @@ -444,7 +405,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); -static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); @@ -456,9 +416,7 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; @@ -609,9 +567,6 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) } } -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) @@ -803,24 +758,6 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) io_ring_submit_unlock(ctx, issue_flags); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *req; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - static bool io_match_linked(struct io_kiocb *head) { struct 
io_kiocb *req; @@ -877,13 +814,6 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } -static inline bool io_is_timeout_noseq(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - return !timeout->off; -} - static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -1120,24 +1050,6 @@ static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_timeout_data *io = req->async_data; - - if (hrtimer_try_to_cancel(&io->timer) != -1) { - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - if (status) - req_set_fail(req); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); - } -} - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { @@ -1152,38 +1064,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) -{ - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_timeout *timeout, *tmp; - - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - u32 events_needed, events_got; - - if (io_is_timeout_noseq(req)) - break; - - /* - * Since seq can easily wrap around over time, subtract - * the last seq at which timeouts were flushed before comparing. - * Assuming not more than 2^31-1 events have happened since, - * these subtractions won't have wrapped, so we can check if - * target is in [last_seq, current_seq] by comparing the two. 
- */ - events_needed = timeout->target_seq - ctx->cq_last_tm_flush; - events_got = seq - ctx->cq_last_tm_flush; - if (events_got < events_needed) - break; - - io_kill_timeout(req, 0); - } - ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1585,14 +1465,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req) +void __io_req_complete_post(struct io_kiocb *req) { if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1717,7 +1597,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void io_free_req(struct io_kiocb *req) +__cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1731,96 +1611,6 @@ static __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } -static inline void io_remove_next_linked(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - req->link = nxt->link; - nxt->link = NULL; -} - -static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - struct io_timeout_data *io = link->async_data; - struct io_timeout *timeout = io_kiocb_to_cmd(link); - - io_remove_next_linked(req); - timeout->head = NULL; - if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&timeout->list); - return link; - } - } - return NULL; -} - -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; - while (link) { - long res = -ECANCELED; - - if (link->flags & REQ_F_FAIL) - res = link->cqe.res; - - nxt = link->link; - link->link = NULL; - - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); - - if (ignore_cqes) - link->flags |= REQ_F_CQE_SKIP; - else - link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; - } -} - -static bool io_disarm_next(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *link = NULL; - bool posted = false; - - if (req->flags & REQ_F_ARM_LTIMEOUT) { - link = req->link; - req->flags &= ~REQ_F_ARM_LTIMEOUT; - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); - posted = true; - } - } else if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); - if (link) { - posted = true; - io_req_tw_post_queue(link, -ECANCELED, 0); - } - } - if (unlikely((req->flags & REQ_F_FAIL) && - !(req->flags & REQ_F_HARDLINK))) { - posted |= (req->link != NULL); - io_fail_links(req); - } - return posted; -} - static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2033,7 +1823,7 @@ static void io_req_tw_post(struct io_kiocb *req, bool *locked) io_req_complete_post(req); } -static 
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; @@ -2057,7 +1847,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, -EFAULT); } -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; @@ -2076,7 +1866,7 @@ static void io_req_task_queue_reissue(struct io_kiocb *req) io_req_task_work_add(req); } -static void io_queue_next(struct io_kiocb *req) +void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2177,14 +1967,6 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) return nxt; } -static inline void io_put_req(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - io_queue_next(req); - io_free_req(req); - } -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -2451,7 +2233,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) +inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { req->cqe.flags |= io_put_kbuf(req, 0); @@ -4600,334 +4382,6 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *req = data->req; - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) - req_set_fail(req); - - io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout *timeout; - struct io_timeout_data *io; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, &ctx->timeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; - } - req = tmp; - break; - } - if (!req) - return ERR_PTR(-ENOENT); - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return ERR_PTR(-EALREADY); - timeout = io_kiocb_to_cmd(req); - list_del_init(&timeout->list); - return req; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); - - if (IS_ERR(req)) - return PTR_ERR(req); - io_req_task_queue_fail(req, -ECANCELED); - return 0; -} - -static clockid_t 
io_timeout_get_clock(struct io_timeout_data *data) -{ - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } -} - -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_timeout *timeout; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, &ctx->ltimeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (user_data == tmp->cqe.user_data) { - req = tmp; - break; - } - } - if (!req) - return -ENOENT; - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_cancel_data cd = { .data = user_data, }; - struct io_kiocb *req = io_timeout_extract(ctx, &cd); - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - - if (IS_ERR(req)) - return PTR_ERR(req); - - timeout->off = 0; /* noseq */ - data = req->async_data; - list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->buf_index || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - tr->ltimeout = false; - tr->addr = READ_ONCE(sqe->addr); - tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { - if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) - tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; - } else if (tr->flags) { - /* timeout removal doesn't support flags */ - return -EINVAL; - } - - return 0; -} - -static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) -{ - return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS - : HRTIMER_MODE_REL; -} - -/* - * Remove or update an existing timeout command - */ -static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - int ret; - - if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; - - spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, &cd); - spin_unlock(&ctx->completion_lock); - } else { - enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - - spin_lock_irq(&ctx->timeout_lock); - if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - bool is_timeout_link) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - unsigned flags; - u32 off = READ_ONCE(sqe->off); - - if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) - return -EINVAL; - if (off && is_timeout_link) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) - return -EINVAL; - /* more than one clock specified is invalid, obviously */ - if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; - - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -ENOMEM; - - data = req->async_data; - data->req = req; - data->flags = flags; - - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); - - if (is_timeout_link) { - struct io_submit_link *link = &req->ctx->submit_state.link; - - if (!link->head) - return -EINVAL; - if (link->last->opcode == IORING_OP_LINK_TIMEOUT) - return -EINVAL; - timeout->head = link->last; - link->last->flags |= REQ_F_ARM_LTIMEOUT; - } - return 0; -} - -static int io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, false); -} - -static int io_link_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, true); -} - -static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = req->async_data; - struct list_head *entry; - u32 tail, off = timeout->off; - - spin_lock_irq(&ctx->timeout_lock); - - /* - * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. If it isn't set, then this is - * a pure timeout request, sequence isn't used. - */ - if (io_is_timeout_noseq(req)) { - entry = ctx->timeout_list.prev; - goto add; - } - - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - timeout->target_seq = tail + off; - - /* Update the last seq here in case io_flush_timeouts() hasn't. 
- * This is safe because ->completion_lock is held, and submissions - * and completions are never mixed in the same ->completion_lock section. - */ - ctx->cq_last_tm_flush = tail; - - /* - * Insertion sort, ensuring the first entry in the list is always - * the one we need first. - */ - list_for_each_prev(entry, &ctx->timeout_list) { - struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); - struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); - - if (io_is_timeout_noseq(nxt)) - continue; - /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nextt->target_seq - tail) - break; - } -add: - list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); - return IOU_ISSUE_SKIP_COMPLETE; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4979,7 +4433,7 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5462,84 +4916,6 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; - - if (prev) { - if (!(req->task->flags & PF_EXITING)) { - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = prev->cqe.user_data, - }; - - ret = io_try_cancel(req, &cd); - } - io_req_set_res(req, ret ?: -ETIME, 0); - io_req_complete_post(req); - io_put_req(prev); - } else { - io_req_set_res(req, -ETIME, 0); - io_req_complete_post(req); - } -} - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *prev, *req = data->req; - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = timeout->head; - timeout->head = NULL; - - /* - * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. 
- */ - if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) - prev = NULL; - } - list_del(&timeout->list); - timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - /* - * If the back reference is NULL, then our linked request finished - * before we got a chance to setup the timer - */ - if (timeout->head) { - struct io_timeout_data *data = req->async_data; - - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); - list_add_tail(&timeout->list, &ctx->ltimeout_list); - } - spin_unlock_irq(&ctx->timeout_lock); - /* drop submission reference */ - io_put_req(req); -} - static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { @@ -8116,31 +7492,6 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } -/* Returns true if we found and killed one or more timeouts */ -static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct io_timeout *timeout, *tmp; - int canceled = 0; - - spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); - return canceled != 0; -} - static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4b46385720c585..e285e12ccbdbb1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,7 +65,8 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) } void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); - +void io_req_complete_post(struct io_kiocb *req); +void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); @@ -96,5 +97,15 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_req_task_queue_fail(struct io_kiocb *req, int ret); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); + +void io_free_req(struct io_kiocb *req); +void io_queue_next(struct io_kiocb *req); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) #endif diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index dba72113c59d67..349524907b6bfc 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -488,4 +488,14 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; 
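+	/* ->seq tags requests already visited by an ALL/ANY cancel pass */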
+}; + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c new file mode 100644 index 00000000000000..5e42bfcd683e2a --- /dev/null +++ b/io_uring/timeout.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "timeout.h" + +struct io_timeout { + struct file *file; + u32 off; + u32 target_seq; + struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; +}; + +struct io_timeout_rem { + struct file *file; + u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; + bool ltimeout; +}; + +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; +} + +static inline void io_put_req(struct io_kiocb *req) +{ + if (req_ref_put_and_test(req)) { + io_queue_next(req); + io_free_req(req); + } +} + +static void io_kill_timeout(struct io_kiocb *req, int status) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = req->async_data; + + if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + if (status) + req_set_fail(req); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + list_del_init(&timeout->list); + io_req_tw_post_queue(req, status, 0); + } +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) +{ + u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_timeout *timeout, *tmp; + + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + u32 events_needed, events_got; + + if (io_is_timeout_noseq(req)) + break; + + /* + * Since seq can easily wrap around over time, subtract + * the last seq at which timeouts were flushed before comparing. + * Assuming not more than 2^31-1 events have happened since, + * these subtractions won't have wrapped, so we can check if + * target is in [last_seq, current_seq] by comparing the two. 
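+		 *
+		 * Worked u32 example: with cq_last_tm_flush == 0xfffffff0,
+		 * target_seq == 4 and seq == 8, events_needed is 20 and
+		 * events_got is 24, so the timeout still fires even though
+		 * both values wrapped past zero.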
+ */ + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; + events_got = seq - ctx->cq_last_tm_flush; + if (events_got < events_needed) + break; + + io_kill_timeout(req, 0); + } + ctx->cq_last_tm_flush = seq; + spin_unlock_irq(&ctx->timeout_lock); +} + +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + req->link = NULL; + while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->cqe.res; + + nxt = link->link; + link->link = NULL; + + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, + req->opcode, link); + + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else + link->flags &= ~REQ_F_CQE_SKIP; + io_req_set_res(link, res, 0); + __io_req_complete_post(link); + link = nxt; + } +} + +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + +bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = NULL; + bool posted = false; + + if (req->flags & REQ_F_ARM_LTIMEOUT) { + link = req->link; + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_req_tw_post_queue(link, -ECANCELED, 0); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + link = io_disarm_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } + } + if (unlikely((req->flags & REQ_F_FAIL) && + !(req->flags & REQ_F_HARDLINK))) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; +} + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); + + io_remove_next_linked(req); + timeout->head = NULL; + if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&timeout->list); + return link; + } + + return NULL; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + list_del_init(&timeout->list); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); + + io_req_set_res(req, -ETIME, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout *timeout; + struct io_timeout_data *io; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != tmp->cqe.user_data) + continue; + if (cd->flags & 
(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == tmp->work.cancel_seq) + continue; + tmp->work.cancel_seq = cd->seq; + } + req = tmp; + break; + } + if (!req) + return ERR_PTR(-ENOENT); + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return ERR_PTR(-EALREADY); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); + return req; +} + +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); + + if (IS_ERR(req)) + return PTR_ERR(req); + io_req_task_queue_fail(req, -ECANCELED); + return 0; +} + +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; + int ret = -ENOENT; + + if (prev) { + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); + io_put_req(prev); + } else { + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); + } +} + +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + prev = timeout->head; + timeout->head = NULL; + + /* + * We don't expect the list to be empty, that will only happen if we + * race with the completion of the linked work. 
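+	 * If we do lose that race, prev is left NULL and
+	 * io_req_task_link_timeout() completes this request with -ETIME.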
+ */ + if (prev) { + io_remove_next_linked(prev); + if (!req_ref_inc_not_zero(prev)) + prev = NULL; + } + list_del(&timeout->list); + timeout->prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (user_data == tmp->cqe.user_data) { + req = tmp; + break; + } + } + if (!req) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + + if (IS_ERR(req)) + return PTR_ERR(req); + + timeout->off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&timeout->list, &ctx->timeout_list); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + tr->ltimeout = false; + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + return -EINVAL; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ + return -EINVAL; + } + + return 0; +} + +static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) +{ + return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS + : HRTIMER_MODE_REL; +} + +/* + * Remove or update an existing timeout command + */ +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + + spin_lock(&ctx->completion_lock); + ret = io_timeout_cancel(ctx, &cd); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static int __io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool is_timeout_link) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + unsigned flags; + u32 off = READ_ONCE(sqe->off); + + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) + return -EINVAL; + if (off && is_timeout_link) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; + if (unlikely(off && !req->ctx->off_timeout_used)) + req->ctx->off_timeout_used = true; + + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -ENOMEM; + + data = req->async_data; + data->req = req; + data->flags = flags; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + data->mode = io_translate_timeout_mode(flags); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + timeout->head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } + return 0; +} + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, false); +} + +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, true); +} + +int io_timeout(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data = req->async_data; + struct list_head *entry; + u32 tail, off = timeout->off; + + spin_lock_irq(&ctx->timeout_lock); + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. If it isn't set, then this is + * a pure timeout request, sequence isn't used. + */ + if (io_is_timeout_noseq(req)) { + entry = ctx->timeout_list.prev; + goto add; + } + + tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + timeout->target_seq = tail + off; + + /* Update the last seq here in case io_flush_timeouts() hasn't. 
+ * This is safe because ->completion_lock is held, and submissions + * and completions are never mixed in the same ->completion_lock section. + */ + ctx->cq_last_tm_flush = tail; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); + + if (io_is_timeout_noseq(nxt)) + continue; + /* nxt.seq is behind @tail, otherwise would've been completed */ + if (off >= nextt->target_seq - tail) + break; + } +add: + list_add(&timeout->list, entry); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return IOU_ISSUE_SKIP_COMPLETE; +} + +void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + /* + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer + */ + if (timeout->head) { + struct io_timeout_data *data = req->async_data; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); + list_add_tail(&timeout->list, &ctx->ltimeout_list); + } + spin_unlock_irq(&ctx->timeout_lock); + /* drop submission reference */ + io_put_req(req); +} + +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* Returns true if we found and killed one or more timeouts */ +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct io_timeout *timeout, *tmp; + int canceled = 0; + + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + + if (io_match_task(req, tsk, cancel_all)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + spin_unlock_irq(&ctx->timeout_lock); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} diff --git a/io_uring/timeout.h b/io_uring/timeout.h new file mode 100644 index 00000000000000..dd7cfb0d936671 --- /dev/null +++ b/io_uring/timeout.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_timeout_data { + struct io_kiocb *req; + struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 flags; +}; + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + +static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link = req->link; + + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) + return __io_disarm_linked_timeout(req, link); + + return NULL; +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx); +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all); +void io_queue_linked_timeout(struct io_kiocb *req); +bool 
io_disarm_next(struct io_kiocb *req); + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout(struct io_kiocb *req, unsigned int issue_flags); +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags); From 17437f311490d873a5157f65a84317d16270fd38 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 09:13:39 -0600 Subject: [PATCH 043/172] io_uring: move SQPOLL related handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 467 +------------------------------------------- io_uring/io_uring.h | 34 ++++ io_uring/sqpoll.c | 426 ++++++++++++++++++++++++++++++++++++++++ io_uring/sqpoll.h | 29 +++ 5 files changed, 497 insertions(+), 462 deletions(-) create mode 100644 io_uring/sqpoll.c create mode 100644 io_uring/sqpoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 6ae4e45a15dbfc..c59a9ca74262a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o + statx.o net.o msg_ring.o timeout.o \ + sqpoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3fc59a22d54e89..17c555aa03bc9c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "sqpoll.h" #include "xattr.h" #include "nop.h" @@ -109,7 +110,6 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) @@ -214,31 +214,6 @@ struct io_buffer { __u16 bgid; }; -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -402,7 +377,6 @@ static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1079,13 +1053,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline bool io_sqring_full(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -1974,28 +1941,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* make sure SQ 
entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; -} - -static inline bool io_run_task_work(void) -{ - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - - return false; -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = BLK_POLL_NOSLEEP; @@ -5297,7 +5243,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) +int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); @@ -5349,173 +5295,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -static inline bool io_sqd_events_pending(struct io_sq_data *sqd) -{ - return READ_ONCE(sqd->state); -} - -static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) -{ - unsigned int to_submit; - int ret = 0; - - to_submit = io_sqring_entries(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { - const struct cred *creds = NULL; - - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - - mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, true); - - /* - * Don't submit if refs are dying, good for io_uring_register(), - * but also it is relied upon by io_ring_exit_work() - */ - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && - !(ctx->flags & IORING_SETUP_R_DISABLED)) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - if (creds) - revert_creds(creds); - } - - return ret; -} - -static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - unsigned sq_thread_idle = 0; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); - sqd->sq_thread_idle = sq_thread_idle; -} - -static bool io_sqd_handle_event(struct io_sq_data *sqd) -{ - bool did_sig = false; - struct ksignal ksig; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - mutex_unlock(&sqd->lock); - if (signal_pending(current)) - did_sig = get_signal(&ksig); - cond_resched(); - mutex_lock(&sqd->lock); - } - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static int io_sq_thread(void *data) -{ - struct io_sq_data *sqd = data; - struct io_ring_ctx *ctx; - unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; - DEFINE_WAIT(wait); - - snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); - set_task_comm(current, buf); - - if (sqd->sq_cpu != -1) - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else - set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; - - audit_alloc_kernel(current); - - mutex_lock(&sqd->lock); - while (1) { - bool cap_entries, sqt_spin = false; - - if (io_sqd_events_pending(sqd) || signal_pending(current)) { - if 
(io_sqd_handle_event(sqd)) - break; - timeout = jiffies + sqd->sq_thread_idle; - } - - cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); - - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) - sqt_spin = true; - } - if (io_run_task_work()) - sqt_spin = true; - - if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); - if (sqt_spin) - timeout = jiffies + sqd->sq_thread_idle; - continue; - } - - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { - bool needs_sched = true; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - atomic_or(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - - /* - * Ensure the store of the wakeup flag is not - * reordered with the load of the SQ tail - */ - smp_mb__after_atomic(); - - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched) { - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); - } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_andnot(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - } - - finish_wait(&sqd->wait, &wait); - timeout = jiffies + sqd->sq_thread_idle; - } - - io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); - io_run_task_work(); - mutex_unlock(&sqd->lock); - - audit_free(current); - - complete(&sqd->exited); - do_exit(0); -} - struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -5934,131 +5713,6 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - /* - * Do the dance but not conditional clear_bit() because it'd race with - * other threads incrementing park_pending and setting the bit. 
- */ - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (atomic_dec_return(&sqd->park_pending)) - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - atomic_inc(&sqd->park_pending); - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); -} - -static void io_sq_thread_stop(struct io_sq_data *sqd) -{ - WARN_ON_ONCE(sqd->thread == current); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); -} - -static void io_put_sq_data(struct io_sq_data *sqd) -{ - if (refcount_dec_and_test(&sqd->refs)) { - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); - - io_sq_thread_stop(sqd); - kfree(sqd); - } -} - -static void io_sq_thread_finish(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx_attach; - struct io_sq_data *sqd; - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return ERR_PTR(-ENXIO); - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - ctx_attach = f.file->private_data; - sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); - return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); - return ERR_PTR(-EPERM); - } - - refcount_inc(&sqd->refs); - fdput(f); - return sqd; -} - -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, - bool *attached) -{ - struct io_sq_data *sqd; - - *attached = false; - if (p->flags & IORING_SETUP_ATTACH_WQ) { - sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) { - *attached = true; - return sqd; - } - /* fall through for EPERM case, setup new sqd/task */ - if (PTR_ERR(sqd) != -EPERM) - return sqd; - } - - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); - if (!sqd) - return ERR_PTR(-ENOMEM); - - atomic_set(&sqd->park_pending, 0); - refcount_set(&sqd->refs, 1); - INIT_LIST_HEAD(&sqd->ctx_list); - mutex_init(&sqd->lock); - init_waitqueue_head(&sqd->wait); - init_completion(&sqd->exited); - return sqd; -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -6495,8 +6149,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static __cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -6554,96 +6208,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - int ret; - - /* Retain compatibility with failing for an invalid attach attempt */ - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == - IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return -ENXIO; - if (f.file->f_op != 
&io_uring_fops) { - fdput(f); - return -EINVAL; - } - fdput(f); - } - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct task_struct *tsk; - struct io_sq_data *sqd; - bool attached; - - ret = security_uring_sqpoll(); - if (ret) - return ret; - - sqd = io_get_sq_data(p, &attached); - if (IS_ERR(sqd)) { - ret = PTR_ERR(sqd); - goto err; - } - - ctx->sq_creds = get_current_cred(); - ctx->sq_data = sqd; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - /* don't attach to a dying SQPOLL thread, would be racy */ - ret = (attached && !sqd->thread) ? -ENXIO : 0; - io_sq_thread_unpark(sqd); - - if (ret < 0) - goto err; - if (attached) - return 0; - - if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = p->sq_thread_cpu; - - ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; - } else { - sqd->sq_cpu = -1; - } - - sqd->task_pid = current->pid; - sqd->task_tgid = current->tgid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto err_sqpoll; - } - - sqd->thread = tsk; - ret = io_uring_alloc_task_context(tsk, ctx); - wake_up_new_task(tsk); - if (ret) - goto err; - } else if (p->flags & IORING_SETUP_SQ_AFF) { - /* Can't have SQ_AFF without SQPOLL */ - ret = -EINVAL; - goto err; - } - - return 0; -err_sqpoll: - complete(&ctx->sq_data->exited); -err: - io_sq_thread_finish(ctx); - return ret; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7755,8 +7319,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
 */
-static __cold void io_uring_cancel_generic(bool cancel_all,
-					   struct io_sq_data *sqd)
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx;
@@ -8034,24 +7597,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 
 #endif /* !CONFIG_MMU */
 
-static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
-{
-	DEFINE_WAIT(wait);
-
-	do {
-		if (!io_sqring_full(ctx))
-			break;
-		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
-
-		if (!io_sqring_full(ctx))
-			break;
-		schedule();
-	} while (!signal_pending(current));
-
-	finish_wait(&ctx->sqo_sq_wait, &wait);
-	return 0;
-}
-
 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
 {
 	if (flags & IORING_ENTER_EXT_ARG) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e285e12ccbdbb1..1da8e66507a350 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -64,6 +64,34 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 }
 
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+{
+	struct io_rings *r = ctx->rings;
+
+	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
+}
+
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = ctx->rings;
+
+	/* make sure SQ entry isn't read before tail */
+	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
+static inline bool io_run_task_work(void)
+{
+	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
+		__set_current_state(TASK_RUNNING);
+		clear_notify_signal();
+		if (task_work_pending(current))
+			task_work_run();
+		return true;
+	}
+
+	return false;
+}
+
 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
 void io_req_complete_post(struct io_kiocb *req);
 void __io_req_complete_post(struct io_kiocb *req);
@@ -101,6 +129,12 @@ void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_complete(struct io_kiocb *req, bool *locked);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
+int io_uring_alloc_task_context(struct task_struct *task,
+				struct io_ring_ctx *ctx);
+
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
new file mode 100644
index 00000000000000..149d5c976f1467
--- /dev/null
+++ b/io_uring/sqpoll.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Contains the core associated with submission side polling of the SQ
+ * ring, offloading submissions from the application to a kernel thread.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <linux/security.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "sqpoll.h"
+
+#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+
+enum {
+	IO_SQ_THREAD_SHOULD_STOP = 0,
+	IO_SQ_THREAD_SHOULD_PARK,
+};
+
+void io_sq_thread_unpark(struct io_sq_data *sqd)
+	__releases(&sqd->lock)
+{
+	WARN_ON_ONCE(sqd->thread == current);
+
+	/*
+	 * Do the dance but not conditional clear_bit() because it'd race with
+	 * other threads incrementing park_pending and setting the bit.
+ */ + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_unlock(&sqd->lock); +} + +void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + WARN_ON_ONCE(sqd->thread == current); + + atomic_inc(&sqd->park_pending); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); +} + +void io_sq_thread_stop(struct io_sq_data *sqd) +{ + WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); + + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); + mutex_unlock(&sqd->lock); + wait_for_completion(&sqd->exited); +} + +void io_put_sq_data(struct io_sq_data *sqd) +{ + if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); + sqd->sq_thread_idle = sq_thread_idle; +} + +void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (!io_is_uring_fops(f.file)) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return ERR_PTR(-EINVAL); + } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) +{ + struct io_sq_data *sqd; + + *attached = false; + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) { + *attached = true; + return sqd; + } + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sqd->park_pending, 0); + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + init_completion(&sqd->exited); + return sqd; +} + +static inline bool io_sqd_events_pending(struct io_sq_data *sqd) +{ + return READ_ONCE(sqd->state); +} + +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) +{ + unsigned int to_submit; + int ret = 0; + + to_submit = io_sqring_entries(ctx); + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; + + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); + + mutex_lock(&ctx->uring_lock); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, 
true); + + /* + * Don't submit if refs are dying, good for io_uring_register(), + * but also it is relied upon by io_ring_exit_work() + */ + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + if (creds) + revert_creds(creds); + } + + return ret; +} + +static bool io_sqd_handle_event(struct io_sq_data *sqd) +{ + bool did_sig = false; + struct ksignal ksig; + + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + mutex_unlock(&sqd->lock); + if (signal_pending(current)) + did_sig = get_signal(&ksig); + cond_resched(); + mutex_lock(&sqd->lock); + } + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static int io_sq_thread(void *data) +{ + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; + DEFINE_WAIT(wait); + + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; + + audit_alloc_kernel(current); + + mutex_lock(&sqd->lock); + while (1) { + bool cap_entries, sqt_spin = false; + + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) + break; + timeout = jiffies + sqd->sq_thread_idle; + } + + cap_entries = !list_is_singular(&sqd->ctx_list); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + int ret = __io_sq_thread(ctx, cap_entries); + + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + sqt_spin = true; + } + if (io_run_task_work()) + sqt_spin = true; + + if (sqt_spin || !time_after(jiffies, timeout)) { + cond_resched(); + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + bool needs_sched = true; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !wq_list_empty(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + + /* + * Ensure the store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb__after_atomic(); + + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { + mutex_unlock(&sqd->lock); + schedule(); + mutex_lock(&sqd->lock); + } + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; + } + + io_uring_cancel_generic(true, sqd); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); + io_run_task_work(); + mutex_unlock(&sqd->lock); + + audit_free(current); + + complete(&sqd->exited); + do_exit(0); +} + +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); + return 0; +} + 
+__cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + int ret; + + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (!io_is_uring_fops(f.file)) { + fdput(f); + return -EINVAL; + } + fdput(f); + } + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct task_struct *tsk; + struct io_sq_data *sqd; + bool attached; + + ret = security_uring_sqpoll(); + if (ret) + return ret; + + sqd = io_get_sq_data(p, &attached); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_creds = get_current_cred(); + ctx->sq_data = sqd; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + ret = (attached && !sqd->thread) ? -ENXIO : 0; + io_sq_thread_unpark(sqd); + + if (ret < 0) + goto err; + if (attached) + return 0; + + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu = p->sq_thread_cpu; + + ret = -EINVAL; + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + goto err_sqpoll; + sqd->sq_cpu = cpu; + } else { + sqd->sq_cpu = -1; + } + + sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + goto err_sqpoll; + } + + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); + wake_up_new_task(tsk); + if (ret) + goto err; + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + + return 0; +err_sqpoll: + complete(&ctx->sq_data->exited); +err: + io_sq_thread_finish(ctx); + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h new file mode 100644 index 00000000000000..0c3fbcd1f583f6 --- /dev/null +++ b/io_uring/sqpoll.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_sq_data { + refcount_t refs; + atomic_t park_pending; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + + struct task_struct *thread; + struct wait_queue_head wait; + + unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + pid_t task_tgid; + + unsigned long state; + struct completion exited; +}; + +int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); +void io_sq_thread_finish(struct io_ring_ctx *ctx); +void io_sq_thread_stop(struct io_sq_data *sqd); +void io_sq_thread_park(struct io_sq_data *sqd); +void io_sq_thread_unpark(struct io_sq_data *sqd); +void io_put_sq_data(struct io_sq_data *sqd); +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); From e5550a1447bf8d82f32b58e2ba54792a6985d080 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:28:04 -0600 Subject: [PATCH 044/172] io_uring: use io_is_uring_fops() consistently Convert the last spots that check for io_uring_fops to use the provided helper instead. 
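
The conversion is mechanical; as an illustration (mirroring one of the
hunks below), a check such as

	if (unlikely(f.file->f_op != &io_uring_fops))
		goto out_fput;

becomes

	if (unlikely(!io_is_uring_fops(f.file)))
		goto out_fput;

which also clears the way for dropping the io_uring_fops forward
declaration later on.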
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 17c555aa03bc9c..687900b84ce08c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2309,7 +2309,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) if (S_ISREG(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - file->f_op != &io_uring_fops) + !io_is_uring_fops(file)) return true; return false; } @@ -4857,7 +4857,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ - if (file && file->f_op == &io_uring_fops) + if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } @@ -5949,7 +5949,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); goto fail; } @@ -5996,7 +5996,7 @@ int io_install_fixed_file(struct io_kiocb *req, struct file *file, struct io_fixed_file *file_slot; int ret; - if (file->f_op == &io_uring_fops) + if (io_is_uring_fops(file)) return -EBADF; if (!ctx->file_data) return -ENXIO; @@ -6096,7 +6096,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, * still no point in allowing a ring fd as it doesn't * support regular read/write anyway. */ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; @@ -7416,7 +7416,7 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, file = fget(fd); if (!file) { return -EBADF; - } else if (file->f_op != &io_uring_fops) { + } else if (!io_is_uring_fops(file)) { fput(file); return -EOPNOTSUPP; } @@ -7677,7 +7677,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) + if (unlikely(!io_is_uring_fops(f.file))) goto out_fput; ret = -ENXIO; @@ -8852,7 +8852,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return -EBADF; ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) + if (!io_is_uring_fops(f.file)) goto out_fput; ctx = f.file->private_data; From a4ad4f748ea96202451f88b6385d7283d0f2e513 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:40:19 -0600 Subject: [PATCH 045/172] io_uring: move fdinfo helpers to its own file This also means moving a bit more of the fixed file handling to the filetable side, which makes sense separately too. 
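
For example (taken from the call sites converted below), a fixed file
lookup that used to take the ring context:

	struct file *file = io_file_from_index(ctx, i);

now goes through the file table itself:

	struct file *file = io_file_from_index(&ctx->file_table, i);

with io_file_from_index() and io_fixed_file_set() becoming inline
helpers in filetable.h.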
Signed-off-by: Jens Axboe
---
 io_uring/Makefile         |   2 +-
 io_uring/fdinfo.c         | 191 ++++++++++++++++++++++++++++++++++
 io_uring/fdinfo.h         |   3 +
 io_uring/filetable.h      |  19 ++++
 io_uring/io_uring.c       | 210 +-------------------------------------
 io_uring/io_uring_types.h |  13 +++
 6 files changed, 230 insertions(+), 208 deletions(-)
 create mode 100644 io_uring/fdinfo.c
 create mode 100644 io_uring/fdinfo.h

diff --git a/io_uring/Makefile b/io_uring/Makefile
index c59a9ca74262a4..ed0bf42db4ae4c 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					sync.o advise.o filetable.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
-					sqpoll.o
+					sqpoll.o fdinfo.o
 obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
new file mode 100644
index 00000000000000..fcedde4b4b1e1f
--- /dev/null
+++ b/io_uring/fdinfo.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "sqpoll.h"
+#include "fdinfo.h"
+
+#ifdef CONFIG_PROC_FS
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
+		const struct cred *cred)
+{
+	struct user_namespace *uns = seq_user_ns(m);
+	struct group_info *gi;
+	kernel_cap_t cap;
+	unsigned __capi;
+	int g;
+
+	seq_printf(m, "%5d\n", id);
+	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
+	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
+	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
+	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
+	seq_puts(m, "\n\tGroups:\t");
+	gi = cred->group_info;
+	for (g = 0; g < gi->ngroups; g++) {
+		seq_put_decimal_ull(m, g ? " " : "",
+					from_kgid_munged(uns, gi->gid[g]));
+	}
+	seq_puts(m, "\n\tCapEff:\t");
+	cap = cred->cap_effective;
+	CAP_FOR_EACH_U32(__capi)
+		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+					  struct seq_file *m)
+{
+	struct io_sq_data *sq = NULL;
+	struct io_overflow_cqe *ocqe;
+	struct io_rings *r = ctx->rings;
+	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+	unsigned int sq_head = READ_ONCE(r->sq.head);
+	unsigned int sq_tail = READ_ONCE(r->sq.tail);
+	unsigned int cq_head = READ_ONCE(r->cq.head);
+	unsigned int cq_tail = READ_ONCE(r->cq.tail);
+	unsigned int cq_shift = 0;
+	unsigned int sq_entries, cq_entries;
+	bool has_lock;
+	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	unsigned int i;
+
+	if (is_cqe32)
+		cq_shift = 1;
+
+	/*
+	 * we may get imprecise sqe and cqe info if uring is actively running
+	 * since we get cached_sq_head and cached_cq_tail without uring_lock
+	 * and sq_tail and cq_head are changed by userspace. But it's ok since
+	 * we usually use these info when it is stuck.
+ */ + seq_printf(m, "SqMask:\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; + + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } + } + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } + + seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { + struct file *f = io_file_from_index(&ctx->file_table, i); + + if (f) + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + else + seq_printf(m, "%5u: \n", i); + } + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *buf = ctx->user_bufs[i]; + unsigned int len = buf->ubuf_end - buf->ubuf; + + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + } + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + + seq_printf(m, "Personalities:\n"); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); + } + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + task_work_pending(req->task)); + } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + + spin_unlock(&ctx->completion_lock); +} + +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct io_ring_ctx *ctx = f->private_data; + + if (percpu_ref_tryget(&ctx->refs)) { + __io_uring_show_fdinfo(ctx, m); + percpu_ref_put(&ctx->refs); + } +} +#endif diff --git a/io_uring/fdinfo.h b/io_uring/fdinfo.h new file mode 100644 index 00000000000000..6fde48c450e300 --- /dev/null +++ b/io_uring/fdinfo.h @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 + +void io_uring_show_fdinfo(struct seq_file *m, struct file *f); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fe1ec581958dda..6e1675f406b729 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -36,6 +36,8 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); int io_file_bitmap_get(struct io_ring_ctx *ctx); +unsigned int io_file_get_flags(struct file *file); + static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { __clear_bit(bit, table->bitmap); @@ -55,4 +57,21 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +static inline struct file *io_file_from_index(struct io_file_table *table, + int index) +{ + struct io_fixed_file *slot = io_fixed_file_slot(table, index); + + return (struct file *) (slot->file_ptr & FFS_MASK); +} + +static inline void io_fixed_file_set(struct io_fixed_file *file_slot, + struct file *file) +{ + unsigned long file_ptr = (unsigned long) file; + + file_ptr |= io_file_get_flags(file); + file_slot->file_ptr = file_ptr; +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 687900b84ce08c..95c2083a0da8b3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "io_uring.h" #include "refs.h" #include "sqpoll.h" +#include "fdinfo.h" #include "xattr.h" #include "nop.h" @@ -137,21 +138,8 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - 
struct io_ring_ctx; -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -2325,7 +2313,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static unsigned int io_file_get_flags(struct file *file) +unsigned int io_file_get_flags(struct file *file) { umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; @@ -4810,22 +4798,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) -{ - struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); -} - -static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) -{ - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; -} - inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { @@ -5667,7 +5639,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) int i; for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(ctx, i); + struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; @@ -7777,182 +7749,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return ret; } -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - unsigned __capi; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - CAP_FOR_EACH_U32(__capi) - seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); - seq_putc(m, '\n'); - return 0; -} - -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) -{ - struct io_sq_data *sq = NULL; - struct io_overflow_cqe *ocqe; - struct io_rings *r = ctx->rings; - unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; - unsigned int sq_head = READ_ONCE(r->sq.head); - unsigned int sq_tail = READ_ONCE(r->sq.tail); - unsigned int cq_head = READ_ONCE(r->cq.head); - unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; - unsigned int sq_entries, cq_entries; - bool has_lock; - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - unsigned int i; - - if (is_cqe32) - cq_shift = 1; - - /* - * we may get imprecise sqe and cqe info if uring is actively running - * since we get cached_sq_head and cached_cq_tail without uring_lock - * and sq_tail and cq_head are changed by userspace. But it's ok since - * we usually use these info when it is stuck. - */ - seq_printf(m, "SqMask:\t0x%x\n", sq_mask); - seq_printf(m, "SqHead:\t%u\n", sq_head); - seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); - seq_printf(m, "CqMask:\t0x%x\n", cq_mask); - seq_printf(m, "CqHead:\t%u\n", cq_head); - seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); - for (i = 0; i < sq_entries; i++) { - unsigned int entry = i + sq_head; - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); - struct io_uring_sqe *sqe; - - if (sq_idx > sq_mask) - continue; - sqe = &ctx->sq_sqes[sq_idx]; - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", - sq_idx, sqe->opcode, sqe->fd, sqe->flags, - sqe->user_data); - } - seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - - if (!is_cqe32) { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags); - } else { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " - "extra1:%llu, extra2:%llu\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); - } - } - - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; - } - - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(ctx, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: \n", i); - } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - - seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; - struct io_kiocb *req; - - hlist_for_each_entry(req, list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - } - - seq_puts(m, "CqOverflowList:\n"); - list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; - - seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", - cqe->user_data, cqe->res, cqe->flags); - - } - - spin_unlock(&ctx->completion_lock); -} - -static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} -#endif - static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 349524907b6bfc..147e1e597530a9 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -498,4 +498,17 @@ struct io_cancel_data { int seq; }; +struct io_overflow_cqe { + struct list_head list; + struct io_uring_cqe cqe; +}; + +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + #endif From c9f06aa7de153cdbe424e7e38be39f2272cf78bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:01:04 -0600 Subject: [PATCH 046/172] io_uring: move io_uring_task (tctx) helpers into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 368 +------------------------------------------- io_uring/io_uring.h | 4 + io_uring/tctx.c | 332 +++++++++++++++++++++++++++++++++++++++ io_uring/tctx.h | 55 +++++++ 5 files changed, 396 insertions(+), 365 deletions(-) create mode 100644 io_uring/tctx.c create mode 100644 io_uring/tctx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index ed0bf42db4ae4c..2db085cdedad84 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o + sqpoll.o fdinfo.o tctx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 95c2083a0da8b3..d7336e6c9f2349 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "tctx.h" #include "sqpoll.h" 
#include "fdinfo.h" @@ -208,30 +209,6 @@ struct io_buffer { #define BGID_ARRAY 64 -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - struct file **registered_rings; - bool task_running; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -311,12 +288,6 @@ enum { IO_CHECK_CQ_DROPPED_BIT, }; -struct io_tctx_node { - struct list_head ctx_node; - struct task_struct *task; - struct io_ring_ctx *ctx; -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -361,7 +332,6 @@ static const struct io_op_def io_op_defs[]; #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -1677,7 +1647,7 @@ static void handle_tw_list(struct io_wq_work_node *node, } while (node); } -static void tctx_task_work(struct callback_head *cb) +void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; struct io_ring_ctx *ctx = NULL; @@ -4725,7 +4695,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4733,7 +4703,7 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static void io_wq_submit_work(struct io_wq_work *work) +void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -6089,97 +6059,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? 
done : err; } -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - struct io_wq_hash *hash; - struct io_wq_data data; - unsigned int concurrency; - - mutex_lock(&ctx->uring_lock); - hash = ctx->hash_map; - if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) { - mutex_unlock(&ctx->uring_lock); - return ERR_PTR(-ENOMEM); - } - refcount_set(&hash->refs, 1); - init_waitqueue_head(&hash->wait); - ctx->hash_map = hash; - } - mutex_unlock(&ctx->uring_lock); - - data.hash = hash; - data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - return io_wq_create(concurrency, &data); -} - -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx; - int ret; - - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); - if (unlikely(!tctx)) - return -ENOMEM; - - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); - if (unlikely(ret)) { - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - tctx->io_wq = io_init_wq_offload(ctx, task); - if (IS_ERR(tctx->io_wq)) { - ret = PTR_ERR(tctx->io_wq); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - xa_init(&tctx->xa); - init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - init_task_work(&tctx->task_work, tctx_task_work); - return 0; -} - -void __io_uring_free(struct task_struct *tsk) -{ - struct io_uring_task *tctx = tsk->io_uring; - - WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); - - kfree(tctx->registered_rings); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7179,107 +7058,6 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - int ret; - - if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; - - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; - - ret = io_wq_max_workers(tctx->io_wq, limits); - if (ret) - return ret; - } - } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); - } - tctx->last = ctx; - return 0; -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. 
- */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (likely(tctx && tctx->last == ctx)) - return 0; - return __io_uring_add_tctx_node(ctx); -} - -/* - * Remove this io_uring_file -> task mapping. - */ -static __cold void io_uring_del_tctx_node(unsigned long index) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - - if (!tctx) - return; - node = xa_erase(&tctx->xa, index); - if (!node) - return; - - WARN_ON_ONCE(current != node->task); - WARN_ON_ONCE(list_empty(&node->ctx_node)); - - mutex_lock(&node->ctx->uring_lock); - list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); - - if (tctx->last == node->ctx) - tctx->last = NULL; - kfree(node); -} - -static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) -{ - struct io_wq *wq = tctx->io_wq; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - io_uring_del_tctx_node(index); - cond_resched(); - } - if (wq) { - /* - * Must be after io_uring_del_tctx_node() (removes nodes under - * uring_lock) to avoid race with io_uring_try_cancel_iowq(). - */ - io_wq_put_and_exit(wq); - tctx->io_wq = NULL; - } -} - static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) @@ -7361,144 +7139,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -void io_uring_unreg_ringfd(void) -{ - struct io_uring_task *tctx = current->io_uring; - int i; - - for (i = 0; i < IO_RINGFD_REG_MAX; i++) { - if (tctx->registered_rings[i]) { - fput(tctx->registered_rings[i]); - tctx->registered_rings[i] = NULL; - } - } -} - -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, - int start, int end) -{ - struct file *file; - int offset; - - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; - } - - return -EBUSY; -} - -/* - * Register a ring fd to avoid fdget/fdput for each io_uring_enter() - * invocation. User passes in an array of struct io_uring_rsrc_update - * with ->data set to the ring_fd, and ->offset given for the desired - * index. If no index is desired, application may set ->offset == -1U - * and we'll find an available index. Returns number of entries - * successfully processed, or < 0 on error if none were processed. 
- */ -static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_rsrc_update reg; - struct io_uring_task *tctx; - int ret, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - - mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); - mutex_lock(&ctx->uring_lock); - if (ret) - return ret; - - tctx = current->io_uring; - for (i = 0; i < nr_args; i++) { - int start, end; - - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - - if (reg.resv) { - ret = -EINVAL; - break; - } - - if (reg.offset == -1U) { - start = 0; - end = IO_RINGFD_REG_MAX; - } else { - if (reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - start = reg.offset; - end = start + 1; - } - - ret = io_ring_add_registered_fd(tctx, reg.data, start, end); - if (ret < 0) - break; - - reg.offset = ret; - if (copy_to_user(&arg[i], ®, sizeof(reg))) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - ret = -EFAULT; - break; - } - } - - return i ? i : ret; -} - -static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_task *tctx = current->io_uring; - struct io_uring_rsrc_update reg; - int ret = 0, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - if (!tctx) - return 0; - - for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - - reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[reg.offset]) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - } - } - - return i ? 
i : ret;
-}
-
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 1da8e66507a350..60678e88a9b92c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -128,6 +128,7 @@ void io_req_task_work_add(struct io_kiocb *req);
 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_complete(struct io_kiocb *req, bool *locked);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
+void tctx_task_work(struct callback_head *cb);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 int io_uring_alloc_task_context(struct task_struct *task,
@@ -136,6 +137,9 @@ int io_uring_alloc_task_context(struct task_struct *task,
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+void io_wq_submit_work(struct io_wq_work *work);
+
 void io_free_req(struct io_kiocb *req);
 void io_queue_next(struct io_kiocb *req);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
new file mode 100644
index 00000000000000..3f7e9feb6ca2bc
--- /dev/null
+++ b/io_uring/tctx.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "tctx.h"
+
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
+					struct task_struct *task)
+{
+	struct io_wq_hash *hash;
+	struct io_wq_data data;
+	unsigned int concurrency;
+
+	mutex_lock(&ctx->uring_lock);
+	hash = ctx->hash_map;
+	if (!hash) {
+		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+		if (!hash) {
+			mutex_unlock(&ctx->uring_lock);
+			return ERR_PTR(-ENOMEM);
+		}
+		refcount_set(&hash->refs, 1);
+		init_waitqueue_head(&hash->wait);
+		ctx->hash_map = hash;
+	}
+	mutex_unlock(&ctx->uring_lock);
+
+	data.hash = hash;
+	data.task = task;
+	data.free_work = io_wq_free_work;
+	data.do_work = io_wq_submit_work;
+
+	/* Do QD, or 4 * CPUS, whatever is smallest */
+	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+
+	return io_wq_create(concurrency, &data);
+}
+
+void __io_uring_free(struct task_struct *tsk)
+{
+	struct io_uring_task *tctx = tsk->io_uring;
+
+	WARN_ON_ONCE(!xa_empty(&tctx->xa));
+	WARN_ON_ONCE(tctx->io_wq);
+	WARN_ON_ONCE(tctx->cached_refs);
+
+	kfree(tctx->registered_rings);
+	percpu_counter_destroy(&tctx->inflight);
+	kfree(tctx);
+	tsk->io_uring = NULL;
+}
+
+__cold int io_uring_alloc_task_context(struct task_struct *task,
+				       struct io_ring_ctx *ctx)
+{
+	struct io_uring_task *tctx;
+	int ret;
+
+	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
+	if (unlikely(!tctx))
+		return -ENOMEM;
+
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
+	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
+	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
+		kfree(tctx);
+		return ret;
+	}
+
+	tctx->io_wq = io_init_wq_offload(ctx, task);
+	if (IS_ERR(tctx->io_wq)) {
+		ret = PTR_ERR(tctx->io_wq);
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
+		kfree(tctx);
+		return ret;
+	}
+
+	xa_init(&tctx->xa);
+	init_waitqueue_head(&tctx->wait);
+	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->inflight_tracked, 0);
+	task->io_uring = tctx;
+
spin_lock_init(&tctx->task_lock); + INIT_WQ_LIST(&tctx->task_list); + INIT_WQ_LIST(&tctx->prio_task_list); + init_task_work(&tctx->task_work, tctx_task_work); + return 0; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + int ret; + + if (unlikely(!tctx)) { + ret = io_uring_alloc_task_context(current, ctx); + if (unlikely(ret)) + return ret; + + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } + } + if (!xa_load(&tctx->xa, (unsigned long)ctx)) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); + } + tctx->last = ctx; + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +__cold void io_uring_del_tctx_node(unsigned long index) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + + if (!tctx) + return; + node = xa_erase(&tctx->xa, index); + if (!node) + return; + + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->ctx) + tctx->last = NULL; + kfree(node); +} + +__cold void io_uring_clean_tctx(struct io_uring_task *tctx) +{ + struct io_wq *wq = tctx->io_wq; + struct io_tctx_node *node; + unsigned long index; + + xa_for_each(&tctx->xa, index, node) { + io_uring_del_tctx_node(index); + cond_resched(); + } + if (wq) { + /* + * Must be after io_uring_del_tctx_node() (removes nodes under + * uring_lock) to avoid race with io_uring_try_cancel_iowq(). + */ + io_wq_put_and_exit(wq); + tctx->io_wq = NULL; + } +} + +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. 
+ */ +int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.resv) { + ret = -EINVAL; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? i : ret; +} + +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} diff --git a/io_uring/tctx.h b/io_uring/tctx.h new file mode 100644 index 00000000000000..f4964e40d07e0d --- /dev/null +++ b/io_uring/tctx.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + struct xarray xa; + struct wait_queue_head wait; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct percpu_counter inflight; + atomic_t inflight_tracked; + atomic_t in_idle; + + spinlock_t task_lock; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + struct file **registered_rings; + bool task_running; +}; + +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct io_ring_ctx *ctx; +}; + +int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); +void io_uring_del_tctx_node(unsigned long index); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +void io_uring_clean_tctx(struct io_uring_task *tctx); + +void io_uring_unreg_ringfd(void); +int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args); +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args); + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. 
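+ * The inline fast path below (io_uring_add_tctx_node()) only compares + * tctx->last, so back-to-back submits to the same ring skip the xarray lookup.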
+ */ +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (likely(tctx && tctx->last == ctx)) + return 0; + return __io_uring_add_tctx_node(ctx); +} From 92ac8beaea1f4931f932da3dcc850547621107fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:48:35 -0600 Subject: [PATCH 047/172] io_uring: include and forward-declaration sanitation Remove some dead headers we no longer need, and get rid of the io_ring_ctx and io_uring_fops forward declarations. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d7336e6c9f2349..c1229704adfdf5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -70,11 +70,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -86,7 +84,6 @@ #include -#include "../fs/internal.h" #include "io-wq.h" #include "io_uring_types.h" @@ -139,8 +136,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_ring_ctx; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -352,8 +347,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; - const char *io_uring_get_opcode(u8 opcode) { switch ((enum io_uring_op)opcode) { @@ -457,11 +450,6 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } -bool io_is_uring_fops(struct file *file) -{ - return file->f_op == &io_uring_fops; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -7402,6 +7390,11 @@ static const struct file_operations io_uring_fops = { #endif }; +bool io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { From cfd22e6b3319adc7c2a8a092e19ac16575dabc86 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:57:03 -0600 Subject: [PATCH 048/172] io_uring: add opcode name to io_op_defs This kills the last per-op switch. 
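To illustrate the pattern outside the kernel (a standalone sketch in plain C with made-up ops, not the real opcode table): a designated-initializer array ties each opcode's name to its handler, and a single bounds check replaces the switch's default case.

	#include <stdio.h>

	enum op { OP_NOP, OP_READ, OP_WRITE, OP_LAST };

	struct op_def {
		const char	*name;
		int		(*issue)(int arg);	/* per-op handler */
	};

	static int do_nop(int arg)   { (void)arg; return 0; }
	static int do_read(int arg)  { return arg; }
	static int do_write(int arg) { return arg * 2; }

	/* one entry per opcode; array index == opcode value */
	static const struct op_def op_defs[] = {
		[OP_NOP]   = { .name = "NOP",   .issue = do_nop },
		[OP_READ]  = { .name = "READ",  .issue = do_read },
		[OP_WRITE] = { .name = "WRITE", .issue = do_write },
	};

	/* name lookup becomes a bounds check plus an array access */
	static const char *op_name(unsigned int op)
	{
		return op < OP_LAST ? op_defs[op].name : "INVALID";
	}

	int main(void)
	{
		printf("%s -> %d\n", op_name(OP_READ), op_defs[OP_READ].issue(21));
		printf("%s\n", op_name(0xff));	/* out of range: INVALID */
		return 0;
	}

The WARN_ON_ONCE(!io_op_defs[i].name) added to io_uring_init() further down serves the same purpose at boot: it catches table entries that forgot to set .name.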
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 150 +++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c1229704adfdf5..3f006907c8c5f4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,6 +315,8 @@ struct io_op_def { /* size of async data needed, if any */ unsigned short async_size; + const char *name; + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); @@ -349,104 +351,8 @@ static struct kmem_cache *req_cachep; const char *io_uring_get_opcode(u8 opcode) { - switch ((enum io_uring_op)opcode) { - case IORING_OP_NOP: - return "NOP"; - case IORING_OP_READV: - return "READV"; - case IORING_OP_WRITEV: - return "WRITEV"; - case IORING_OP_FSYNC: - return "FSYNC"; - case IORING_OP_READ_FIXED: - return "READ_FIXED"; - case IORING_OP_WRITE_FIXED: - return "WRITE_FIXED"; - case IORING_OP_POLL_ADD: - return "POLL_ADD"; - case IORING_OP_POLL_REMOVE: - return "POLL_REMOVE"; - case IORING_OP_SYNC_FILE_RANGE: - return "SYNC_FILE_RANGE"; - case IORING_OP_SENDMSG: - return "SENDMSG"; - case IORING_OP_RECVMSG: - return "RECVMSG"; - case IORING_OP_TIMEOUT: - return "TIMEOUT"; - case IORING_OP_TIMEOUT_REMOVE: - return "TIMEOUT_REMOVE"; - case IORING_OP_ACCEPT: - return "ACCEPT"; - case IORING_OP_ASYNC_CANCEL: - return "ASYNC_CANCEL"; - case IORING_OP_LINK_TIMEOUT: - return "LINK_TIMEOUT"; - case IORING_OP_CONNECT: - return "CONNECT"; - case IORING_OP_FALLOCATE: - return "FALLOCATE"; - case IORING_OP_OPENAT: - return "OPENAT"; - case IORING_OP_CLOSE: - return "CLOSE"; - case IORING_OP_FILES_UPDATE: - return "FILES_UPDATE"; - case IORING_OP_STATX: - return "STATX"; - case IORING_OP_READ: - return "READ"; - case IORING_OP_WRITE: - return "WRITE"; - case IORING_OP_FADVISE: - return "FADVISE"; - case IORING_OP_MADVISE: - return "MADVISE"; - case IORING_OP_SEND: - return "SEND"; - case IORING_OP_RECV: - return "RECV"; - case IORING_OP_OPENAT2: - return "OPENAT2"; - case IORING_OP_EPOLL_CTL: - return "EPOLL_CTL"; - case IORING_OP_SPLICE: - return "SPLICE"; - case IORING_OP_PROVIDE_BUFFERS: - return "PROVIDE_BUFFERS"; - case IORING_OP_REMOVE_BUFFERS: - return "REMOVE_BUFFERS"; - case IORING_OP_TEE: - return "TEE"; - case IORING_OP_SHUTDOWN: - return "SHUTDOWN"; - case IORING_OP_RENAMEAT: - return "RENAMEAT"; - case IORING_OP_UNLINKAT: - return "UNLINKAT"; - case IORING_OP_MKDIRAT: - return "MKDIRAT"; - case IORING_OP_SYMLINKAT: - return "SYMLINKAT"; - case IORING_OP_LINKAT: - return "LINKAT"; - case IORING_OP_MSG_RING: - return "MSG_RING"; - case IORING_OP_FSETXATTR: - return "FSETXATTR"; - case IORING_OP_SETXATTR: - return "SETXATTR"; - case IORING_OP_FGETXATTR: - return "FGETXATTR"; - case IORING_OP_GETXATTR: - return "GETXATTR"; - case IORING_OP_SOCKET: - return "SOCKET"; - case IORING_OP_URING_CMD: - return "URING_CMD"; - case IORING_OP_LAST: - return "INVALID"; - } + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; return "INVALID"; } @@ -8307,6 +8213,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, + .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -8320,6 +8227,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READV", .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, @@ -8335,6 +8243,7 @@ static const struct io_op_def 
io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, @@ -8343,6 +8252,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, + .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -8355,6 +8265,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, }, @@ -8368,6 +8279,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, }, @@ -8375,17 +8287,20 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, + .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, + .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -8394,6 +8309,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, + .name = "SENDMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, @@ -8410,6 +8326,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, + .name = "RECVMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, @@ -8423,12 +8340,14 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, + .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -8438,6 +8357,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ + .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -8447,12 +8367,14 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, + .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -8460,6 +8382,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .name = "CONNECT", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, @@ -8471,26 +8394,31 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { + .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { + .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, + .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, 
+ .name = "STATX", .prep = io_statx_prep, .issue = io_statx, .cleanup = io_statx_cleanup, @@ -8505,6 +8433,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ", .prep = io_prep_rw, .issue = io_read, }, @@ -8518,16 +8447,19 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE", .prep = io_prep_rw, .issue = io_write, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, + .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { + .name = "MADVISE", .prep = io_madvise_prep, .issue = io_madvise, }, @@ -8537,6 +8469,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, + .name = "SEND", #if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, @@ -8551,6 +8484,7 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, + .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, @@ -8559,6 +8493,7 @@ static const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_OPENAT2] = { + .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, .cleanup = io_open_cleanup, @@ -8566,6 +8501,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -8578,18 +8514,21 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -8598,11 +8537,13 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, + .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -8611,26 +8552,31 @@ static const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", .prep = io_renameat_prep, .issue = io_renameat, .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { + .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, .cleanup = io_link_cleanup, @@ -8638,33 +8584,39 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, + .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, + .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { + .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, .cleanup = 
io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, + .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { + .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, + .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -8675,6 +8627,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, + .name = "URING_CMD", .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, @@ -8752,6 +8705,7 @@ static int __init io_uring_init(void) BUG_ON(!io_op_defs[i].prep); if (io_op_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); } req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | From 329061d3e2f9a0082a097e9558bd5497098586c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:31:09 -0600 Subject: [PATCH 049/172] io_uring: move poll handling into its own file Add an io_poll_issue() rather than export the general task_work locking and io_issue_sqe(), and put the io_op_defs definition and structure into a separate header file so that poll can use it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 842 +------------------------------------------- io_uring/io_uring.h | 32 ++ io_uring/opdef.h | 40 +++ io_uring/poll.c | 760 +++++++++++++++++++++++++++++++++++++++ io_uring/poll.h | 30 ++ 6 files changed, 879 insertions(+), 827 deletions(-) create mode 100644 io_uring/opdef.h create mode 100644 io_uring/poll.c create mode 100644 io_uring/poll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2db085cdedad84..eb1b07a9951639 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o + sqpoll.o fdinfo.o tctx.o poll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3f006907c8c5f4..b4ecfa7238555f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -88,6 +88,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" @@ -106,6 +107,7 @@ #include "net.h" #include "msg_ring.h" #include "timeout.h" +#include "poll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -208,22 +210,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions!
See also 'struct kiocb' in <linux/fs.h> */ -struct io_poll { - struct file *file; - struct wait_queue_head *head; - __poll_t events; - struct wait_queue_entry wait; -}; - -struct io_poll_update { - struct file *file; - u64 old_user_data; - u64 new_user_data; - __poll_t events; - bool update_events; - bool update_user_data; -}; - struct io_cancel { struct file *file; u64 addr; @@ -268,11 +254,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct async_poll { - struct io_poll poll; - struct io_poll *double_poll; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, @@ -289,42 +270,6 @@ struct io_defer_entry { u32 seq; }; -struct io_op_def { - /* needs req->file assigned */ - unsigned needs_file : 1; - /* should block plug */ - unsigned plug : 1; - /* hash wq insertion if file is a regular file */ - unsigned hash_reg_file : 1; - /* unbound wq insertion if file is a non-regular file */ - unsigned unbound_nonreg_file : 1; - /* set if opcode supports polled "wait" */ - unsigned pollin : 1; - unsigned pollout : 1; - unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; - /* skip auditing */ - unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* size of async data needed, if any */ - unsigned short async_size; - - const char *name; - - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); - int (*prep_async)(struct io_kiocb *); - void (*cleanup)(struct io_kiocb *); -}; - -static const struct io_op_def io_op_defs[]; - /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -529,32 +474,12 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return xa_load(&ctx->io_bl_xa, bgid); } -static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. - */ - if (req->opcode == IORING_OP_READV) - return; - /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. @@ -599,8 +524,8 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock.
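* (the function may take ctx->timeout_lock itself to check linked timeouts, * so holding that lock here would deadlock)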
*/ -static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) { bool matched; @@ -1310,7 +1235,7 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) io_req_complete_post(req); } -static void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); @@ -1656,7 +1581,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, req->cqe.res); } -static void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ @@ -3437,749 +3362,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int nr_entries; - int error; -}; - -#define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK GENMASK(30, 0) - -/* - * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can - * bump it and acquire ownership. It's disallowed to modify requests while not - * owning it, that prevents from races for enqueueing task_work's and b/w - * arming poll and wakeups. - */ -static inline bool io_poll_get_ownership(struct io_kiocb *req) -{ - return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); -} - -static void io_poll_mark_cancelled(struct io_kiocb *req) -{ - atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); -} - -static struct io_poll *io_poll_get_double(struct io_kiocb *req) -{ - /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ - if (req->opcode == IORING_OP_POLL_ADD) - return req->async_data; - return req->apoll->double_poll; -} - -static struct io_poll *io_poll_get_single(struct io_kiocb *req) -{ - if (req->opcode == IORING_OP_POLL_ADD) - return io_kiocb_to_cmd(req); - return &req->apoll->poll; -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - -static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - -static inline void io_poll_remove_entry(struct io_poll *poll) -{ - struct wait_queue_head *head = smp_load_acquire(&poll->head); - - if (head) { - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } -} - -static void io_poll_remove_entries(struct io_kiocb *req) -{ - /* - * Nothing to do if neither of those flags are set. Avoid dipping - * into the poll/apoll/double cachelines if we can. - */ - if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) - return; - - /* - * While we hold the waitqueue lock and the waitqueue is nonempty, - * wake_up_pollfree() will wait for us. However, taking the waitqueue - * lock in the first place can race with the waitqueue being freed. 
- * - * We solve this as eventpoll does: by taking advantage of the fact that - * all users of wake_up_pollfree() will RCU-delay the actual free. If - * we enter rcu_read_lock() and see that the pointer to the queue is - * non-NULL, we can then lock it without the memory being freed out from - * under us. - * - * Keep holding rcu_read_lock() as long as we hold the queue lock, in - * case the caller deletes the entry from the queue, leaving it empty. - * In that case, only RCU prevents the queue memory from being freed. - */ - rcu_read_lock(); - if (req->flags & REQ_F_SINGLE_POLL) - io_poll_remove_entry(io_poll_get_single(req)); - if (req->flags & REQ_F_DOUBLE_POLL) - io_poll_remove_entry(io_poll_get_double(req)); - rcu_read_unlock(); -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); -/* - * All poll tw should go through this. Checks for poll events, manages - * references, does rewait, etc. - * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. - */ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int v, ret; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - return -ECANCELED; - - do { - v = atomic_read(&req->poll_refs); - - /* tw handler should be the owner, and so have some references */ - if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; - if (v & IO_POLL_CANCEL_FLAG) - return -ECANCELED; - - if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; - } - - if ((unlikely(!req->cqe.res))) - continue; - if (req->apoll_events & EPOLLONESHOT) - return 0; - - /* multishot, just fill a CQE and proceed */ - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & - req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - - io_tw_lock(req->ctx, locked); - if (unlikely(req->task->flags & PF_EXITING)) - return -EFAULT; - ret = io_issue_sqe(req, - IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (ret) - return ret; - - /* - * Release all references, retry if someone tried to restart - * task_work while we were executing it. 
- */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - - return 1; -} - -static void io_poll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - if (!ret) { - struct io_poll *poll = io_kiocb_to_cmd(req); - - req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { - req->cqe.res = ret; - req_set_fail(req); - } - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); - - if (!ret) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, ret); -} - -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) -{ - io_req_set_res(req, mask, 0); - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; - - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); -} - -static void io_poll_cancel_req(struct io_kiocb *req) -{ - io_poll_mark_cancelled(req); - /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); -} - -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) - -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wqe_to_req(wait); - struct io_poll *poll = container_of(wait, struct io_poll, wait); - __poll_t mask = key_to_poll(key); - - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. 
- */ - smp_store_release(&poll->head, NULL); - return 1; - } - - /* for instances that support it check for an event match first */ - if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) - return 0; - - if (io_poll_get_ownership(req)) { - /* optional, saves extra locking for removal in tw handler */ - if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; - if (wqe_is_double(wait)) - req->flags &= ~REQ_F_DOUBLE_POLL; - else - req->flags &= ~REQ_F_SINGLE_POLL; - } - __io_poll_execute(req, mask, poll->events); - } - return 1; -} - -static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, - struct wait_queue_head *head, - struct io_poll **poll_ptr) -{ - struct io_kiocb *req = pt->req; - unsigned long wqe_private = (unsigned long) req; - - /* - * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll - * if this happens. - */ - if (unlikely(pt->nr_entries)) { - struct io_poll *first = poll; - - /* double add on the same waitqueue head, ignore */ - if (first->head == head) - return; - /* already have a 2nd entry, fail a third attempt */ - if (*poll_ptr) { - if ((*poll_ptr)->head == head) - return; - pt->error = -EINVAL; - return; - } - - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; - return; - } - /* mark as double wq entry */ - wqe_private |= 1; - req->flags |= REQ_F_DOUBLE_POLL; - io_init_poll_iocb(poll, first->events, first->wait.func); - *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; - } - - req->flags |= REQ_F_SINGLE_POLL; - pt->nr_entries++; - poll->head = head; - poll->wait.private = (void *) wqe_private; - - if (poll->events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(head, &poll->wait); - else - add_wait_queue(head, &poll->wait); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct io_poll *poll = io_kiocb_to_cmd(pt->req); - - __io_queue_proc(poll, pt, head, - (struct io_poll **) &pt->req->async_data); -} - -static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) -{ - struct io_ring_ctx *ctx = req->ctx; - int v; - - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); - poll->file = req->file; - - req->apoll_events = poll->events; - - ipt->pt._key = mask; - ipt->req = req; - ipt->error = 0; - ipt->nr_entries = 0; - - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). 
- */ - atomic_set(&req->poll_refs, 1); - mask = vfs_poll(req->file, &ipt->pt) & poll->events; - - if (mask && (poll->events & EPOLLONESHOT)) { - io_poll_remove_entries(req); - /* no one else has access to the req, forget about the ref */ - return mask; - } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - - spin_lock(&ctx->completion_lock); - io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); - - if (mask) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } - __io_poll_execute(req, mask, poll->events); - return 0; - } - - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); - return 0; -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -enum { - IO_APOLL_OK, - IO_APOLL_ABORTED, - IO_APOLL_READY -}; - -static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - struct async_poll *apoll; - struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; - int ret; - - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) - return IO_APOLL_ABORTED; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) - mask |= EPOLLONESHOT; - - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; - req->flags |= REQ_F_POLLED; - ipt.pt._qproc = io_async_queue_proc; - - io_kbuf_recycle(req, issue_flags); - - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; - - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); - return IO_APOLL_OK; -} - -/* - * Returns true if we found and killed one or more poll requests - */ -static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - int i; - - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { - hlist_del_init(&req->hash_node); - io_poll_cancel_req(req); - found = true; - } - } - } - spin_unlock(&ctx->completion_lock); - return found; -} - -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct hlist_head *list; - struct io_kiocb *req; - - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (cd->data != req->cqe.user_data) - continue; - if (poll_only && req->opcode != IORING_OP_POLL_ADD) - continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - return req; - } - return NULL; -} - -static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - int i; - - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - return req; - } - } - return NULL; -} - -static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) -{ - if (!io_poll_get_ownership(req)) - return false; - io_poll_remove_entries(req); - hash_del(&req->hash_node); - return true; -} - -static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); - else - req = io_poll_find(ctx, false, cd); - if (!req) - return -ENOENT; - io_poll_cancel_req(req); - return 0; -} - -static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, - unsigned int flags) -{ - u32 events; - - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - if (!(flags & IORING_POLL_ADD_MULTI)) - events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); -} - -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_poll_update *upd = io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | - IORING_POLL_ADD_MULTI)) - return -EINVAL; - /* meaningless without update */ - if (flags == IORING_POLL_ADD_MULTI) - return -EINVAL; - - upd->old_user_data = READ_ONCE(sqe->addr); - upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - upd->update_user_data = flags & 
IORING_POLL_UPDATE_USER_DATA; - - upd->new_user_data = READ_ONCE(sqe->off); - if (!upd->update_user_data && upd->new_user_data) - return -EINVAL; - if (upd->update_events) - upd->events = io_poll_parse_events(sqe, flags); - else if (sqe->poll32_events) - return -EINVAL; - - return 0; -} - -static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_poll *poll = io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->off || sqe->addr) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) - return -EINVAL; - if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) - return -EINVAL; - - io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); - return 0; -} - -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll *poll = io_kiocb_to_cmd(req); - struct io_poll_table ipt; - int ret; - - ipt.pt._qproc = io_poll_queue_proc; - - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll_update *poll_update = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *preq; - int ret2, ret = 0; - bool locked; - - spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; - goto out; - } - spin_unlock(&ctx->completion_lock); - - if (poll_update->update_events || poll_update->update_user_data) { - /* only mask one event flags, keep behavior flags */ - if (poll_update->update_events) { - struct io_poll *poll = io_kiocb_to_cmd(preq); - - poll->events &= ~0xffff; - poll->events |= poll_update->events & 0xffff; - poll->events |= IO_POLL_UNMASK; - } - if (poll_update->update_user_data) - preq->cqe.user_data = poll_update->new_user_data; - - ret2 = io_poll_add(preq, issue_flags); - /* successfully updated, don't complete poll request */ - if (!ret2 || ret2 == -EIOCBQUEUED) - goto out; - } - - req_set_fail(preq); - io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); -out: - if (ret < 0) { - req_set_fail(req); - return ret; - } - /* complete update request, we're done with it */ - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4589,6 +3771,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +int io_poll_issue(struct io_kiocb *req, bool *locked) +{ + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); +} + struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -8209,7 +7399,7 @@ static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) return -ECANCELED; } -static const struct io_op_def io_op_defs[] = { +const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, diff --git a/io_uring/io_uring.h 
b/io_uring/io_uring.h index 60678e88a9b92c..1ceac4ea62bf3b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline bool io_run_task_work(void) return false; } +void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); @@ -109,6 +110,32 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -128,12 +155,14 @@ void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); +void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); @@ -143,6 +172,9 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) diff --git a/io_uring/opdef.h b/io_uring/opdef.h new file mode 100644 index 00000000000000..4578adcdba8a41 --- /dev/null +++ b/io_uring/opdef.h @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_OP_DEF_H +#define IOU_OP_DEF_H + +struct io_op_def { + /* needs req->file assigned */ + unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + unsigned poll_exclusive : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* skip auditing */ + unsigned 
audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* size of async data needed, if any */ + unsigned short async_size; + + const char *name; + + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); +}; + +extern const struct io_op_def io_op_defs[]; +#endif diff --git a/io_uring/poll.c b/io_uring/poll.c new file mode 100644 index 00000000000000..c3e4fcb0a7ba7b --- /dev/null +++ b/io_uring/poll.c @@ -0,0 +1,760 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/hashtable.h> +#include <linux/io_uring.h> + +#include <trace/events/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "opdef.h" +#include "poll.h" + +struct io_poll_update { + struct file *file; + u64 old_user_data; + u64 new_user_data; + __poll_t events; + bool update_events; + bool update_user_data; +}; + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int nr_entries; + int error; +}; + +#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK GENMASK(30, 0) + +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning them; that prevents races when enqueueing task_work and between + * arming poll and wakeups. + */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + +static void io_poll_mark_cancelled(struct io_kiocb *req) +{ + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); +} + +static struct io_poll *io_poll_get_double(struct io_kiocb *req) +{ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return req->async_data; + return req->apoll->double_poll; +} + +static struct io_poll *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return io_kiocb_to_cmd(req); + return &req->apoll->poll; +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + +static inline void io_poll_remove_entry(struct io_poll *poll) +{ + struct wait_queue_head *head = smp_load_acquire(&poll->head); + + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } +} + +static void io_poll_remove_entries(struct io_kiocb *req) +{ + /* + * Nothing to do if neither of those flags are set. Avoid dipping + * into the poll/apoll/double cachelines if we can. + */ + if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) + return; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us.
However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + if (req->flags & REQ_F_SINGLE_POLL) + io_poll_remove_entry(io_poll_get_single(req)); + if (req->flags & REQ_F_DOUBLE_POLL) + io_poll_remove_entry(io_poll_get_double(req)); + rcu_read_unlock(); +} + +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action is required, which + * means either a spurious wakeup or a served multishot CQE. 0 when it's done + * with the request, then the mask is stored in req->cqe.res. + */ +static int io_poll_check_events(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int v, ret; + + /* req->task == current here, checking PF_EXITING is safe */ + if (unlikely(req->task->flags & PF_EXITING)) + return -ECANCELED; + + do { + v = atomic_read(&req->poll_refs); + + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; + + if (!req->cqe.res) { + struct poll_table_struct pt = { ._key = req->apoll_events }; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + } + + if (unlikely(!req->cqe.res)) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; + } + + ret = io_poll_issue(req, locked); + if (ret) + return ret; + + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it.
+ */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + + return 1; +} + +static void io_poll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + if (!ret) { + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); + } else { + req->cqe.res = ret; + req_set_fail(req); + } + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + req->cqe.flags = 0; + __io_req_complete_post(req); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask, + __poll_t __maybe_unused events) +{ + io_req_set_res(req, mask, 0); + /* + * This is useful for poll that is armed on behalf of another + * request, and where the wakeup path could be on a different + * CPU. We want to avoid pulling in req->apoll->events for that + * case. + */ + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res, events); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0, 0); +} + +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wqe_to_req(wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); + __poll_t mask = key_to_poll(key); + + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0, poll->events); + + /* + * If the waitqueue is being freed early but someone already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock.
+ */ + smp_store_release(&poll->head, NULL); + return 1; + } + + /* for instances that support it check for an event match first */ + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) + return 0; + + if (io_poll_get_ownership(req)) { + /* optional, saves extra locking for removal in tw handler */ + if (mask && poll->events & EPOLLONESHOT) { + list_del_init(&poll->wait.entry); + poll->head = NULL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; + } + __io_poll_execute(req, mask, poll->events); + } + return 1; +} + +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, + struct wait_queue_head *head, + struct io_poll **poll_ptr) +{ + struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; + + /* + * The file being polled uses multiple waitqueues for poll handling + * (e.g. one for read, one for write). Setup a separate io_poll + * if this happens. + */ + if (unlikely(pt->nr_entries)) { + struct io_poll *first = poll; + + /* double add on the same waitqueue head, ignore */ + if (first->head == head) + return; + /* already have a 2nd entry, fail a third attempt */ + if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; + pt->error = -EINVAL; + return; + } + + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + if (!poll) { + pt->error = -ENOMEM; + return; + } + /* mark as double wq entry */ + wqe_private |= 1; + req->flags |= REQ_F_DOUBLE_POLL; + io_init_poll_iocb(poll, first->events, first->wait.func); + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; + } + + req->flags |= REQ_F_SINGLE_POLL; + pt->nr_entries++; + poll->head = head; + poll->wait.private = (void *) wqe_private; + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); +} + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); + + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); +} + +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll *poll, + struct io_poll_table *ipt, __poll_t mask) +{ + struct io_ring_ctx *ctx = req->ctx; + int v; + + INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + io_init_poll_iocb(poll, mask, io_poll_wake); + poll->file = req->file; + + req->apoll_events = poll->events; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = 0; + ipt->nr_entries = 0; + + /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). 
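+	 *
+	 * (Wakeups firing while vfs_poll() runs below cannot take
+	 * ownership and merely bump poll_refs; the atomic_dec_return()
+	 * at the end of this function detects them and kicks task_work.)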
+ */ + atomic_set(&req->poll_refs, 1); + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + } + + spin_lock(&ctx->completion_lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) { + poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + ipt->error = 0; + } + __io_poll_execute(req, mask, poll->events); + return 0; + } + + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0, poll->events); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); +} + +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask = POLLPRI | POLLERR; + int ret; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + if (!file_can_poll(req->file)) + return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + kfree(apoll->double_poll); + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + } + apoll->double_poll = NULL; + req->apoll = apoll; + req->flags |= REQ_F_POLLED; + ipt.pt._qproc = io_async_queue_proc; + + io_kbuf_recycle(req, issue_flags); + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + if (ret || ipt.error) + return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; + + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, + mask, apoll->poll.events); + return IO_APOLL_OK; +} + +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + int i; + + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); + io_poll_cancel_req(req); + found = true; + } + } + } + spin_unlock(&ctx->completion_lock); + return found; +} + +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct hlist_head *list; + struct io_kiocb *req; + + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (cd->data != req->cqe.user_data) + continue; + if (poll_only && req->opcode != IORING_OP_POLL_ADD) + continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + return req; + } + return NULL; +} + +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + +static bool io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); + if (!req) + return -ENOENT; + io_poll_cancel_req(req); + return 0; +} + +static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, + unsigned int flags) +{ + u32 events; + + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + if (!(flags & IORING_POLL_ADD_MULTI)) + events |= EPOLLONESHOT; + return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); +} + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_update *upd = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | + IORING_POLL_ADD_MULTI)) + return -EINVAL; + /* meaningless without update */ + if (flags == IORING_POLL_ADD_MULTI) + return -EINVAL; + + upd->old_user_data = READ_ONCE(sqe->addr); + upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; + + 
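+	/* the replacement user_data rides in sqe->off and is only
+	 * meaningful with IORING_POLL_UPDATE_USER_DATA, enforced below
+	 */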
upd->new_user_data = READ_ONCE(sqe->off); + if (!upd->update_user_data && upd->new_user_data) + return -EINVAL; + if (upd->update_events) + upd->events = io_poll_parse_events(sqe, flags); + else if (sqe->poll32_events) + return -EINVAL; + + return 0; +} + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->off || sqe->addr) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~IORING_POLL_ADD_MULTI) + return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; + + io_req_set_refcount(req); + poll->events = io_poll_parse_events(sqe, flags); + return 0; +} + +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + struct io_poll_table ipt; + int ret; + + ipt.pt._qproc = io_poll_queue_proc; + + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { + req_set_fail(req); + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *preq; + int ret2, ret = 0; + bool locked; + + spin_lock(&ctx->completion_lock); + preq = io_poll_find(ctx, true, &cd); + if (!preq || !io_poll_disarm(preq)) { + spin_unlock(&ctx->completion_lock); + ret = preq ? -EALREADY : -ENOENT; + goto out; + } + spin_unlock(&ctx->completion_lock); + + if (poll_update->update_events || poll_update->update_user_data) { + /* only mask one event flags, keep behavior flags */ + if (poll_update->update_events) { + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= poll_update->events & 0xffff; + poll->events |= IO_POLL_UNMASK; + } + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; + + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2 || ret2 == -EIOCBQUEUED) + goto out; + } + + req_set_fail(preq); + io_req_set_res(preq, -ECANCELED, 0); + locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &locked); +out: + if (ret < 0) { + req_set_fail(req); + return ret; + } + /* complete update request, we're done with it */ + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/poll.h b/io_uring/poll.h new file mode 100644 index 00000000000000..cc75c1567a84ae --- /dev/null +++ b/io_uring/poll.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +enum { + IO_APOLL_OK, + IO_APOLL_ABORTED, + IO_APOLL_READY +}; + +struct io_poll { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + struct wait_queue_entry wait; +}; + +struct async_poll { + struct io_poll poll; + struct io_poll *double_poll; +}; + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); +bool io_poll_remove_all(struct io_ring_ctx 
*ctx, struct task_struct *tsk, + bool cancel_all); From 7aaff708a768144ec6459f0a58301be1a6b982fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:36:47 -0600 Subject: [PATCH 050/172] io_uring: move cancelation into its own file This also helps cleanup the io_uring.h cancel parts, as we can make things static in the cancel.c file, mostly. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/cancel.c | 194 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 6 ++ io_uring/io_uring.c | 177 +--------------------------------------- io_uring/io_uring.h | 1 - io_uring/timeout.c | 1 + 6 files changed, 204 insertions(+), 178 deletions(-) create mode 100644 io_uring/cancel.c create mode 100644 io_uring/cancel.h diff --git a/io_uring/Makefile b/io_uring/Makefile index eb1b07a9951639..cfd61e6b7759f7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o + sqpoll.o fdinfo.o tctx.o poll.o \ + cancel.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c new file mode 100644 index 00000000000000..83cceb52d82d64 --- /dev/null +++ b/io_uring/cancel.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "tctx.h" +#include "poll.h" +#include "timeout.h" +#include "cancel.h" + +struct io_cancel { + struct file *file; + u64 addr; + u32 flags; + s32 fd; +}; + +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; + + if (req->ctx != cd->ctx) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; +} + +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) +{ + enum io_wq_cancel cancel_ret; + int ret = 0; + bool all; + + if (!tctx || !tctx->io_wq) + return -ENOENT; + + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); + switch (cancel_ret) { + case IO_WQ_CANCEL_OK: + ret = 0; + break; + case IO_WQ_CANCEL_RUNNING: + ret = -EALREADY; + break; + case IO_WQ_CANCEL_NOTFOUND: + ret = -ENOENT; + break; + } + + return ret; +} + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + + ret = io_async_cancel_one(req->task->io_uring, cd); + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. 
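+ *
+ * (-EALREADY means io-wq found the work already running; the poll and
+ * timeout cancel paths below may still be able to complete the cancel.)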
+ */ + if (!ret) + return 0; + + spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, cd); + if (ret != -ENOENT) + goto out; + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); +out: + spin_unlock(&ctx->completion_lock); + return ret; +} + + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sqe->off || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) + return -EINVAL; + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->fd = READ_ONCE(sqe->fd); + } + + return 0; +} + +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) +{ + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; + struct io_tctx_node *node; + int ret, nr = 0; + + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, issue_flags); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } + } + io_ring_submit_unlock(ctx, issue_flags); + return all ? nr : ret; +} + +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = cancel->addr, + .flags = cancel->flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, cancel->fd, + issue_flags); + else + req->file = io_file_get_normal(req, cancel->fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + + ret = __io_async_cancel(&cd, req, issue_flags); +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h new file mode 100644 index 00000000000000..4f35d86963253f --- /dev/null +++ b/io_uring/cancel.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b4ecfa7238555f..fb4f3ffa58c886 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -108,6 +108,7 @@ #include "msg_ring.h" #include "timeout.h" #include "poll.h" +#include "cancel.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -210,13 +211,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions! 
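* (Helpers such as io_kiocb_to_cmd() presumably rely on this shared
* first-member layout when casting the per-opcode command structs.)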
See also 'struct kiocb' in */ -struct io_cancel { - struct file *file; - u64 addr; - u32 flags; - s32 fd; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -3362,175 +3356,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static bool io_cancel_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; -} - -static int io_async_cancel_one(struct io_uring_task *tctx, - struct io_cancel_data *cd) -{ - enum io_wq_cancel cancel_ret; - int ret = 0; - bool all; - - if (!tctx || !tctx->io_wq) - return -ENOENT; - - all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); - switch (cancel_ret) { - case IO_WQ_CANCEL_OK: - ret = 0; - break; - case IO_WQ_CANCEL_RUNNING: - ret = -EALREADY; - break; - case IO_WQ_CANCEL_NOTFOUND: - ret = -ENOENT; - break; - } - - return ret; -} - -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - - ret = io_async_cancel_one(req->task->io_uring, cd); - /* - * Fall-through even for -EALREADY, as we may have poll armed - * that need unarming. 
- */ - if (!ret) - return 0; - - spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, cd); - if (ret != -ENOENT) - goto out; - if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) - ret = io_timeout_cancel(ctx, cd); -out: - spin_unlock(&ctx->completion_lock); - return ret; -} - -#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) - -static int io_async_cancel_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - cancel->addr = READ_ONCE(sqe->addr); - cancel->flags = READ_ONCE(sqe->cancel_flags); - if (cancel->flags & ~CANCEL_FLAGS) - return -EINVAL; - if (cancel->flags & IORING_ASYNC_CANCEL_FD) { - if (cancel->flags & IORING_ASYNC_CANCEL_ANY) - return -EINVAL; - cancel->fd = READ_ONCE(sqe->fd); - } - - return 0; -} - -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, - unsigned int issue_flags) -{ - bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - struct io_ring_ctx *ctx = cd->ctx; - struct io_tctx_node *node; - int ret, nr = 0; - - do { - ret = io_try_cancel(req, cd); - if (ret == -ENOENT) - break; - if (!all) - return ret; - nr++; - } while (1); - - /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, issue_flags); - ret = -ENOENT; - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); - if (ret != -ENOENT) { - if (!all) - break; - nr++; - } - } - io_ring_submit_unlock(ctx, issue_flags); - return all ? nr : ret; -} - -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = cancel->addr, - .flags = cancel->flags, - .seq = atomic_inc_return(&req->ctx->cancel_seq), - }; - int ret; - - if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, cancel->fd, - issue_flags); - else - req->file = io_file_get_normal(req, cancel->fd); - if (!req->file) { - ret = -EBADF; - goto done; - } - cd.file = req->file; - } - - ret = __io_async_cancel(&cd, req, issue_flags); -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1ceac4ea62bf3b..a78e3c5ab109b8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -157,7 +157,6 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5e42bfcd683e2a..69cca42d6835b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -11,6 +11,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "cancel.h" #include "timeout.h" struct io_timeout { From 3b77495a97239faa27989f946d29b6be7dd091e9 Mon Sep 17 
00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:07:23 -0600 Subject: [PATCH 051/172] io_uring: split provided buffers handling into its own file Move both the opcodes related to it, and the internals code dealing with it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 602 +------------------------------------------- io_uring/io_uring.h | 36 +-- io_uring/kbuf.c | 524 ++++++++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 142 +++++++++++ io_uring/net.c | 1 + io_uring/poll.c | 1 + 7 files changed, 672 insertions(+), 636 deletions(-) create mode 100644 io_uring/kbuf.c create mode 100644 io_uring/kbuf.h diff --git a/io_uring/Makefile b/io_uring/Makefile index cfd61e6b7759f7..b85418b64e8241 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o + cancel.o kbuf.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb4f3ffa58c886..e395167999edfb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" +#include "kbuf.h" #include "xattr.h" #include "nop.h" @@ -171,42 +172,10 @@ struct io_rsrc_data { bool quiesce; }; -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. - */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -#define BGID_ARRAY 64 - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -226,15 +195,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -399,110 +359,6 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. 
If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2296,96 +2152,6 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, req->imu); } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u16 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) - return NULL; - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED 
|| !file_can_poll(req->file)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). - */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -3098,258 +2864,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; 
- p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. - */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 
0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -5218,8 +4732,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; @@ -5543,33 +5056,6 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; @@ -6953,89 +6439,6 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return ret; } -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; - struct page **pages; - int nr_pages; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = 
io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return -ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -7701,7 +7104,6 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a78e3c5ab109b8..172defdcfdbe40 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -99,42 +99,8 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); -unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); -static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. 
- */ - if (req->opcode == IORING_OP_READV) - return; - - __io_kbuf_recycle(req, issue_flags); -} +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c new file mode 100644 index 00000000000000..bc58890d932b2f --- /dev/null +++ b/io_uring/kbuf.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" + +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) + +#define BGID_ARRAY 64 + +struct io_provide_buf { + struct file *file; + __u64 addr; + __u32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + +static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) +{ + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; + + return xa_load(&ctx->io_bl_xa, bgid); +} + +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + struct io_buffer *buf; + + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; + } + + io_ring_submit_lock(ctx, issue_flags); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->buf_index = buf->bgid; + + io_ring_submit_unlock(ctx, issue_flags); +} + +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) +{ + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; + + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); + if (*len > kbuf->len) + *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u16 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) + return NULL; + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; + buf = page_address(bl->buf_pages[index]); + buf += off; + } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; + + if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { + /* + * If we came in unlocked, 
we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); +} + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; +} + +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + + /* the head kbuf is the list itself */ + while (!list_empty(&bl->buf_list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&nxt->list); + if (++i == nbufs) + return i; + cond_resched(); + } + i++; + + return i; +} + +void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + struct io_buffer_list *bl; + unsigned long index; + int i; + + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } + + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); + } + + while (!list_empty(&ctx->io_buffers_pages)) { + struct page *page; + + page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); + list_del_init(&page->lru); + __free_page(page); + } +} + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + ret = -ENOENT; + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } + if (ret < 
0) + req_set_fail(req); + + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned long size, tmp_check; + struct io_provide_buf *p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, + &size)) + return -EOVERFLOW; + if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) + return -EOVERFLOW; + + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. + */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. + */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer_list *bl) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) + break; + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_move_tail(&buf->list, &bl->buf_list); + buf->addr = addr; + buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); + buf->bid = bid; + buf->bgid = pbuf->bgid; + addr += pbuf->len; + bid++; + cond_resched(); + } + + return i ? 
0 : -ENOMEM; +} + +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } + + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; + } + + ret = io_add_buffers(ctx, p, bl); +err: + if (ret < 0) + req_set_fail(req); + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + /* cannot disambiguate full vs empty due to head/tail size */ + if (reg.ring_entries >= 65536) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(free_bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h new file mode 100644 index 00000000000000..9da3a933ef40e1 --- /dev/null +++ b/io_uring/kbuf.h @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_KBUF_H +#define IOU_KBUF_H + +#include + +struct io_buffer_list { + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. 
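+ *
+ * (A given bgid is one kind or the other, never both:
+ * io_register_pbuf_ring() fails with -EEXIST if classic buffers
+ * already exist, and provide/remove reject ring-mapped groups.)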
+ */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; + __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u16 head; + __u16 mask; +}; + +struct io_buffer { + struct list_head list; + __u64 addr; + __u32 len; + __u16 bid; + __u16 bgid; +}; + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +void io_destroy_buffers(struct io_ring_ctx *ctx); + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +{ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); +} + +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ + lockdep_assert_held(&req->ctx->completion_lock); + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + unsigned int cflags; + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
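
On the completion side, __io_put_kbuf() above encodes the consumed buffer into the CQE as IORING_CQE_F_BUFFER plus the buffer ID shifted by IORING_CQE_BUFFER_SHIFT. The matching userspace decode, sketched against the uapi constants (consume() is a placeholder, not a real API):

    #include <liburing.h>

    void consume(unsigned int bid, int len);    /* placeholder */

    static void handle_cqe(struct io_uring_cqe *cqe)
    {
        if (cqe->flags & IORING_CQE_F_BUFFER) {
            unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

            /* cqe->res bytes landed in buffer 'bid' of the group
             * the sqe selected via sqe->buf_group */
            consume(bid, cqe->res);
        }
    }
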
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; +} +#endif diff --git a/io_uring/net.c b/io_uring/net.c index 2434548d0c1fb3..fe1fe920b9291f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "kbuf.h" #include "net.h" #if defined(CONFIG_NET) diff --git a/io_uring/poll.c b/io_uring/poll.c index c3e4fcb0a7ba7b..b80f7fa261232d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -17,6 +17,7 @@ #include "io_uring.h" #include "refs.h" #include "opdef.h" +#include "kbuf.h" #include "poll.h" struct io_poll_update { From 73572984481907d92673255b494c0ff4f77c8ed4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:12:45 -0600 Subject: [PATCH 052/172] io_uring: move rsrc related data, core, and commands Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1644 +++--------------------------------------- io_uring/io_uring.h | 6 +- io_uring/openclose.c | 1 + io_uring/rsrc.c | 1320 +++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 155 ++++ 6 files changed, 1595 insertions(+), 1533 deletions(-) create mode 100644 io_uring/rsrc.c create mode 100644 io_uring/rsrc.h diff --git a/io_uring/Makefile b/io_uring/Makefile index b85418b64e8241..360a83039c2a4f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o + cancel.o kbuf.o rsrc.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e395167999edfb..0c47c919887f52 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include @@ -94,6 +93,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" +#include "rsrc.h" #include "xattr.h" #include "nop.h" @@ -114,17 +114,9 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -140,38 +132,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn 
*do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -188,13 +148,6 @@ struct io_rw { rwf_t flags; }; -struct io_rsrc_update { - struct file *file; - u64 arg; - u32 nr_args; - u32 offset; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -208,11 +161,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -enum { - IORING_RSRC_FILE = 0, - IORING_RSRC_BUFFER = 1, -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -233,12 +181,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_dismantle_req(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -268,22 +212,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -#if defined(CONFIG_UNIX) -static inline bool io_file_need_scm(struct file *filp) -{ -#if defined(IO_URING_SCM_ALL) - return true; -#else - return !!unix_get_socket(filp); -#endif -} -#else -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} -#endif - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -298,67 +226,6 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } -#define IO_RSRC_REF_BATCH 100 - -static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - percpu_ref_put_many(&node->refs, nr); -} - -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } -} - -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); -} - -static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - -static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } - } -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2870,92 +2737,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_files_update_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & 
(REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - up->offset = READ_ONCE(sqe->off); - up->nr_args = READ_ONCE(sqe->len); - if (!up->nr_args) - return -EINVAL; - up->arg = READ_ONCE(sqe->addr); - return 0; -} - -static int io_files_update_with_index_alloc(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - __s32 __user *fds = u64_to_user_ptr(up->arg); - unsigned int done; - struct file *file; - int ret, fd; - - if (!req->ctx->file_data) - return -ENXIO; - - for (done = 0; done < up->nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { - ret = -EFAULT; - break; - } - - file = fget(fd); - if (!file) { - ret = -EBADF; - break; - } - ret = io_fixed_fd_install(req, issue_flags, file, - IORING_FILE_INDEX_ALLOC); - if (ret < 0) - break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); - ret = -EFAULT; - break; - } - } - - if (done) - return done; - return ret; -} - -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up2; - int ret; - - up2.offset = up->offset; - up2.data = up->arg; - up2.nr = 0; - up2.tags = 0; - up2.resv = 0; - up2.resv2 = 0; - - if (up->offset == IORING_FILE_INDEX_ALLOC) { - ret = io_files_update_with_index_alloc(req, issue_flags); - } else { - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up2, up->nr_args); - io_ring_submit_unlock(ctx, issue_flags); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -3696,7 +3477,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return -1; } -static int io_run_task_work_sig(void) +int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; @@ -3798,1265 +3579,164 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void io_free_page_table(void **table, size_t size) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; + if (file_slot->file_ptr) { + struct file *old_file; - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; } - return table; -} -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static void io_mem_free(void *ptr) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; + struct page *page; - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + if (!ptr) + return; - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + page = virt_to_head_page(ptr); + if (put_page_testzero(page)) + free_compound_page(page); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static void *io_mem_alloc(size_t size) { - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - 
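
rings_size() computes the shared-memory layout defensively: the CQE array first (doubled under IORING_SETUP_CQE32), cache-line aligned, then the SQ index array, returning SIZE_MAX on any overflow. A toy re-derivation of that pattern in plain C, using compiler builtins instead of the kernel's check_*_overflow() helpers — a sketch, not the kernel function:

    #include <stddef.h>
    #include <stdint.h>

    static size_t ring_bytes(size_t hdr, size_t cqe_sz,
                             unsigned int cq, unsigned int sq)
    {
        size_t off, sq_sz;

        if (__builtin_mul_overflow((size_t)cq, cqe_sz, &off) ||
            __builtin_add_overflow(off, hdr, &off))
            return SIZE_MAX;
        off = (off + 63) & ~(size_t)63;     /* cache-line align */
        if (off == 0)                       /* alignment wrapped */
            return SIZE_MAX;
        if (__builtin_mul_overflow((size_t)sq, sizeof(uint32_t), &sq_sz) ||
            __builtin_add_overflow(off, sq_sz, &off))
            return SIZE_MAX;
        return off;
    }
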
INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; + return (void *) __get_free_pages(gfp, get_order(size)); } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + struct io_rings *rings; + size_t off, sq_array_size; - io_rsrc_refs_drop(ctx); + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); + if (sq_offset) + *sq_offset = off; - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + return off; } -static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) { - int ret; - - /* As we may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - data->quiesce = true; - do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { - break; - } - } + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; - ret = io_run_task_work_sig(); - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); - data->quiesce = false; + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; - return ret; + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; } -static u64 
*io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +static void io_eventfd_put(struct rcu_head *rcu) { - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - return &data->tags[table_idx][off]; + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); } -static void io_rsrc_data_free(struct io_rsrc_data *data) +static int io_eventfd_unregister(struct io_ring_ctx *ctx) { - size_t size = data->nr * sizeof(data->tags[0][0]); + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + call_rcu(&ev_fd->rcu, io_eventfd_put); + return 0; + } - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); + return -ENXIO; } -static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = -ENOMEM; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->do_put = do_put; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - - atomic_set(&data->refs, 1); - init_completion(&data->done); - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; -} - -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if !defined(IO_URING_SCM_ALL) - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); - } -#endif - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif - io_free_file_tables(&ctx->file_table); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; -} - -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. 
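
io_get_tag_slot() treats data->tags as a two-level table: with 4K pages and 8-byte tags, IO_RSRC_TAG_TABLE_SHIFT is PAGE_SHIFT - 3 = 9, so each page holds 512 tags. A worked example of the indexing for resource index 1000:

    #include <stdio.h>

    #define TAG_TABLE_SHIFT 9   /* PAGE_SHIFT - 3 with 4K pages */
    #define TAG_TABLE_MASK  ((1U << TAG_TABLE_SHIFT) - 1)

    int main(void)
    {
        unsigned int idx = 1000;

        printf("page %u, slot %u\n",
               idx >> TAG_TABLE_SHIFT,      /* 1 */
               idx & TAG_TABLE_MASK);       /* 1000 - 512 = 488 */
        return 0;
    }

so the tag lives at tags[1][488], and each 4K page of the table covers 512 consecutive indices.
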
- */ -static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - if (!io_file_need_scm(file)) { - fput(file); - return; - } - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(file); -#endif -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -static void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = 
llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args, u64 __user *tags) -{ - __s32 __user *fds = (__s32 __user *) arg; - struct file *file; - int fd, ret; - unsigned i; - - if (ctx->file_data) - return -EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - if (nr_args > rlimit(RLIMIT_NOFILE)) - return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; - - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; - goto fail; - } - /* allow sparse sets */ - if (!fds || fd == -1) { - ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto fail; - continue; - } - - file = fget(fd); - ret = -EBADF; - if (unlikely(!file)) - goto fail; - - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. - */ - if (io_is_uring_fops(file)) { - fput(file); - goto fail; - } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - - io_rsrc_node_switch(ctx, NULL); - return 0; -fail: - __io_sqe_files_unregister(ctx); - return ret; -} - -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) -{ - u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - - prsrc->tag = *tag_slot; - *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; -} - -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - 
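
io_sqe_files_register() allows sparse sets: a slot passed as -1 stays empty (unless it carries a tag) and can be populated later. A rough liburing sketch of registering such a table and issuing I/O against a fixed slot — the filename is illustrative and the ring is assumed to be set up:

    #include <liburing.h>
    #include <fcntl.h>

    static char buf[512];

    static void use_fixed_file(struct io_uring *ring)
    {
        int fds[4] = { open("data", O_RDONLY), -1, -1, -1 }; /* sparse tail */
        struct io_uring_sqe *sqe;

        io_uring_register_files(ring, fds, 4);

        /* I/O now targets slot 0 of the table, not a regular fd */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
        sqe->flags |= IOSQE_FIXED_FILE;
        io_uring_submit(ring);
    }
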
io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; - struct file *file; - int fd, i, err = 0; - unsigned int done; - bool needs_switch = false; - - if (!ctx->file_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || - copy_from_user(&fd, &fds[done], sizeof(fd))) { - err = -EFAULT; - break; - } - if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { - err = -EINVAL; - break; - } - if (fd == IORING_REGISTER_FILES_SKIP) - continue; - - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); - if (err) - break; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; - } - if (fd != -1) { - file = fget(fd); - if (!file) { - err = -EBADF; - break; - } - /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. - */ - if (io_is_uring_fops(file)) { - fput(file); - err = -EBADF; - break; - } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); - return done ? 
done : err; -} - -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) -{ - unsigned long page_limit, cur_pages, new_pages; - - /* Don't allow more pages than we can safely lock */ - page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - do { - cur_pages = atomic_long_read(&user->locked_vm); - new_pages = cur_pages + nr_pages; - if (new_pages > page_limit) - return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - - return 0; -} - -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); - - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); -} - -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - int ret; - - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); - if (ret) - return ret; - } - - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - - return 0; -} - -static void io_mem_free(void *ptr) -{ - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); -} - -static void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - - return (void *) __get_free_pages(gfp, get_order(size)); -} - -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; - - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } - -#ifdef CONFIG_SMP - off = ALIGN(off, SMP_CACHE_BYTES); - if (off == 0) - return SIZE_MAX; -#endif - - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) -{ - struct io_mapped_ubuf *imu = *slot; - unsigned int i; - - if (imu != ctx->dummy_ubuf) { - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } - *slot = NULL; -} - -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
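
__io_account_mem() enforces RLIMIT_MEMLOCK with a lock-free read/compare-exchange loop rather than a lock. The same pattern in portable C11 atomics (a sketch; the kernel uses atomic_long_cmpxchg()):

    #include <stdatomic.h>
    #include <stdbool.h>

    static bool account_pages(atomic_long *locked, long limit, long nr)
    {
        long cur = atomic_load(locked);

        do {
            if (cur + nr > limit)
                return false;   /* would exceed RLIMIT_MEMLOCK */
        } while (!atomic_compare_exchange_weak(locked, &cur, cur + nr));

        return true;
    }

atomic_compare_exchange_weak() reloads cur on failure, so each retry re-checks the limit against the fresh count, exactly like the kernel loop.
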
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; -} - -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - -/* - * Not super efficient, but this is just a registration time. And we do cache - * the last compound head, so generally we'll only do a full search if we don't - * match that one. - * - * We check if the given compound head page has already been accounted, to - * avoid double accounting it. This allows us to account the full size of the - * page, not just the constituent pages of a huge page. - */ -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct page *hpage) -{ - int i, j; - - /* check current page array */ - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) - continue; - if (compound_head(pages[i]) == hpage) - return true; - } - - /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) { - if (!PageCompound(imu->bvec[j].bv_page)) - continue; - if (compound_head(imu->bvec[j].bv_page) == hpage) - return true; - } - } - - return false; -} - -static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct io_mapped_ubuf *imu, - struct page **last_hpage) -{ - int i, ret; - - imu->acct_pages = 0; - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) { - imu->acct_pages++; - } else { - struct page *hpage; - - hpage = compound_head(pages[i]); - if (hpage == *last_hpage) - continue; - *last_hpage = hpage; - if (headpage_already_acct(ctx, pages, i, hpage)) - continue; - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; - } - } - - if (!imu->acct_pages) - return 0; - - ret = io_account_mem(ctx, imu->acct_pages); - if (ret) - imu->acct_pages = 0; - return ret; -} - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; - struct page **pages = NULL; - int i, pret, ret = -ENOMEM; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto done; - - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - - ret = 0; - mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - *npages = nr_pages; - } else { - ret = pret < 0 ? 
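
io_pin_pages() converts an arbitrary user range into a page count by rounding the end up and the start down, so a buffer that straddles a page boundary pins both pages. A worked example for a 32-byte buffer at 0x1ff0 with 4K pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned long ubuf = 0x1ff0, len = 32;  /* crosses 0x2000 */
        unsigned long end = (ubuf + len + 4096 - 1) >> 12;  /* 3 */
        unsigned long start = ubuf >> 12;                   /* 1 */

        printf("%lu pages pinned\n", end - start);          /* 2 */
        return 0;
    }

FOLL_LONGTERM marks the pin as indefinite, which is also why file-backed VMAs (other than shmem and hugetlbfs) are rejected just below.
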
pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - kvfree(vmas); - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; -} - -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) -{ - struct io_mapped_ubuf *imu = NULL; - struct page **pages = NULL; - unsigned long off; - size_t size; - int ret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - *pimu = NULL; - ret = -ENOMEM; - - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, - &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - pages = NULL; - goto done; - } - - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); - goto done; - } - - off = (unsigned long) iov->iov_base & ~PAGE_MASK; - size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; - imu->nr_bvecs = nr_pages; - *pimu = imu; - ret = 0; -done: - if (ret) - kvfree(imu); - kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; -} - -static int io_buffer_validate(struct iovec *iov) -{ - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? 
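
The bvec table that io_sqe_buffer_register() builds from those pinned pages is what lets READ_FIXED/WRITE_FIXED skip per-I/O pinning entirely. The userspace half, sketched with liburing ('ring' and 'fd' are assumed to exist):

    #include <liburing.h>

    static char buf[65536];

    static void read_with_fixed_buffer(struct io_uring *ring, int fd)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct io_uring_sqe *sqe;

        io_uring_register_buffers(ring, &iov, 1);

        sqe = io_uring_get_sqe(ring);
        /* the final argument indexes the registered buffer table */
        io_uring_prep_read_fixed(sqe, fd, buf, sizeof(buf), 0, 0);
        io_uring_submit(ring);
    }

The I/O must stay inside the registered range, which is why imu->ubuf and imu->ubuf_end are stored for later verification.
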
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; -} - -static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args, u64 __user *tags) -{ - struct page *last_hpage = NULL; - struct io_rsrc_data *data; - int i, ret; - struct iovec iov; - - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) - return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); - if (ret) - return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; - } else { - memset(&iov, 0, sizeof(iov)); - } - - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; - } - - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) - break; - } - - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); - return ret; -} - -static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned int nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); - struct page *last_hpage = NULL; - bool needs_switch = false; - __u32 done; - int i, err; - - if (!ctx->buf_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; - int offset = up->offset + done; - u64 tag = 0; - - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) - break; - if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { - err = -EFAULT; - break; - } - err = io_buffer_validate(&iov); - if (err) - break; - if (!iov.iov_base && tag) { - err = -EINVAL; - break; - } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); - break; - } - ctx->user_bufs[i] = NULL; - needs_switch = true; - } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; - } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); - return done ? 
done : err; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -static void io_eventfd_put(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - call_rcu(&ev_fd->rcu, io_eventfd_put); - return 0; - } - - return -ENXIO; -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) +static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; int nr = 0; @@ -5078,12 +3758,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - static void io_flush_apoll_cache(struct io_ring_ctx *ctx) { struct async_poll *apoll; @@ -6228,89 +4902,6 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - __u32 tmp; - int err; - - if (check_add_overflow(up->offset, nr_args, &tmp)) - return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - - switch (type) { - case IORING_RSRC_FILE: - return __io_sqe_files_update(ctx, up, nr_args); - case IORING_RSRC_BUFFER: - return __io_sqe_buffers_update(ctx, up, nr_args); - } - return -EINVAL; -} - -static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update2 up; - - if (!nr_args) - return -EINVAL; - memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT; - if (up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); -} - -static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size, unsigned type) -{ - struct io_uring_rsrc_update2 up; - - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (!up.nr || up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, up.nr); -} - -static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size, unsigned int type) -{ - struct io_uring_rsrc_register rr; - - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; - - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; - if (!rr.nr || rr.resv2) - return -EINVAL; - if (rr.flags & 
~IORING_RSRC_REGISTER_SPARSE) - return -EINVAL; - - switch (type) { - case IORING_RSRC_FILE: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - } - return -EINVAL; -} - static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { @@ -7103,7 +5694,6 @@ static int __init io_uring_init(void) sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ - BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 172defdcfdbe40..090c17deba9db2 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline bool io_run_task_work(void) return false; } +int io_run_task_work_sig(void); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); @@ -110,11 +111,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fa35bd56a33086..1cbf3903097053 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -14,6 +14,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" #include "openclose.h" struct io_open { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c new file mode 100644 index 00000000000000..8c40b20659d40a --- /dev/null +++ b/io_uring/rsrc.c @@ -0,0 +1,1320 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" +#include "rsrc.h" + +struct io_rsrc_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage); + +#define IO_RSRC_REF_BATCH 100 + +/* only define max */ +#define IORING_MAX_FIXED_FILES (1U << 20) +#define IORING_MAX_REG_BUFFERS (1U << 14) + +void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} + +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can 
safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->user) + __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->mm_account) + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + int ret; + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->mm_account) + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + + return 0; +} + +static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ + unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + if (!iov->iov_base) + return iov->iov_len ? -EFAULT : 0; + if (!iov->iov_len) + return -EFAULT; + + /* arbitrary limit, but we need something */ + if (iov->iov_len > SZ_1G) + return -EFAULT; + + if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + return -EOVERFLOW; + + return 0; +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +{ + struct io_mapped_ubuf *imu = *slot; + unsigned int i; + + if (imu != ctx->dummy_ubuf) { + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + } + *slot = NULL; +} + +void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) +{ + struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_put *prsrc, *tmp; + + list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + list_del(&prsrc->list); + + if (prsrc->tag) { + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + + spin_lock(&ctx->completion_lock); + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); + } + + rsrc_data->do_put(ctx, prsrc); + kfree(prsrc); + } + + io_rsrc_node_destroy(ref_node); + if (atomic_dec_and_test(&rsrc_data->refs)) + complete(&rsrc_data->done); +} + +void io_rsrc_put_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + struct llist_node *node; + + ctx = container_of(work, struct io_ring_ctx, 
rsrc_put_work.work); + node = llist_del_all(&ctx->rsrc_put_llist); + + while (node) { + struct io_rsrc_node *ref_node; + struct llist_node *next = node->next; + + ref_node = llist_entry(node, struct io_rsrc_node, llist); + __io_rsrc_put_work(ref_node); + node = next; + } +} + +void io_wait_rsrc_data(struct io_rsrc_data *data) +{ + if (data && !atomic_dec_and_test(&data->refs)) + wait_for_completion(&data->done); +} + +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); +} + +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + unsigned long delay = HZ; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + /* if we are mid-quiesce then do not delay */ + if (node->rsrc_data->quiesce) + delay = 0; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(void) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) +{ + WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + + io_rsrc_refs_drop(ctx); + + if (data_to_kill) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_node; + + rsrc_node->rsrc_data = data_to_kill; + spin_lock_irq(&ctx->rsrc_ref_lock); + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); + spin_unlock_irq(&ctx->rsrc_ref_lock); + + atomic_inc(&data_to_kill->refs); + percpu_ref_kill(&rsrc_node->refs); + ctx->rsrc_node = NULL; + } + + if (!ctx->rsrc_node) { + ctx->rsrc_node = ctx->rsrc_backup_node; + ctx->rsrc_backup_node = NULL; + } +} + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (ctx->rsrc_backup_node) + return 0; + ctx->rsrc_backup_node = io_rsrc_node_alloc(); + return ctx->rsrc_backup_node ? 
0 : -ENOMEM; +} + +__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) +{ + int ret; + + /* As we may drop ->uring_lock, other task may have started quiesce */ + if (data->quiesce) + return -ENXIO; + + data->quiesce = true; + do { + ret = io_rsrc_node_switch_start(ctx); + if (ret) + break; + io_rsrc_node_switch(ctx, data); + + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + break; + mutex_unlock(&ctx->uring_lock); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); + if (!ret) { + mutex_lock(&ctx->uring_lock); + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } + } + + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); + reinit_completion(&data->done); + + ret = io_run_task_work_sig(); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; + + return ret; +} + +static void io_free_page_table(void **table, size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + + for (i = 0; i < nr_tables; i++) + kfree(table[i]); + kfree(table); +} + +static void io_rsrc_data_free(struct io_rsrc_data *data) +{ + size_t size = data->nr * sizeof(data->tags[0][0]); + + if (data->tags) + io_free_page_table((void **)data->tags, size); + kfree(data); +} + +static __cold void **io_alloc_page_table(size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + size_t init_size = size; + void **table; + + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); + if (!table) + return NULL; + + for (i = 0; i < nr_tables; i++) { + unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); + if (!table[i]) { + io_free_page_table(table, init_size); + return NULL; + } + size -= this_size; + } + return table; +} + +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, + rsrc_put_fn *do_put, u64 __user *utags, + unsigned nr, struct io_rsrc_data **pdata) +{ + struct io_rsrc_data *data; + int ret = -ENOMEM; + unsigned i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); + if (!data->tags) { + kfree(data); + return -ENOMEM; + } + + data->nr = nr; + data->ctx = ctx; + data->do_put = do_put; + if (utags) { + ret = -EFAULT; + for (i = 0; i < nr; i++) { + u64 *tag_slot = io_get_tag_slot(data, i); + + if (copy_from_user(tag_slot, &utags[i], + sizeof(*tag_slot))) + goto fail; + } + } + + atomic_set(&data->refs, 1); + init_completion(&data->done); + *pdata = data; + return 0; +fail: + io_rsrc_data_free(data); + return ret; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + __s32 __user *fds = u64_to_user_ptr(up->data); + struct io_rsrc_data *data = ctx->file_data; + struct io_fixed_file *file_slot; + struct file *file; + int fd, i, err = 0; + unsigned int done; + bool needs_switch = false; + + if (!ctx->file_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_files) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || + copy_from_user(&fd, &fds[done], 
sizeof(fd))) { + err = -EFAULT; + break; + } + if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { + err = -EINVAL; + break; + } + if (fd == IORING_REGISTER_FILES_SKIP) + continue; + + i = array_index_nospec(up->offset + done, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, i); + + if (file_slot->file_ptr) { + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + if (err) + break; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); + needs_switch = true; + } + if (fd != -1) { + file = fget(fd); + if (!file) { + err = -EBADF; + break; + } + /* + * Don't allow io_uring instances to be registered. If + * UNIX isn't enabled, then this causes a reference + * cycle and this instance can never get freed. If UNIX + * is enabled we'll handle it just fine, but there's + * still no point in allowing a ring fd as it doesn't + * support regular read/write anyway. + */ + if (io_is_uring_fops(file)) { + fput(file); + err = -EBADF; + break; + } + err = io_scm_file_account(ctx, file); + if (err) { + fput(file); + break; + } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + } + + if (needs_switch) + io_rsrc_node_switch(ctx, data); + return done ? done : err; +} + +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned int nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct page *last_hpage = NULL; + bool needs_switch = false; + __u32 done; + int i, err; + + if (!ctx->buf_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_bufs) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + struct io_mapped_ubuf *imu; + int offset = up->offset + done; + u64 tag = 0; + + err = io_copy_iov(ctx, &iov, iovs, done); + if (err) + break; + if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { + err = -EFAULT; + break; + } + err = io_buffer_validate(&iov); + if (err) + break; + if (!iov.iov_base && tag) { + err = -EINVAL; + break; + } + err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + if (err) + break; + + i = array_index_nospec(offset, ctx->nr_user_bufs); + if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + err = io_queue_rsrc_removal(ctx->buf_data, i, + ctx->rsrc_node, ctx->user_bufs[i]); + if (unlikely(err)) { + io_buffer_unmap(ctx, &imu); + break; + } + ctx->user_bufs[i] = NULL; + needs_switch = true; + } + + ctx->user_bufs[i] = imu; + *io_get_tag_slot(ctx->buf_data, offset) = tag; + } + + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->buf_data); + return done ? 
done : err; +} + +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + __u32 tmp; + int err; + + if (check_add_overflow(up->offset, nr_args, &tmp)) + return -EOVERFLOW; + err = io_rsrc_node_switch_start(ctx); + if (err) + return err; + + switch (type) { + case IORING_RSRC_FILE: + return __io_sqe_files_update(ctx, up, nr_args); + case IORING_RSRC_BUFFER: + return __io_sqe_buffers_update(ctx, up, nr_args); + } + return -EINVAL; +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update2 up; + + if (!nr_args) + return -EINVAL; + memset(&up, 0, sizeof(up)); + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); +} + +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type) +{ + struct io_uring_rsrc_update2 up; + + if (size != sizeof(up)) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (!up.nr || up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, type, &up, up.nr); +} + +__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type) +{ + struct io_uring_rsrc_register rr; + + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + + memset(&rr, 0, sizeof(rr)); + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) + return -EINVAL; + + switch (type) { + case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + } + return -EINVAL; +} + +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) + return -EINVAL; + up->arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); + unsigned int done; + struct file *file; + int ret, fd; + + if (!req->ctx->file_data) + return -ENXIO; + + for (done = 0; done < up->nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + __io_close_fixed(req, issue_flags, ret); + ret = -EFAULT; + break; + } + } + + if (done) + return done; + return ret; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct 
io_uring_rsrc_update2 up2; + int ret; + + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; + + if (up->offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up2, up->nr_args); + io_ring_submit_unlock(ctx, issue_flags); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) +{ + u64 *tag_slot = io_get_tag_slot(data, idx); + struct io_rsrc_put *prsrc; + + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + + prsrc->tag = *tag_slot; + *tag_slot = 0; + prsrc->rsrc = rsrc; + list_add(&prsrc->list, &node->rsrc_list); + return 0; +} + +void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(&ctx->file_table, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#endif + io_free_file_tables(&ctx->file_table); + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; +} + +int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_files; + int ret; + + if (!ctx->file_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. + */ + ctx->nr_user_files = 0; + ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + ctx->nr_user_files = nr; + if (!ret) + __io_sqe_files_unregister(ctx); + return ret; +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. + */ +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; + struct scm_fp_list *fpl; + struct sk_buff *skb; + + if (likely(!io_file_need_scm(file))) + return 0; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. 
+ */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); + + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + fpl->user = get_uid(current_user()); + fpl->max = SCM_MAX_FD; + fpl->count = 0; + + UNIXCB(skb).fp = fpl; + skb->sk = sk; + skb->destructor = unix_destruct_scm; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + } + + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif + return 0; +} + +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + if (!io_file_need_scm(file)) { + fput(file); + return; + } + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags) +{ + __s32 __user *fds = (__s32 __user *) arg; + struct file *file; + int fd, ret; + unsigned i; + + if (ctx->file_data) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + &ctx->file_data); + if (ret) + return ret; + + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto fail; + } + /* allow sparse sets */ + if (!fds || fd == -1) { + ret = -EINVAL; + if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + goto fail; + continue; + } + + file = fget(fd); + ret = -EBADF; + if (unlikely(!file)) + goto fail; + + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. 
+ */ + if (io_is_uring_fops(file)) { + fput(file); + goto fail; + } + ret = io_scm_file_account(ctx, file); + if (ret) { + fput(file); + goto fail; + } + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + + io_rsrc_node_switch(ctx, NULL); + return 0; +fail: + __io_sqe_files_unregister(ctx); + return ret; +} + +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + io_buffer_unmap(ctx, &prsrc->buf); + prsrc->buf = NULL; +} + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned int i; + + for (i = 0; i < ctx->nr_user_bufs; i++) + io_buffer_unmap(ctx, &ctx->user_bufs[i]); + kfree(ctx->user_bufs); + io_rsrc_data_free(ctx->buf_data); + ctx->user_bufs = NULL; + ctx->buf_data = NULL; + ctx->nr_user_bufs = 0; +} + +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_bufs; + int ret; + + if (!ctx->buf_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. + */ + ctx->nr_user_bufs = 0; + ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); + ctx->nr_user_bufs = nr; + if (!ret) + __io_sqe_buffers_unregister(ctx); + return ret; +} + +/* + * Not super efficient, but this is just a registration time. And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. + * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. This allows us to account the full size of the + * page, not just the constituent pages of a huge page. + */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + imu->acct_pages = 0; + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages); + if (ret) + imu->acct_pages = 0; + return ret; +} + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, pret, ret = -ENOMEM; + + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto done; + + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + goto done; + + ret = 
0; + mmap_read_lock(current->mm); + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) + continue; + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + *npages = nr_pages; + } else { + ret = pret < 0 ? pret : -EFAULT; + } + mmap_read_unlock(current->mm); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) + unpin_user_pages(pages, pret); + goto done; + } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); + if (ret) { + unpin_user_pages(pages, nr_pages); + goto done; + } + + off = (unsigned long) iov->iov_base & ~PAGE_MASK; + size = iov->iov_len; + for (i = 0; i < nr_pages; i++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[i].bv_page = pages[i]; + imu->bvec[i].bv_len = vec_len; + imu->bvec[i].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->nr_bvecs = nr_pages; + *pimu = imu; + ret = 0; +done: + if (ret) + kvfree(imu); + kvfree(pages); + return ret; +} + +static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) +{ + ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); + return ctx->user_bufs ? 
0 : -ENOMEM; +} + +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags) +{ + struct page *last_hpage = NULL; + struct io_rsrc_data *data; + int i, ret; + struct iovec iov; + + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + if (ret) + return ret; + ret = io_buffers_map_alloc(ctx, nr_args); + if (ret) { + io_rsrc_data_free(data); + return ret; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + + if (!iov.iov_base && *io_get_tag_slot(data, i)) { + ret = -EINVAL; + break; + } + + ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + &last_hpage); + if (ret) + break; + } + + WARN_ON_ONCE(ctx->buf_data); + + ctx->buf_data = data; + if (ret) + __io_sqe_buffers_unregister(ctx); + else + io_rsrc_node_switch(ctx, NULL); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h new file mode 100644 index 00000000000000..872c86312cbc2d --- /dev/null +++ b/io_uring/rsrc.h @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_RSRC_H +#define IOU_RSRC_H + +#include + +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) +#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) +#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) + +enum { + IORING_RSRC_FILE = 0, + IORING_RSRC_BUFFER = 1, +}; + +struct io_rsrc_put { + struct list_head list; + u64 tag; + union { + void *rsrc; + struct file *file; + struct io_mapped_ubuf *buf; + }; +}; + +typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); + +struct io_rsrc_data { + struct io_ring_ctx *ctx; + + u64 **tags; + unsigned int nr; + rsrc_put_fn *do_put; + atomic_t refs; + struct completion done; + bool quiesce; +}; + +struct io_rsrc_node { + struct percpu_ref refs; + struct list_head node; + struct list_head rsrc_list; + struct io_rsrc_data *rsrc_data; + struct llist_node llist; + bool done; +}; + +void io_rsrc_put_work(struct work_struct *work); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_wait_rsrc_data(struct io_rsrc_data *data); +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_refs_drop(struct io_ring_ctx *ctx); +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); + + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags); +void __io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags); + +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); + +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} 
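For readers mapping this to userspace: io_sqe_buffers_register() above is the kernel side of IORING_REGISTER_BUFFERS. A minimal liburing sketch (editor's illustration, not part of this patch; assumes liburing is available) of driving it:

#include <liburing.h>
#include <sys/uio.h>

/*
 * Register two fixed buffers; afterwards they are usable with
 * IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED via their buf_index.
 */
static int register_two_buffers(struct io_uring *ring)
{
        static char buf0[4096], buf1[8192];
        struct iovec iovs[2] = {
                { .iov_base = buf0, .iov_len = sizeof(buf0) },
                { .iov_base = buf1, .iov_len = sizeof(buf1) },
        };

        /* pins the pages and charges them against RLIMIT_MEMLOCK */
        return io_uring_register_buffers(ring, iovs, 2);
}

io_uring_register_buffers() returns 0 on success or a negative errno, matching the return convention of the kernel-side registration above.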
+#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static inline int io_scm_file_account(struct io_ring_ctx *ctx, + struct file *file) +{ + if (likely(!io_file_need_scm(file))) + return 0; + return __io_scm_file_account(ctx, file); +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args); +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type); +int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type); + +static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + +static inline void io_req_put_rsrc(struct io_kiocb *req) +{ + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); +} + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_rsrc_node *node = req->rsrc_node; + + if (node) { + if (node == ctx->rsrc_node) + ctx->rsrc_cached_refs++; + else + io_rsrc_put_node(node, 1); + } +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned int issue_flags) +{ + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; + + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + lockdep_assert_held(&ctx->uring_lock); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); + } else { + percpu_ref_get(&req->rsrc_node->refs); + } + } +} + +static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +{ + unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; + unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + + return &data->tags[table_idx][off]; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags); +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +#endif From c98817e6cd4471a6f6283813dd6efea162f5ac5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 May 2022 09:44:31 -0600 Subject: [PATCH 053/172] io_uring: move remaining file table manipulation to filetable.c Signed-off-by: Jens Axboe --- io_uring/filetable.c | 87 +++++++++++++++++++++++++++++++++++++++++++- io_uring/filetable.h | 5 ++- io_uring/io_uring.c | 82 ----------------------------------------- io_uring/io_uring.h | 4 -- 4 files changed, 90 insertions(+), 88 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 560629a93c04b0..e449ceb9a848e6 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -4,14 +4,17 @@ #include #include #include +#include #include #include #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" -int io_file_bitmap_get(struct io_ring_ctx *ctx) +static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; unsigned long nr = ctx->nr_user_files; @@ -55,3 +58,85 @@ void io_free_file_tables(struct io_file_table *table) table->files = NULL; table->bitmap = NULL; } + +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; + + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; + + slot_index = array_index_nospec(slot_index, 
ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + + if (file_slot->file_ptr) { + struct file *old_file; + + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; + + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; + } + + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; +} + +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + + if (alloc_slot) { + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) + goto err; + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (!ret && alloc_slot) + ret = file_slot; +err: + io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) + fput(file); + return ret; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6e1675f406b729..c404360f709053 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,6 +3,7 @@ #define IOU_FILE_TABLE_H struct io_ring_ctx; +struct io_kiocb; /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -34,7 +35,9 @@ struct io_file_table { bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); -int io_file_bitmap_get(struct io_ring_ctx *ctx); + +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0c47c919887f52..c0f1f79933ac2a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2699,38 +2699,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. - */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) -{ - bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - - if (alloc_slot) { - ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) - goto err; - file_slot = ret; - } else { - file_slot--; - } - - ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (!ret && alloc_slot) - ret = file_slot; -err: - io_ring_submit_unlock(ctx, issue_flags); - if (unlikely(ret < 0)) - fput(file); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -3579,56 +3547,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - static void io_mem_free(void *ptr) { struct page *page; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 090c17deba9db2..71afb46070e364 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -106,10 +106,6 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot); -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); From f3b44f92e59a804cf375479bda0bccbf4b6e6ef6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:27:03 -0600 Subject: [PATCH 054/172] io_uring: move read/write related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1239 +------------------------------------------ io_uring/io_uring.h | 131 +++++ io_uring/rw.c | 1099 ++++++++++++++++++++++++++++++++++++++ io_uring/rw.h | 23 + 5 files changed, 1263 insertions(+), 1231 deletions(-) create mode 100644 io_uring/rw.c create mode 100644 io_uring/rw.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 360a83039c2a4f..d70deed65a0bb2 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o + cancel.o kbuf.o rsrc.o rw.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c0f1f79933ac2a..0af61a6c29cfee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -67,13 +65,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include #include @@ -110,6 +105,7 @@ #include "timeout.h" #include "poll.h" 
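A usage sketch for the slot-allocation path consolidated in filetable.c above (editor's illustration, not part of this series; assumes liburing 2.2+ and a previously registered, possibly sparse, file table): a direct-descriptor accept with IORING_FILE_INDEX_ALLOC exercises io_file_bitmap_get() followed by io_install_fixed_file().

#include <liburing.h>

/* accept into a kernel-chosen fixed-file slot; needs a registered file
 * table first, e.g. io_uring_register_files_sparse(ring, 16) */
static int accept_into_free_slot(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        if (!sqe)
                return -EBUSY;
        io_uring_prep_accept_direct(sqe, listen_fd, NULL, NULL, 0,
                                    IORING_FILE_INDEX_ALLOC);
        io_uring_submit(ring);

        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret)
                return ret;
        ret = cqe->res;         /* on success: the allocated slot index */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}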
#include "cancel.h" +#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -136,31 +132,6 @@ #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -/* - * First field must be the file pointer in all the - * iocb unions! See also 'struct kiocb' in - */ -struct io_rw { - /* NOTE: kiocb has the file as the first member, so don't do it here */ - struct kiocb kiocb; - u64 addr; - u32 len; - rwf_t flags; -}; - -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - -struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; - struct wait_page_queue wpq; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -184,9 +155,7 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static int io_req_prep_async(struct io_kiocb *req); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -393,11 +362,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -489,7 +453,7 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -532,7 +496,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { spin_lock(&ctx->completion_lock); @@ -547,60 +511,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) -{ - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - if (ctx->flags & IORING_SETUP_CQE32) { - 
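The CQE lookup being removed from io_uring.c here is compact enough to restate standalone. An editor's sketch with hypothetical names (not kernel code), assuming cq_entries is a power of two, as ring setup guarantees:

struct io_uring_cqe;

/* restates the indexing math of __io_get_cqe()/io_get_cqe() */
static inline struct io_uring_cqe *cqe_slot(struct io_uring_cqe *cqes,
                                            unsigned int tail,
                                            unsigned int cq_entries,
                                            int big_cqe)
{
        unsigned int off = tail & (cq_entries - 1);   /* power-of-2 wrap */
        unsigned int shift = big_cqe ? 1 : 0;         /* CQE32: two slots */

        return &cqes[off << shift];
}

With IORING_SETUP_CQE32 each logical completion occupies two struct io_uring_cqe entries, which is why the masked offset is shifted into a physical slot index rather than the mask itself being widened.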
unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - return cqe; - } - - return __io_get_cqe(ctx); -} - static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; @@ -628,17 +538,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) rcu_read_unlock(); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. - */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); -} - /* * This should only get called when at least one event has been posted. * Some applications rely on the eventfd notification count only changing @@ -655,16 +554,6 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -775,9 +664,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, - u64 extra2) +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -814,59 +702,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - 0, 0); - } else { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - extra1, extra2); - } -} - bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -1269,7 +1104,7 @@ void io_req_task_work_add(struct io_kiocb *req) __io_req_task_work_add(req, tctx, &tctx->task_list); } -static void io_req_task_prio_work_add(struct io_kiocb *req) +void io_req_task_prio_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -1315,18 +1150,12 @@ void io_req_task_queue_fail(struct io_kiocb *req, int ret) io_req_task_work_add(req); } -static void io_req_task_queue(struct io_kiocb *req) +void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -1335,8 +1164,7 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_batch_list(struct io_ring_ctx *ctx, - struct io_wq_work_node *node) +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { struct task_struct *task = NULL; @@ -1435,76 +1263,6 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) -{ - struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; - DEFINE_IO_COMP_BATCH(iob); - int nr_events = 0; - - /* - * Only spin for completions if we don't have multiple devices hanging - * off our complete list. - */ - if (ctx->poll_multi_queue || force_nonspin) - poll_flags |= BLK_POLL_ONESHOT; - - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct io_rw *rw = io_kiocb_to_cmd(req); - int ret; - - /* - * Move completed and retryable entries to our local lists. - * If we find a request that requires polling, break out - * and complete those lists first, if we have entries there. - */ - if (READ_ONCE(req->iopoll_completed)) - break; - - ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); - if (unlikely(ret < 0)) - return ret; - else if (ret) - poll_flags |= BLK_POLL_ONESHOT; - - /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || - READ_ONCE(req->iopoll_completed)) - break; - } - - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - - /* order with io_complete_rw_iopoll(), e.g. ->result updates */ - if (!smp_load_acquire(&req->iopoll_completed)) - break; - nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - - req->cqe.flags = io_put_kbuf(req, 0); - __io_fill_cqe_req(req->ctx, req); - } - - if (unlikely(!nr_events)) - return 0; - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - pos = start ? 
start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); - return nr_events; -} - /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. @@ -1589,90 +1347,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } - -static void kiocb_end_write(struct io_kiocb *req) -{ - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; - - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); - } -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; -} - -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - umode_t mode = file_inode(req->file)->i_mode; - struct io_ring_ctx *ctx = req->ctx; - - if (!S_ISBLK(mode) && !S_ISREG(mode)) - return false; - if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) - return false; - /* - * If ref is dying, we might be running poll reap from the exit work. - * Don't attempt to reissue from that path, just let it fail with - * -EAGAIN. - */ - if (percpu_ref_is_dying(&ctx->refs)) - return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; - return true; -} -#else -static bool io_resubmit_prep(struct io_kiocb *req) -{ - return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} -#endif - -static bool __io_complete_rw_common(struct io_kiocb *req, long res) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return true; - } - req_set_fail(req); - req->cqe.res = res; - } - return false; -} - inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { @@ -1685,46 +1359,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked) } } -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); - __io_req_complete(req, issue_flags); -} - -static void io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, res, 0); - req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); -} - -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return; - } - 
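The smp_store_release() that ends this function pairs with the smp_load_acquire() on ->iopoll_completed in io_do_iopoll() above: the acquire/release pair guarantees the result fields are visible once the flag reads as set. An editor's illustration of the same contract in portable C11 (hypothetical names, not kernel code):

#include <stdatomic.h>

struct completion {
        long result;
        atomic_int done;        /* starts at 0 */
};

/* completion side, cf. io_complete_rw_iopoll() */
static void publish(struct completion *c, long res)
{
        c->result = res;
        atomic_store_explicit(&c->done, 1, memory_order_release);
}

/* reaping side, cf. the smp_load_acquire() in io_do_iopoll() */
static int try_reap(struct completion *c, long *res)
{
        if (!atomic_load_explicit(&c->done, memory_order_acquire))
                return 0;
        *res = c->result;       /* ordered after the acquire load */
        return 1;
}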
req->cqe.res = res; - } - - /* order with io_iopoll_complete() checking ->iopoll_completed */ - smp_store_release(&req->iopoll_completed, 1); -} - /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't @@ -1833,426 +1467,6 @@ unsigned int io_file_get_flags(struct file *file) return res; } -static inline bool io_file_supports_nowait(struct io_kiocb *req) -{ - return req->flags & REQ_F_SUPPORT_NOWAIT; -} - -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - unsigned ioprio; - int ret; - - rw->kiocb.ki_pos = READ_ONCE(sqe->off); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); - - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } - - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - rw->kiocb.ki_ioprio = ioprio; - } else { - rw->kiocb.ki_ioprio = get_current_ioprio(); - } - - rw->addr = READ_ONCE(sqe->addr); - rw->len = READ_ONCE(sqe->len); - rw->flags = READ_ONCE(sqe->rw_flags); - return 0; -} - -static void io_readv_writev_cleanup(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); -} - -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. 
- */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - -static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_pos != -1) - return &rw->kiocb.ki_pos; - - if (!(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - rw->kiocb.ki_pos = req->file->f_pos; - return &rw->kiocb.ki_pos; - } - - rw->kiocb.ki_pos = 0; - return NULL; -} - -static void kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) -{ - struct io_async_rw *io = req->async_data; - struct io_rw *rw = io_kiocb_to_cmd(req); - - /* add previously done IO, if any */ - if (req_has_async_data(req) && io->bytes_done > 0) { - if (ret < 0) - ret = io->bytes_done; - else - ret += io->bytes_done; - } - - if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else - io_rw_done(&rw->kiocb, ret); - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, ret); - } -} - -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; - size_t offset; - - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. 
- */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - -#ifdef CONFIG_COMPAT -static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_ssize_t clen; - void __user *buf; - size_t len; - - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - - len = clen; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = iov[0].iov_len = (compat_size_t) len; - return 0; -} -#endif - -static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iovec __user *uiov = u64_to_user_ptr(rw->addr); - void __user *buf; - ssize_t len; - - if (copy_from_user(iov, uiov, sizeof(*uiov))) - return -EFAULT; - - len = iov[0].iov_len; - if (len < 0) - return -EINVAL; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = iov[0].iov_len = len; - return 0; -} - -static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(rw->addr); - iov[0].iov_len = rw->len; - return 0; - } - if (rw->len != 1) - return -EINVAL; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return io_compat_import(req, iov, issue_flags); -#endif - - return __io_iov_buffer_select(req, iov, issue_flags); -} - -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, iovec, issue_flags); - if (ret) - return ERR_PTR(ret); - iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); - return NULL; - } 
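To make the fast-forward arithmetic above concrete (editor's sketch, hypothetical names, assuming 4 KiB pages): only the first and last bvec of a registered buffer can be short, so the skip count is computable in O(1) instead of walking segments.

#include <assert.h>

#define PG_SHIFT        12
#define PG_SIZE         (1ul << PG_SHIFT)

/* mirrors the else-branch of __io_import_fixed() above */
static unsigned long fixed_seg_skip(unsigned long offset,
                                    unsigned long first_len,
                                    unsigned long *resid)
{
        assert(offset > first_len);     /* else iov_iter_advance() is used */
        offset -= first_len;            /* consume the short first bvec */
        *resid = offset & (PG_SIZE - 1);
        return 1 + (offset >> PG_SHIFT);
}

For example, first_len = 2048 and offset = 10240 gives a skip of 3 segments with resid = 0: bvec 0 (2048 bytes) plus two full pages are stepped over in one go, landing exactly on bvec[3].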
- - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - -static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) -{ - return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; -} - -/* - * For files that don't have ->read_iter() and ->write_iter(), handle them - * by looping over ->read() or ->write() manually. - */ -static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) -{ - struct kiocb *kiocb = &rw->kiocb; - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; - loff_t *ppos; - - /* - * Don't support polled IO through this interface, and we can't - * support non-blocking either. For the latter, this just causes - * the kiocb to be handled from an async context. - */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EOPNOTSUPP; - if ((kiocb->ki_flags & IOCB_NOWAIT) && - !(kiocb->ki_filp->f_flags & O_NONBLOCK)) - return -EAGAIN; - - ppos = io_kiocb_ppos(kiocb); - - while (iov_iter_count(iter)) { - struct iovec iovec; - ssize_t nr; - - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); - } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; - } - - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (!iov_iter_is_bvec(iter)) { - iov_iter_advance(iter, nr); - } else { - rw->addr += nr; - rw->len -= nr; - if (!rw->len) - break; - } - if (nr != iovec.iov_len) - break; - } - - return ret; -} - -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); @@ -2264,448 +1478,13 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_op_defs[req->opcode].prep_async) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct 
io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, READ); -} - -static int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, WRITE); -} - -/* - * This is our waitqueue callback handler, registered through __folio_lock_async() - * when we initially tried to do the IO with the iocb armed our waitqueue. - * This gets called when the page is unlocked, and we generally expect that to - * happen when the page IO is completed and the page is now uptodate. This will - * queue a task_work based retry of the operation, attempting to copy the data - * again. If the latter fails because the page was NOT uptodate, then we will - * do a thread based blocking retry of the operation. That's the unexpected - * slow path. - */ -static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, - int sync, void *arg) -{ - struct wait_page_queue *wpq; - struct io_kiocb *req = wait->private; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct wait_page_key *key = arg; - - wpq = container_of(wait, struct wait_page_queue, wait); - - if (!wake_page_match(wpq, key)) - return 0; - - rw->kiocb.ki_flags &= ~IOCB_WAITQ; - list_del_init(&wait->entry); - io_req_task_queue(req); - return 1; -} - -/* - * This controls whether a given IO request should be armed for async page - * based retry. If we return false here, the request is handed to the async - * worker threads for retry. If we're doing buffered reads on a regular file, - * we prepare a private wait_page_queue entry and retry the operation. This - * will either succeed because the page is now uptodate and unlocked, or it - * will register a callback when the page is unlocked at IO completion. Through - * that callback, io_uring uses task_work to setup a retry of the operation. - * That retry will attempt the buffered read again. The retry will generally - * succeed, or in rare cases where it fails, we then fall back to using the - * async worker threads for a blocking retry. 
- */ -static bool io_rw_should_retry(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - struct wait_page_queue *wait = &io->wpq; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) - return false; - - /* Only for buffered IO */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) - return false; - - /* - * just use poll if we can, and don't attempt if the fs doesn't - * support callback based unlocks - */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) - return false; - - wait->wait.func = io_async_buf_func; - wait->wait.private = req; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_flags &= ~IOCB_NOWAIT; - kiocb->ki_waitq = wait; - return true; -} - -static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) -{ - struct file *file = rw->kiocb.ki_filp; - - if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); - else if (file->f_op->read) - return loop_rw_iter(READ, rw, iter); - else - return -EINVAL; -} - -static bool need_read_all(struct io_kiocb *req) -{ - return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); -} - -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (unlikely(!file || !(file->f_mode & mode))) - return -EBADF; - - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, rw->flags); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->private = NULL; - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } - - return 0; -} - -static int io_read(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. 
- */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - - ret = io_iter_do_read(rw, &s->iter); - - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) - return -EAGAIN; - /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (req->flags & REQ_F_NOWAIT) - goto done; - ret = 0; - } else if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { - /* read all, failed, already did sync or don't want to retry */ - goto done; - } - - /* - * Don't depend on the iter state matching what was consumed, or being - * untouched in case of error. Restore it and we'll advance it - * manually if we need to. - */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - if (ret2) - return ret2; - - iovec = NULL; - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ - - do { - /* - * We end up here because of a partial read, either from - * above or inside this loop. Advance the iter by the bytes - * that were consumed. - */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) - break; - io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); - - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If - * we get -EIOCBQUEUED, then we'll get a notification when the - * desired page gets unlocked. We can also get a partial read - * here, and if we do, then just retry at the new offset. - */ - ret = io_iter_do_read(rw, &s->iter); - if (ret == -EIOCBQUEUED) - return IOU_ISSUE_SKIP_COMPLETE; - /* we got some bytes, but not all. retry. 
*/ - kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); - } while (ret > 0); -done: - kiocb_done(req, ret, issue_flags); -out_free: - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_write(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; - - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; - - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); - else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); - else - ret2 = -EINVAL; - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. 
- */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) - goto done; - if (!force_nonblock || ret2 != -EAGAIN) { - /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; -done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; - } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; - } -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { return -EOPNOTSUPP; } -static int io_req_prep_async(struct io_kiocb *req) +int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 71afb46070e364..22e6e52c42d261 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,11 +5,125 @@ #include #include "io_uring_types.h" +#ifndef CREATE_TRACE_POINTS +#include +#endif + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2); + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); +} + +static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + if (!(ctx->flags & IORING_SETUP_CQE32)) { + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
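+	 * Overflowed entries are stashed on ctx->cq_overflow_list and
+	 * flushed back into the ring once userspace frees up room.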
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + 0, 0); + } else { + u64 extra1 = 0, extra2 = 0; + + if (req->flags & REQ_F_CQE32_INIT) { + extra1 = req->extra1; + extra2 = req->extra2; + } + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + extra1, extra2); + } +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -64,6 +178,17 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void io_cqring_wake(struct io_ring_ctx *ctx) +{ + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -100,6 +225,7 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -110,7 +236,10 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_queue(struct io_kiocb *req); +void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); @@ -122,6 +251,8 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); diff --git a/io_uring/rw.c b/io_uring/rw.c new file mode 100644 index 00000000000000..f0b60199ee2279 --- /dev/null +++ b/io_uring/rw.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" +#include "rsrc.h" +#include "rw.h" + +struct io_rw { + /* NOTE: kiocb has the file as the first member, 
so don't do it here */ + struct kiocb kiocb; + u64 addr; + u32 len; + rwf_t flags; +}; + +static inline bool io_file_supports_nowait(struct io_kiocb *req) +{ + return req->flags & REQ_F_SUPPORT_NOWAIT; +} + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + unsigned ioprio; + int ret; + + rw->kiocb.ki_pos = READ_ONCE(sqe->off); + /* used for fixed read/write too - just read unconditionally */ + req->buf_index = READ_ONCE(sqe->buf_index); + + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + rw->kiocb.ki_ioprio = ioprio; + } else { + rw->kiocb.ki_ioprio = get_current_ioprio(); + } + + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); + return 0; +} + +void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + kiocb->ki_complete(kiocb, ret); + } +} + +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; + } + + rw->kiocb.ki_pos = 0; + return NULL; +} + +static void io_req_task_queue_reissue(struct io_kiocb *req) +{ + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + if (!req_has_async_data(req)) + return !io_req_prep_async(req); + iov_iter_restore(&io->s.iter, &io->s.iter_state); + return true; +} + +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + umode_t mode = file_inode(req->file)->i_mode; + struct io_ring_ctx *ctx = req->ctx; + + if (!S_ISBLK(mode) && !S_ISREG(mode)) + return false; + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && + !(ctx->flags & IORING_SETUP_IOPOLL))) + return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. + */ + if (percpu_ref_is_dying(&ctx->refs)) + return false; + /* + * Play it safe and assume not safe to re-import and reissue if we're + * not in the original thread group (or in task context). 
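+	 * Re-import dereferences user memory, which is only valid from a
+	 * task that shares the submitter's mm.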
+ */ + if (!same_thread_group(req->task, current) || !in_task()) + return false; + return true; +} +#else +static bool io_resubmit_prep(struct io_kiocb *req) +{ + return false; +} +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + return false; +} +#endif + +static void kiocb_end_write(struct io_kiocb *req) +{ + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct super_block *sb = file_inode(req->file)->i_sb; + + __sb_writers_acquired(sb, SB_FREEZE_WRITE); + sb_end_write(sb); + } +} + +static bool __io_complete_rw_common(struct io_kiocb *req, long res) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { + kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } + if (unlikely(res != req->cqe.res)) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return true; + } + req_set_fail(req); + req->cqe.res = res; + } + return false; +} + +static void __io_complete_rw(struct io_kiocb *req, long res, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); +} + +static void io_complete_rw(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_prio_work_add(req); +} + +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); + if (unlikely(res != req->cqe.res)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return; + } + req->cqe.res = res; + } + + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); +} + +static void kiocb_done(struct io_kiocb *req, ssize_t ret, + unsigned int issue_flags) +{ + struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); + + /* add previously done IO, if any */ + if (req_has_async_data(req) && io->bytes_done > 0) { + if (ret < 0) + ret = io->bytes_done; + else + ret += io->bytes_done; + } + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) + __io_complete_rw(req, ret, issue_flags); + else + io_rw_done(&rw->kiocb, ret); + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (io_resubmit_prep(req)) + io_req_task_queue_reissue(req); + else + io_req_task_queue_fail(req, ret); + } +} + +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; + size_t offset; + + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. 
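+	 * (buf_addr may point anywhere inside the registered buffer, so
+	 * the offset computed below is its distance from imu->ubuf.)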
+ */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} + +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + unsigned int issue_flags) +{ + if (WARN_ON_ONCE(!req->imu)) + return -EFAULT; + return __io_import_fixed(req, rw, iter, req->imu); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + size_t len; + + uiov = u64_to_user_ptr(rw->addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; + return 0; + } + if (rw->len != 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, issue_flags); +#endif + + return __io_iov_buffer_select(req, iov, issue_flags); +} + +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iov_iter *iter = &s->iter; + u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; + ssize_t ret; + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { + ret = 
io_import_fixed(req, ddir, iter, issue_flags); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + iovec = s->fast_iov; + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, iovec, issue_flags); + if (ret) + return ERR_PTR(ret); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); + return NULL; + } + + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; +} + +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) +{ + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; + ssize_t ret = 0; + loff_t *ppos; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. 
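+	 * In other words, the -EAGAIN below punts the request to io-wq,
+	 * where blocking in ->read()/->write() is acceptable.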
+ */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ppos = io_kiocb_ppos(kiocb); + + while (iov_iter_count(iter)) { + struct iovec iovec; + ssize_t nr; + + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; + } + + if (ddir == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, ppos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, ppos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (!iov_iter_is_bvec(iter)) { + iov_iter_advance(iter, nr); + } else { + rw->addr += nr; + rw->len -= nr; + if (!rw->len) + break; + } + if (nr != iovec.iov_len) + break; + } + + return ret; +} + +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) +{ + struct io_async_rw *io = req->async_data; + + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; + /* can only be fixed buffers, no need to do anything */ + if (iov_iter_is_bvec(iter)) + return; + if (!iovec) { + unsigned iov_off = 0; + + io->s.iter.iov = io->s.fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + io->s.iter.iov += iov_off; + } + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, + sizeof(struct iovec) * iter->nr_segs); + } else { + req->flags |= REQ_F_NEED_CLEANUP; + } +} + +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + struct io_rw_state *s, bool force) +{ + if (!force && !io_op_defs[req->opcode].prep_async) + return 0; + if (!req_has_async_data(req)) { + struct io_async_rw *iorw; + + if (io_alloc_async_data(req)) { + kfree(iovec); + return -ENOMEM; + } + + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); + iorw = req->async_data; + /* we've copied and mapped the iter, ensure state is saved */ + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); + } + return 0; +} + +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) +{ + struct io_async_rw *iorw = req->async_data; + struct iovec *iov; + int ret; + + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); + if (unlikely(ret < 0)) + return ret; + + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_readv_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, READ); +} + +int io_writev_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, WRITE); +} + +/* + * This is our waitqueue callback handler, registered through __folio_lock_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. 
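+ * (The retry is queued with io_req_task_queue() below, so the copy
+ * runs from the submitting task's context.)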
+ */ +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct wait_page_key *key = arg; + + wpq = container_of(wait, struct wait_page_queue, wait); + + if (!wake_page_match(wpq, key)) + return 0; + + rw->kiocb.ki_flags &= ~IOCB_WAITQ; + list_del_init(&wait->entry); + io_req_task_queue(req); + return 1; +} + +/* + * This controls whether a given IO request should be armed for async page + * based retry. If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. + */ +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* Only for buffered IO */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) + return false; + + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_waitq = wait; + return true; +} + +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) +{ + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); + else + return -EINVAL; +} + +static bool need_read_all(struct io_kiocb *req) +{ + return req->flags & REQ_F_ISREG || + S_ISBLK(file_inode(req->file)->i_mode); +} + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return req->flags & REQ_F_FIXED_FILE; +} + +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, rw->flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 
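+	 * (REQ_F_NOWAIT set here means a later -EAGAIN is surfaced to
+	 * userspace rather than turned into an io-wq punt.)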
+ */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->private = NULL; + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + +int io_read(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_async_rw *io; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + io = req->async_data; + s = &io->s; + + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. + */ + if (io_do_buffer_select(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + + /* + * We come here from an earlier attempt, restore our state to + * match in case it doesn't. It's cheap enough that we don't + * need to make this conditional. + */ + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + + ret = io_iter_do_read(rw, &s->iter); + + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + goto done; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (req->flags & REQ_F_NOWAIT) + goto done; + ret = 0; + } else if (ret == -EIOCBQUEUED) { + goto out_free; + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || + (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { + /* read all, failed, already did sync or don't want to retry */ + goto done; + } + + /* + * Don't depend on the iter state matching what was consumed, or being + * untouched in case of error. Restore it and we'll advance it + * manually if we need to. + */ + iov_iter_restore(&s->iter, &s->iter_state); + + ret2 = io_setup_async_rw(req, iovec, s, true); + if (ret2) + return ret2; + + iovec = NULL; + io = req->async_data; + s = &io->s; + /* + * Now use our persistent iterator and state, if we aren't already. + * We've restored and mapped the iter to match. + */ + + do { + /* + * We end up here because of a partial read, either from + * above or inside this loop. 
Advance the iter by the bytes + * that were consumed. + */ + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) + break; + io->bytes_done += ret; + iov_iter_save_state(&s->iter, &s->iter_state); + + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; + } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If + * we get -EIOCBQUEUED, then we'll get a notification when the + * desired page gets unlocked. We can also get a partial read + * here, and if we do, then just retry at the new offset. + */ + ret = io_iter_do_read(rw, &s->iter); + if (ret == -EIOCBQUEUED) + return IOU_ISSUE_SKIP_COMPLETE; + /* we got some bytes, but not all. retry. */ + kiocb->ki_flags &= ~IOCB_WAITQ; + iov_iter_restore(&s->iter, &s->iter_state); + } while (ret > 0); +done: + kiocb_done(req, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_write(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + struct io_async_rw *io = req->async_data; + + s = &io->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; + + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; + + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); + if (unlikely(ret)) + goto out_free; + + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + sb_start_write(file_inode(req->file)->i_sb); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); + else if (req->file->f_op->write) + ret2 = loop_rw_iter(WRITE, rw, &s->iter); + else + ret2 = -EINVAL; + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } + + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. 
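+	 * (The block layer fails REQ_NOWAIT bios it cannot honor with
+	 * -EOPNOTSUPP instead of -EAGAIN.)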
+ */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) + goto done; + if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + goto copy_iov; +done: + kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; + } else { +copy_iov: + iov_iter_restore(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); + return ret ?: -EAGAIN; + } +out_free: + /* it's reportedly faster than delegating the null check to kfree() */ + if (iovec) + kfree(iovec); + return ret; +} + +static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); +} + +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +{ + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list. + */ + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; + + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + struct io_rw *rw = io_kiocb_to_cmd(req); + int ret; + + /* + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. + */ + if (READ_ONCE(req->iopoll_completed)) + break; + + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); + if (unlikely(ret < 0)) + return ret; + else if (ret) + poll_flags |= BLK_POLL_ONESHOT; + + /* iopoll may have completed current req */ + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; + } + + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; + + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + nr_events++; + if (unlikely(req->flags & REQ_F_CQE_SKIP)) + continue; + + req->cqe.flags = io_put_kbuf(req, 0); + __io_fill_cqe_req(req->ctx, req); + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? 
start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; +} diff --git a/io_uring/rw.h b/io_uring/rw.h new file mode 100644 index 00000000000000..0204c3fcafa517 --- /dev/null +++ b/io_uring/rw.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +struct io_rw_state { + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; + size_t bytes_done; + struct wait_page_queue wpq; +}; + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_read(struct io_kiocb *req, unsigned int issue_flags); +int io_readv_prep_async(struct io_kiocb *req); +int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_writev_prep_async(struct io_kiocb *req); +void io_readv_writev_cleanup(struct io_kiocb *req); From d9b57aa3cfc792ccac6858376c017dbea6cb2872 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jun 2022 16:27:42 -0600 Subject: [PATCH 055/172] io_uring: move opcode table to opdef.c We already have the declarations in opdef.h, move the rest into its own file rather than in the main io_uring.c file. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 469 +----------------------------------- io_uring/io_uring_types.h | 2 + io_uring/opdef.c | 495 ++++++++++++++++++++++++++++++++++++++ io_uring/opdef.h | 2 + 5 files changed, 501 insertions(+), 469 deletions(-) create mode 100644 io_uring/opdef.c diff --git a/io_uring/Makefile b/io_uring/Makefile index d70deed65a0bb2..466639c289be7f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o + cancel.o kbuf.o rsrc.o rw.o opdef.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0af61a6c29cfee..c703190986270b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,22 +90,8 @@ #include "kbuf.h" #include "rsrc.h" -#include "xattr.h" -#include "nop.h" -#include "fs.h" -#include "splice.h" -#include "sync.h" -#include "advise.h" -#include "openclose.h" -#include "uring_cmd.h" -#include "epoll.h" -#include "statx.h" -#include "net.h" -#include "msg_ring.h" #include "timeout.h" #include "poll.h" -#include "cancel.h" -#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -161,13 +147,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -const char *io_uring_get_opcode(u8 opcode) -{ - if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; - return "INVALID"; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1478,12 +1457,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -3909,442 +3882,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } -static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) -{ - WARN_ON_ONCE(1); - return -ECANCELED; -} - -const struct io_op_def io_op_defs[] = 
{ - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - .name = "NOP", - .prep = io_nop_prep, - .issue = io_nop, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", - .prep = io_prep_rw, - .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", - .prep = io_prep_rw, - .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FSYNC", - .prep = io_fsync_prep, - .issue = io_fsync, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "POLL_ADD", - .prep = io_poll_add_prep, - .issue = io_poll_add, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - .name = "POLL_REMOVE", - .prep = io_poll_remove_prep, - .issue = io_poll_remove, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "SYNC_FILE_RANGE", - .prep = io_sfr_prep, - .issue = io_sync_file_range, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .ioprio = 1, - .name = "SENDMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_sendmsg_prep, - .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .ioprio = 1, - .name = "RECVMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_recvmsg_prep, - .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", - .prep = io_timeout_prep, - .issue = io_timeout, - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - .name = "TIMEOUT_REMOVE", - .prep = io_timeout_remove_prep, - .issue = io_timeout_remove, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - .name = "ACCEPT", -#if defined(CONFIG_NET) - .prep = io_accept_prep, - .issue = io_accept, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - .name = "ASYNC_CANCEL", - .prep = io_async_cancel_prep, - .issue = 
io_async_cancel, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", - .prep = io_link_timeout_prep, - .issue = io_no_issue, - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep = io_connect_prep, - .issue = io_connect, - .prep_async = io_connect_prep_async, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - .name = "FALLOCATE", - .prep = io_fallocate_prep, - .issue = io_fallocate, - }, - [IORING_OP_OPENAT] = { - .name = "OPENAT", - .prep = io_openat_prep, - .issue = io_openat, - .cleanup = io_open_cleanup, - }, - [IORING_OP_CLOSE] = { - .name = "CLOSE", - .prep = io_close_prep, - .issue = io_close, - }, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - .name = "STATX", - .prep = io_statx_prep, - .issue = io_statx, - .cleanup = io_statx_cleanup, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FADVISE", - .prep = io_fadvise_prep, - .issue = io_fadvise, - }, - [IORING_OP_MADVISE] = { - .name = "MADVISE", - .prep = io_madvise_prep, - .issue = io_madvise, - }, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "SEND", -#if defined(CONFIG_NET) - .prep = io_sendmsg_prep, - .issue = io_send, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "RECV", -#if defined(CONFIG_NET) - .prep = io_recvmsg_prep, - .issue = io_recv, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_OPENAT2] = { - .name = "OPENAT2", - .prep = io_openat2_prep, - .issue = io_openat2, - .cleanup = io_open_cleanup, - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "EPOLL", -#if defined(CONFIG_EPOLL) - .prep = io_epoll_ctl_prep, - .issue = io_epoll_ctl, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "SPLICE", - .prep = io_splice_prep, - .issue = io_splice, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "PROVIDE_BUFFERS", - .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "REMOVE_BUFFERS", - .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "TEE", - .prep = io_tee_prep, - 
.issue = io_tee, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - .name = "SHUTDOWN", -#if defined(CONFIG_NET) - .prep = io_shutdown_prep, - .issue = io_shutdown, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", - .prep = io_renameat_prep, - .issue = io_renameat, - .cleanup = io_renameat_cleanup, - }, - [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", - .prep = io_unlinkat_prep, - .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, - }, - [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", - .prep = io_mkdirat_prep, - .issue = io_mkdirat, - .cleanup = io_mkdirat_cleanup, - }, - [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", - .prep = io_symlinkat_prep, - .issue = io_symlinkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_LINKAT] = { - .name = "LINKAT", - .prep = io_linkat_prep, - .issue = io_linkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - .name = "MSG_RING", - .prep = io_msg_ring_prep, - .issue = io_msg_ring, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1, - .name = "FSETXATTR", - .prep = io_fsetxattr_prep, - .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SETXATTR] = { - .name = "SETXATTR", - .prep = io_setxattr_prep, - .issue = io_setxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_FGETXATTR] = { - .needs_file = 1, - .name = "FGETXATTR", - .prep = io_fgetxattr_prep, - .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_GETXATTR] = { - .name = "GETXATTR", - .prep = io_getxattr_prep, - .issue = io_getxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - .name = "SOCKET", -#if defined(CONFIG_NET) - .prep = io_socket_prep, - .issue = io_socket, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .name = "URING_CMD", - .async_size = uring_cmd_pdu_size(1), - .prep = io_uring_cmd_prep, - .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, - }, -}; - static int __init io_uring_init(void) { - int i; - #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -4400,17 +3939,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep != io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); - } + io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 147e1e597530a9..fb87af5fd8e751 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -3,6 +3,8 @@ #include #include +#include +#include #include "io-wq.h" #include "filetable.h" diff --git a/io_uring/opdef.c b/io_uring/opdef.c new file mode 100644 index 00000000000000..d687d33f9c0c03 --- /dev/null +++ b/io_uring/opdef.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_uring opcode handling table + */ +#include +#include +#include +#include +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include 
"opdef.h" +#include "refs.h" +#include "tctx.h" +#include "sqpoll.h" +#include "fdinfo.h" +#include "kbuf.h" +#include "rsrc.h" + +#include "xattr.h" +#include "nop.h" +#include "fs.h" +#include "splice.h" +#include "sync.h" +#include "advise.h" +#include "openclose.h" +#include "uring_cmd.h" +#include "epoll.h" +#include "statx.h" +#include "net.h" +#include "msg_ring.h" +#include "timeout.h" +#include "poll.h" +#include "cancel.h" +#include "rw.h" + +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .name = "NOP", + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep = io_prep_rw, + .issue = io_read, + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep = io_prep_rw, + .issue = io_write, + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FSYNC", + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "POLL_ADD", + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .name = "POLL_REMOVE", + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "SYNC_FILE_RANGE", + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .ioprio = 1, + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .ioprio = 1, + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_TIMEOUT] = { + 
.audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .name = "TIMEOUT_REMOVE", + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .name = "ACCEPT", +#if defined(CONFIG_NET) + .prep = io_accept_prep, + .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .name = "ASYNC_CANCEL", + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep = io_connect_prep, + .issue = io_connect, + .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .name = "FALLOCATE", + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .prep = io_openat_prep, + .issue = io_openat, + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .name = "FILES_UPDATE", + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .name = "STATX", + .prep = io_statx_prep, + .issue = io_statx, + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FADVISE", + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "SEND", +#if defined(CONFIG_NET) + .prep = io_sendmsg_prep, + .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "RECV", +#if defined(CONFIG_NET) + .prep = io_recvmsg_prep, + .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .prep = io_openat2_prep, + .issue = io_openat2, + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "EPOLL", +#if defined(CONFIG_EPOLL) + .prep = io_epoll_ctl_prep, + .issue = 
io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "SPLICE", + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "PROVIDE_BUFFERS", + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "REMOVE_BUFFERS", + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "TEE", + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .name = "SHUTDOWN", +#if defined(CONFIG_NET) + .prep = io_shutdown_prep, + .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .prep = io_renameat_prep, + .issue = io_renameat, + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .prep = io_linkat_prep, + .issue = io_linkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .name = "MSG_RING", + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .name = "FSETXATTR", + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .prep = io_setxattr_prep, + .issue = io_setxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .name = "FGETXATTR", + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .prep = io_getxattr_prep, + .issue = io_getxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .name = "SOCKET", +#if defined(CONFIG_NET) + .prep = io_socket_prep, + .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .name = "URING_CMD", + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, + }, +}; + +const char *io_uring_get_opcode(u8 opcode) +{ + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; + return "INVALID"; +} + +void __init io_uring_optable_init(void) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); + } +} diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 4578adcdba8a41..ece8ed4f96c434 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -37,4 +37,6 @@ struct io_op_def { }; extern const struct io_op_def io_op_defs[]; + +void io_uring_optable_init(void); #endif From b9ba8a4463cd78d0aee520c4bf2569820ac29929 
Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2022 10:55:07 -0600 Subject: [PATCH 056/172] io_uring: add support for level triggered poll By default, the POLL_ADD command does edge triggered poll - if we get a non-zero mask on the initial poll attempt, we complete the request successfully. Support level triggered by always waiting for a notification, regardless of whether or not the initial mask matches the file state. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ io_uring/poll.c | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ad3da28d2fce8..4927bb69387a29 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -229,10 +229,13 @@ enum io_uring_op { * * IORING_POLL_UPDATE Update existing poll request, matching * sqe->addr as the old user_data field. + * + * IORING_POLL_LEVEL Level triggered poll. */ #define IORING_POLL_ADD_MULTI (1U << 0) #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +#define IORING_POLL_ADD_LEVEL (1U << 3) /* * ASYNC_CANCEL flags. diff --git a/io_uring/poll.c b/io_uring/poll.c index b80f7fa261232d..558dc170468ad8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -423,11 +423,13 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (mask && (poll->events & EPOLLONESHOT)) { + if (mask && + ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); /* no one else has access to the req, forget about the ref */ return mask; } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!ipt->error) @@ -439,7 +441,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_poll_req_insert(req); spin_unlock(&ctx->completion_lock); - if (mask) { + if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; @@ -475,7 +477,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; + __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; if (!def->pollin && !def->pollout) @@ -638,7 +640,10 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); + if (!(flags & IORING_POLL_ADD_LEVEL)) + events |= EPOLLET; + return demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -679,7 +684,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) + if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL)) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; From 61a2732af4b0337f7e36093612c846e9f5962965 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jun 2022 12:36:42 -0600 Subject: [PATCH 057/172] io_uring: deprecate epoll_ctl support As far as we know, nobody ever adopted the epoll_ctl management 
via io_uring. Deprecate it now with a warning, and plan on removing it in a later kernel version. When we do remove it, we can revert the following commits as well: 39220e8d4a2a ("eventpoll: support non-blocking do_epoll_ctl() calls") 58e41a44c488 ("eventpoll: abstract out epoll_ctl() handler") Suggested-by: Linus Torvalds Link: https://lore.kernel.org/io-uring/CAHk-=wiTyisXBgKnVHAGYCNvkmjk=50agS2Uk6nr+n3ssLZg2w@mail.gmail.com/ Signed-off-by: Jens Axboe --- io_uring/epoll.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/epoll.c b/io_uring/epoll.c index acbb32498127ad..10853e8ed07887 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -26,6 +26,10 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req); + pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will " + "be removed in a future Linux kernel version.\n", + current->comm); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; From 5ff4fdffad483dad815b3701be8b82c5a07b156e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:47 +0100 Subject: [PATCH 058/172] io_uring: make reg buf init consistent The default (i.e. empty) state of register buffer is dummy_ubuf, so set it to dummy on init instead of NULL. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5456aecf03d9627fbd6e65e100e2b5293a6151e.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8c40b20659d40a..214ff0dfa6a48e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -567,7 +567,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = NULL; + ctx->user_bufs[i] = ctx->dummy_ubuf; needs_switch = true; } @@ -1203,14 +1203,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t size; int ret, nr_pages, i; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; + *pimu = ctx->dummy_ubuf; + if (!iov->iov_base) return 0; - } - *pimu = NULL; ret = -ENOMEM; - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { From b25436038f6cc20c3198792cbfab8a312d09282e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:48 +0100 Subject: [PATCH 059/172] io_uring: move defer_list to slow data draining is slow path, move defer_list to the end where slow data lives inside the context. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e16379391ca72b490afdd24e8944baab849b4a7b.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index fb87af5fd8e751..ee58273569702d 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -162,7 +162,6 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; unsigned cached_sq_head; unsigned sq_entries; - struct list_head defer_list; /* * Fixed resources fast path, should be accessed only under @@ -274,8 +273,12 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + + /* io-wq management, e.g. 
thread count */
 	u32			iowq_limits[2];
 	bool			iowq_limits_set;
+
+	struct list_head	defer_list;
 	};
 };

From aff5b2df9e8b35f9814c5a4907f471472cd6be77 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:49 +0100
Subject: [PATCH 060/172] io_uring: better caching for ctx timeout fields

Following the timeout fields' access patterns, move all of them into a
separate cache line inside ctx, so they don't interfere with normal
completion caching, especially since timeout removals and completions
are separated and the latter is done via tw.

It also sheds some bytes from io_ring_ctx, 1216B -> 1152B

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/4b163793072840de53b3cb66e0c2995e7226ff78.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring_types.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
index ee58273569702d..4fc42055dbdd1c 100644
--- a/io_uring/io_uring_types.h
+++ b/io_uring/io_uring_types.h
@@ -181,8 +181,6 @@ struct io_ring_ctx {
 		struct xarray		io_bl_xa;
 		struct list_head	io_buffers_cache;
 
-		struct list_head	timeout_list;
-		struct list_head	ltimeout_list;
 		struct list_head	cq_overflow_list;
 		struct list_head	apoll_cache;
 		struct xarray		personalities;
@@ -215,15 +213,11 @@
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
-		atomic_t		cq_timeouts;
-		unsigned		cq_last_tm_flush;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
 		spinlock_t		completion_lock;
 
-		spinlock_t		timeout_lock;
-
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -255,6 +249,15 @@
 		struct list_head	io_buffers_pages;
 	};
 
+	/* timeouts */
+	struct {
+		spinlock_t		timeout_lock;
+		atomic_t		cq_timeouts;
+		struct list_head	timeout_list;
+		struct list_head	ltimeout_list;
+		unsigned		cq_last_tm_flush;
+	} ____cacheline_aligned_in_smp;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct {
 #if defined(CONFIG_UNIX)

From 22eb2a3fdea03e6056dcfb4c8bb9d3fde5ce02ff Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:50 +0100
Subject: [PATCH 061/172] io_uring: refactor ctx slow data placement

Shove all slow path data at the end of ctx and get rid of the extra
indentation.
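To illustrate the resulting layout, a minimal sketch with made-up
members (not the actual io_ring_ctx definition): hot data keeps its
per-cacheline grouping, while cold data is flattened at the end of the
struct with no wrapper around it.

#include <linux/cache.h>
#include <linux/list.h>

struct example_ctx {
	/* hot: touched on every submission/completion */
	struct {
		unsigned		cached_head;
		unsigned		entries;
	} ____cacheline_aligned_in_smp;

	/* cold: setup/teardown only; plain members, no extra nesting */
	struct list_head	defer_list;
	unsigned		sq_thread_idle;
};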
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bcaf200298dd469af20787650550efc66d89bef2.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 81 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4fc42055dbdd1c..ef1cf86e893264 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -185,7 +185,6 @@ struct io_ring_ctx { struct list_head apoll_cache; struct xarray personalities; u32 pers_next; - unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -232,23 +231,6 @@ struct io_ring_ctx { struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - /* timeouts */ struct { spinlock_t timeout_lock; @@ -259,30 +241,45 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - - /* io-wq management, e.g. thread count */ - u32 iowq_limits[2]; - bool iowq_limits_set; - - struct list_head defer_list; - }; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + + /* io-wq management, e.g. thread count */ + u32 iowq_limits[2]; + bool iowq_limits_set; + + struct list_head defer_list; + unsigned sq_thread_idle; }; enum { From aa1e90f64ee5c82f6d3feb69b2a19370686f1f02 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:51 +0100 Subject: [PATCH 062/172] io_uring: move small helpers to headers There is a bunch of inline helpers that will be useful not only to the core of io_uring, move them to headers. 
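As a minimal sketch of the pattern (hypothetical header and helper
name, for illustration only): a static inline definition in a header
gives every including compilation unit its own inlined copy, so the
helper can be shared across files with no duplicate symbols and no
function call overhead.

/* example.h - hypothetical header, not part of this patch */
#ifndef IOU_EXAMPLE_H
#define IOU_EXAMPLE_H

#include <linux/mutex.h>

static inline void example_lock_once(struct mutex *lock, bool *locked)
{
	/* take the lock on first use only and remember that we hold it */
	if (!*locked) {
		mutex_lock(lock);
		*locked = true;
	}
}

#endif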
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/22df99c83723e44cba7e945e8519e64e3642c064.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 17 -----------------
 io_uring/io_uring.h | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c703190986270b..fd3ad784865287 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -160,14 +160,6 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
-static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
-{
-	if (!*locked) {
-		mutex_lock(&ctx->uring_lock);
-		*locked = true;
-	}
-}
-
 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 {
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
@@ -423,15 +415,6 @@ static void io_prep_async_link(struct io_kiocb *req)
 	}
 }
 
-static inline void io_req_add_compl_list(struct io_kiocb *req)
-{
-	struct io_submit_state *state = &req->ctx->submit_state;
-
-	if (!(req->flags & REQ_F_CQE_SKIP))
-		state->flush_cqes = true;
-	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
-}
-
 void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 22e6e52c42d261..6744ce111e3812 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -217,6 +217,28 @@ static inline bool io_run_task_work(void)
 	return false;
 }
 
+static inline void io_req_complete_state(struct io_kiocb *req)
+{
+	req->flags |= REQ_F_COMPLETE_INLINE;
+}
+
+static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+{
+	if (!*locked) {
+		mutex_lock(&ctx->uring_lock);
+		*locked = true;
+	}
+}
+
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+	struct io_submit_state *state = &req->ctx->submit_state;
+
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		state->flush_cqes = true;
+	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
 int io_run_task_work_sig(void);
 void io_req_complete_failed(struct io_kiocb *req, s32 res);
 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);

From 48c13d8980840e489a537e4af4b2503eb9d8a1ec Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:52 +0100
Subject: [PATCH 063/172] io_uring: explain io_wq_work::cancel_seq placement

Add a comment on why we keep ->cancel_seq in struct io_wq_work instead
of struct io_kiocb despite it being needed only by io_uring and not
io-wq.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/988e87eec9dc700b5dae933df3aefef303502f6c.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io-wq.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index ba6eee76d028f6..3f54ee2a8eebd3 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 struct io_wq_work {
 	struct io_wq_work_node list;
 	unsigned flags;
+	/* place it here instead of io_kiocb as it fills padding and saves 4B */
 	int cancel_seq;
 };

From 6a02e4be8187588ac476136496af9e3feeeb9a75 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:53 +0100
Subject: [PATCH 064/172] io_uring: inline ->registered_rings

There can only be 16 registered rings, so there is no need to allocate
an array for them separately; store it in tctx instead.
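The shape of the change, sketched with illustrative names: replacing a
separately allocated array with a fixed-size array embedded in the
owning struct drops an allocation, a failure path and a pointer
dereference.

#define EXAMPLE_REG_MAX	16

struct example_tctx_old {
	struct file **registered_rings;	/* kcalloc()'d and kfree()'d separately */
};

struct example_tctx_new {
	/* bounded at 16 entries, so embed the array directly */
	struct file *registered_rings[EXAMPLE_REG_MAX];
};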
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/495f0b953c87994dd9e13de2134019054fa5830d.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/tctx.c | 10 ----------
 io_uring/tctx.h |  3 ++-
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 3f7e9feb6ca2bc..5a5d4f908529a2 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -53,7 +53,6 @@ void __io_uring_free(struct task_struct *tsk)
 
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
-	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -69,16 +68,8 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
-	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
-					 sizeof(struct file *), GFP_KERNEL);
-	if (unlikely(!tctx->registered_rings)) {
-		kfree(tctx);
-		return -ENOMEM;
-	}
-
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
-		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -87,7 +78,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
-		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index f4964e40d07e0d..7684713e950f4f 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -20,8 +20,9 @@ struct io_uring_task {
 		struct io_wq_work_list	task_list;
 		struct io_wq_work_list	prio_task_list;
 		struct callback_head	task_work;
-		struct file		**registered_rings;
 		bool			task_running;
+
+		struct file		*registered_rings[IO_RINGFD_REG_MAX];
 	};
 
 struct io_tctx_node {

From aeaa72c69473d7e68addbd31f43c7c12af252bfc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:54 +0100
Subject: [PATCH 065/172] io_uring: never defer-complete multi-apoll

Luckily, nobody completes multi-apoll requests outside the polling
functions, but don't set IO_URING_F_COMPLETE_DEFER in any case, as
nothing there catches REQ_F_COMPLETE_INLINE and requests would leak if
it were used.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/a65ed3f5effd9321ee06e6edea294a03be3e15a0.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fd3ad784865287..8a8d8b323519a0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1599,7 +1599,7 @@ int io_poll_issue(struct io_kiocb *req, bool *locked)
 	io_tw_lock(req->ctx, locked);
 	if (unlikely(req->task->flags & PF_EXITING))
 		return -EFAULT;
-	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+	return io_issue_sqe(req, IO_URING_F_NONBLOCK);
 }
 
 struct io_wq_work *io_wq_free_work(struct io_wq_work *work)

From 3a08576b96e365d424225dd034c651e963b3ae64 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:55 +0100
Subject: [PATCH 066/172] io_uring: remove check_cq checking from hot paths

All ctx->check_cq events are slow path; don't test every single flag
one by one in the hot path, but add a common guarding if.
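The guarding pattern, sketched with hypothetical names: read the flags
word once, and hide all per-flag tests behind a single unlikely()
branch so the hot path pays for exactly one test.

#include <linux/bits.h>
#include <linux/compiler.h>
#include <linux/errno.h>

enum {
	EXAMPLE_OVERFLOW_BIT,
	EXAMPLE_DROPPED_BIT,
};

struct example_ctx {
	unsigned long slow_flags;
};

static void example_flush_overflow(struct example_ctx *ctx)
{
	/* cold path work lives here */
}

static int example_check_slow_flags(struct example_ctx *ctx)
{
	unsigned long flags = READ_ONCE(ctx->slow_flags);

	/* one branch on the hot path; per-flag tests sit in the cold block */
	if (unlikely(flags)) {
		if (flags & BIT(EXAMPLE_OVERFLOW_BIT))
			example_flush_overflow(ctx);
		if (flags & BIT(EXAMPLE_DROPPED_BIT))
			return -EBADR;
	}
	return 0;
}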
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/dff026585cea7ff3a172a7c83894a3b0111bbf6a.1655310733.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8a8d8b323519a0..a4c1746d0691d2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1259,24 +1259,25 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 	int ret = 0;
 	unsigned long check_cq;
 
+	check_cq = READ_ONCE(ctx->check_cq);
+	if (unlikely(check_cq)) {
+		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			__io_cqring_overflow_flush(ctx, false);
+		/*
+		 * Similarly do not spin if we have not informed the user of any
+		 * dropped CQE.
+		 */
+		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
+			return -EBADR;
+	}
 	/*
 	 * Don't enter poll loop if we already have events pending.
 	 * If we do, we can potentially be spinning for commands that
 	 * already triggered a CQE (eg in error).
 	 */
-	check_cq = READ_ONCE(ctx->check_cq);
-	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-		__io_cqring_overflow_flush(ctx, false);
 	if (io_cqring_events(ctx))
 		return 0;
 
-	/*
-	 * Similarly do not spin if we have not informed the user of any
-	 * dropped CQE.
-	 */
-	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
-		return -EBADR;
-
 	do {
 		/*
 		 * If a submit got punted to a workqueue, we can have the
@@ -2203,12 +2204,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	ret = io_run_task_work_sig();
 	if (ret || io_should_wake(iowq))
 		return ret;
+
 	check_cq = READ_ONCE(ctx->check_cq);
-	/* let the caller flush overflows, retry */
-	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-		return 1;
-	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
-		return -EBADR;
+	if (unlikely(check_cq)) {
+		/* let the caller flush overflows, retry */
+		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			return 1;
+		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
+			return -EBADR;
+	}
 	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
 		return -ETIME;
 	return 1;

From c65f5279ba02e47c073520ecff3c84408a87fd17 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 15 Jun 2022 17:33:56 +0100
Subject: [PATCH 067/172] io_uring: don't set REQ_F_COMPLETE_INLINE in tw

io_req_task_complete() enqueues requests for state completion itself,
so there is no need for REQ_F_COMPLETE_INLINE, which only serves the
purpose of not bloating the kernel.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/aca80f71464ad02c06f1311d998a2d6ee0b31573.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a4c1746d0691d2..4adfc4ebf8c199 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1314,7 +1314,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
 {
 	if (*locked) {
 		req->cqe.flags |= io_put_kbuf(req, 0);
-		req->flags |= REQ_F_COMPLETE_INLINE;
 		io_req_add_compl_list(req);
 	} else {
 		req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED);

From bb8f870031584ebf50fcc1f049fa421375556114 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 15 Jun 2022 16:28:17 -0600
Subject: [PATCH 068/172] io_uring: remove unused IO_REQ_CACHE_SIZE define

Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4adfc4ebf8c199..72640aa55abcde 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,6 @@
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
 #define IO_COMPL_BATCH			32
-#define IO_REQ_CACHE_SIZE		32
 #define IO_REQ_ALLOC_BATCH		8
 
 enum {

From df9830d883b914296beb17e4f75017faac9c4312 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Jun 2022 10:21:57 +0100
Subject: [PATCH 069/172] io_uring: rw: delegate sync completions to core io_uring

io_issue_sqe() from the io_uring core knows how to complete requests
based on the returned error code, so we can delegate io_read()/io_write()
completion to it. Make kiocb_done() return the right completion code
and propagate it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/32ef005b45d23bf6b5e6837740dc0331bb051bd4.1655371007.git.asml.silence@gmail.com
Reviewed-by: Hao Xu
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index f0b60199ee2279..e5ca23d0783e2c 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -207,15 +206,6 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 	return false;
 }
 
-static void __io_complete_rw(struct io_kiocb *req, long res,
-			     unsigned int issue_flags)
-{
-	if (__io_complete_rw_common(req, res))
-		return;
-	io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags));
-	__io_req_complete(req, issue_flags);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res)
 {
 	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
@@ -247,7 +238,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 	smp_store_release(&req->iopoll_completed, 1);
 }
 
-static void kiocb_done(struct io_kiocb *req, ssize_t ret,
+static int kiocb_done(struct io_kiocb *req,
ssize_t ret, else io_req_task_queue_fail(req, ret); } + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_import_fixed(struct io_kiocb *req, int ddir, @@ -847,7 +844,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - goto out_free; + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ @@ -905,12 +904,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: - kiocb_done(req, ret, issue_flags); -out_free: /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; + return kiocb_done(req, ret, issue_flags); } int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -960,8 +957,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; + if (unlikely(ret)) { + kfree(iovec); + return ret; + } /* * Open-code file_start_write here to grab freeze protection, @@ -1003,15 +1002,13 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; + ret = kiocb_done(req, ret2, issue_flags); } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } -out_free: /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); From 75d7b3aec13be557f15d099dc612dd658d670d1d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:58 +0100 Subject: [PATCH 070/172] io_uring: kill REQ_F_COMPLETE_INLINE REQ_F_COMPLETE_INLINE is only needed to delay queueing into the completion list to io_queue_sqe() as __io_req_complete() is inlined and we don't want to bloat the kernel. As now we complete in a more centralised fashion in io_issue_sqe() we can get rid of the flag and queue to the list directly. 
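A simplified rendering of the centralised flow in io_issue_sqe()
(condensed from the diff below, not verbatim): the handler's return
code tells the core how to complete, so no per-request flag is needed.

	ret = def->issue(req, issue_flags);
	if (ret == IOU_OK) {
		if (issue_flags & IO_URING_F_COMPLETE_DEFER)
			io_req_add_compl_list(req);	/* batch into the compl list */
		else
			io_req_complete_post(req);	/* complete right away */
	} else if (ret != IOU_ISSUE_SKIP_COMPLETE) {
		return ret;	/* error, let the caller handle it */
	}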
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/600ba20a9338b8a39b249b23d3d177803613dde4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++----------- io_uring/io_uring.h | 5 ----- io_uring/io_uring_types.h | 3 --- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72640aa55abcde..541c109a9273bf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,10 +742,7 @@ void io_req_complete_post(struct io_kiocb *req) inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - req->flags |= REQ_F_COMPLETE_INLINE; - else - io_req_complete_post(req); + io_req_complete_post(req); } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -1581,9 +1578,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret == IOU_OK) - __io_req_complete(req, issue_flags); - else if (ret != IOU_ISSUE_SKIP_COMPLETE) + if (ret == IOU_OK) { + if (issue_flags & IO_URING_F_COMPLETE_DEFER) + io_req_add_compl_list(req); + else + io_req_complete_post(req); + } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; /* If the op doesn't have a file, we're not polling for it */ @@ -1749,10 +1749,6 @@ static inline void io_queue_sqe(struct io_kiocb *req) ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (req->flags & REQ_F_COMPLETE_INLINE) { - io_req_add_compl_list(req); - return; - } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6744ce111e3812..3f06fbae0ee9e5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -217,11 +217,6 @@ static inline bool io_run_task_work(void) return false; } -static inline void io_req_complete_state(struct io_kiocb *req) -{ - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index ef1cf86e893264..4576ea8cad2e6c 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -301,7 +301,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, @@ -356,8 +355,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), /* supports async reads/writes */ From 7012c81593d5a3bc6d0b8720345cf887ef1df914 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:59 +0100 Subject: [PATCH 071/172] io_uring: refactor io_req_task_complete() Clean up io_req_task_complete() and deduplicate io_put_kbuf() calls. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ae3148ac7eb5cce3e06895cde306e9e959d6f6ae.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 541c109a9273bf..957a5bc1b528c9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1306,15 +1306,19 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -inline void io_req_task_complete(struct io_kiocb *req, bool *locked) + +void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (*locked) { - req->cqe.flags |= io_put_kbuf(req, 0); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + req->cqe.flags |= io_put_kbuf(req, issue_flags); + } + + if (*locked) io_req_add_compl_list(req); - } else { - req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + else io_req_complete_post(req); - } } /* From 53ccf69bda6f51e462f3c4ab7eb9c0ec34e78be4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:00 +0100 Subject: [PATCH 072/172] io_uring: don't inline io_put_kbuf io_put_kbuf() is huge, don't bloat the kernel with inlining. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e21ccf0be471ffa654032914b9430813cae53f8.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 33 +++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 38 ++++++-------------------------------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index bc58890d932b2f..b9c7f6e87cc9a3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -83,6 +83,39 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +{ + unsigned int cflags; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf_list(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + } + return cflags; +} + static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 9da3a933ef40e1..304e7139d83562 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -47,6 +47,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -79,7 +81,8 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) __io_kbuf_recycle(req, issue_flags); } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, + struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) @@ -99,44 +102,15 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); + return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); } static inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { - unsigned int cflags; if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; + return __io_put_kbuf(req, issue_flags); } #endif From 3654ab0c51a91036d15dd2fd9c2809317edf2228 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 16 Jun 2022 10:22:01 +0100 Subject: [PATCH 073/172] io_uring: poll: remove unnecessary req->ref set We now don't need to set req->refcount for poll requests since the reworked poll code ensures no request release race. 
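For context, a simplified illustration (not the actual helper) of the
ownership scheme that makes the extra reference unnecessary: poll
completion is serialised through an ownership counter, so only one path
can ever complete or free the request.

#include <linux/atomic.h>

/* whoever bumps poll_refs from zero takes ownership of the request;
 * everyone else only records that more work exists and backs off
 */
static bool example_poll_get_ownership(atomic_t *poll_refs)
{
	return atomic_fetch_inc(poll_refs) == 0;
}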
Signed-off-by: Hao Xu
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/ec6fee45705890bdb968b0c175519242753c0215.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 558dc170468ad8..fdb6b1101ffcb7 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -689,7 +689,6 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 		return -EINVAL;
 
-	io_req_set_refcount(req);
 	poll->events = io_poll_parse_events(sqe, flags);
 	return 0;
 }

From 38513c464d3d45b4088c82f6e42d9cdbc5ee57e6 Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Thu, 16 Jun 2022 10:22:02 +0100
Subject: [PATCH 074/172] io_uring: switch cancel_hash to use per entry spinlock

Add a new io_hash_bucket structure so that each bucket in cancel_hash
has its own spinlock. Using a per-entry lock for cancel_hash removes
some completion_lock invocations and eliminates contention between
different cancel_hash entries.

Signed-off-by: Hao Xu
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/05d1e135b0c8bce9d1441e6346776589e5783e26.1655371007.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/cancel.c         | 14 ++++++-
 io_uring/cancel.h         |  6 +++
 io_uring/fdinfo.c         |  9 +++--
 io_uring/io_uring.c       |  9 +++--
 io_uring/io_uring_types.h |  2 +-
 io_uring/poll.c           | 80 ++++++++++++++++++++++---------------
 6 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 83cceb52d82d64..6f2888388a40ed 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -93,14 +93,14 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
 	if (!ret)
 		return 0;
 
-	spin_lock(&ctx->completion_lock);
 	ret = io_poll_cancel(ctx, cd);
 	if (ret != -ENOENT)
 		goto out;
+	spin_lock(&ctx->completion_lock);
 	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
 		ret = io_timeout_cancel(ctx, cd);
-out:
 	spin_unlock(&ctx->completion_lock);
+out:
 	return ret;
 }
 
@@ -192,3 +192,13 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
+
+void init_hash_table(struct io_hash_bucket *hash_table, unsigned size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++) {
+		spin_lock_init(&hash_table[i].lock);
+		INIT_HLIST_HEAD(&hash_table[i].list);
+	}
+}
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 4f35d86963253f..556a7dcf160e47 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -4,3 +4,9 @@
 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd);
+void init_hash_table(struct io_hash_bucket *hash_table, unsigned size);
+
+struct io_hash_bucket {
+	spinlock_t		lock;
+	struct hlist_head	list;
+} ____cacheline_aligned_in_smp;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index fcedde4b4b1e1f..f941c73f550259 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -13,6 +13,7 @@
 #include "io_uring.h"
 #include "sqpoll.h"
 #include "fdinfo.h"
+#include "cancel.h"
 
 #ifdef CONFIG_PROC_FS
 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
@@ -157,17 +158,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	mutex_unlock(&ctx->uring_lock);
 
 	seq_puts(m, "PollList:\n");
-	spin_lock(&ctx->completion_lock);
 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
-		struct
hlist_head *list = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; struct io_kiocb *req; - hlist_for_each_entry(req, list, hash_node) + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, task_work_pending(req->task)); + spin_unlock(&hb->lock); } seq_puts(m, "CqOverflowList:\n"); + spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { struct io_uring_cqe *cqe = &ocqe->cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 957a5bc1b528c9..ac6946e3f174b5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" +#include "cancel.h" #include "timeout.h" #include "poll.h" @@ -260,11 +261,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (hash_bits <= 0) hash_bits = 1; ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), - GFP_KERNEL); + ctx->cancel_hash = + kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), + GFP_KERNEL); if (!ctx->cancel_hash) goto err; - __hash_init(ctx->cancel_hash, 1U << hash_bits); + + init_hash_table(ctx->cancel_hash, 1U << hash_bits); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4576ea8cad2e6c..1f8db2dd7af752 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -224,7 +224,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; + struct io_hash_bucket *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; diff --git a/io_uring/poll.c b/io_uring/poll.c index fdb6b1101ffcb7..1511a26b412d17 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -19,6 +19,7 @@ #include "opdef.h" #include "kbuf.h" #include "poll.h" +#include "cancel.h" struct io_poll_update { struct file *file; @@ -73,10 +74,22 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + spin_lock(&hb->lock); + hlist_add_head(&req->hash_node, &hb->list); + spin_unlock(&hb->lock); +} + +static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; + + spin_lock(lock); + hash_del(&req->hash_node); + spin_unlock(lock); } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, @@ -220,8 +233,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); + io_poll_req_delete(req, ctx); spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); req->cqe.flags = 0; __io_req_complete_post(req); io_commit_cqring(ctx); @@ -231,7 +244,6 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) static void io_apoll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -239,9 +251,7 @@ static void 
io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); + io_poll_req_delete(req, req->ctx); if (!ret) io_req_task_submit(req, locked); @@ -437,9 +447,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - spin_lock(&ctx->completion_lock); io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -540,32 +548,31 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool found = false; int i; - spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } + spin_unlock(&hb->lock); } - spin_unlock(&ctx->completion_lock); return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { - struct hlist_head *list; struct io_kiocb *req; + u32 index = hash_long(cd->data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) @@ -577,21 +584,21 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } return req; } + spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; int i; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && req->file != cd->file) continue; @@ -600,12 +607,12 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, req->work.cancel_seq = cd->seq; return req; } + spin_unlock(&hb->lock); } return NULL; } static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) { if (!io_poll_get_ownership(req)) return false; @@ -615,17 +622,23 @@ static bool io_poll_disarm(struct io_kiocb *req) } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; + u32 index; + spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); - if (!req) + if (!req) { return -ENOENT; + } else { + index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + lock = &ctx->cancel_hash[index].lock; + } io_poll_cancel_req(req); + spin_unlock(lock); return 0; } @@ -719,18 +732,23 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update 
*poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + u32 index = hash_long(cd.data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; + if (!preq) { + ret = -ENOENT; + goto out; + } + ret2 = io_poll_disarm(preq); + spin_unlock(lock); + if (!ret2) { + ret = -EALREADY; goto out; } - spin_unlock(&ctx->completion_lock); if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ From 1ab1edb0a104dd683d83e7853aa5b624856f4127 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:03 +0100 Subject: [PATCH 075/172] io_uring: pass poll_find lock back Instead of using implicit knowledge of what is locked or not after io_poll_find() and co returns, pass back a pointer to the locked bucket if any. If set, the caller must unlock the spinlock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dae1dc5749aa34367812ecf62f82fd3f053aae44.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 1511a26b412d17..5b2c6ce26a5547 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -565,12 +565,15 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + *out_bucket = NULL; + spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) @@ -582,6 +585,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, continue; req->work.cancel_seq = cd->seq; } + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -589,11 +593,14 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; int i; + *out_bucket = NULL; + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_hash[i]; @@ -605,6 +612,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, if (cd->seq == req->work.cancel_seq) continue; req->work.cancel_seq = cd->seq; + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -623,23 +631,19 @@ static bool io_poll_disarm(struct io_kiocb *req) int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { + struct io_hash_bucket *bucket; struct io_kiocb *req; - u32 index; - spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); + req = io_poll_file_find(ctx, cd, &bucket); else - req = io_poll_find(ctx, false, cd); - if (!req) { - return -ENOENT; - } else { - index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - lock =
&ctx->cancel_hash[index].lock; - } - io_poll_cancel_req(req); - spin_unlock(lock); - return 0; + req = io_poll_find(ctx, false, cd, &bucket); + + if (req) + io_poll_cancel_req(req); + if (bucket) + spin_unlock(&bucket->lock); + return req ? 0 : -ENOENT; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -732,19 +736,21 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(cd.data, ctx->cancel_hash_bits); - spinlock_t *lock = &ctx->cancel_hash[index].lock; + struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd); + preq = io_poll_find(ctx, true, &cd, &bucket); + if (preq) + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + if (!preq) { ret = -ENOENT; goto out; } - ret2 = io_poll_disarm(preq); - spin_unlock(lock); if (!ret2) { ret = -EALREADY; goto out; From 4dfab8abb4721da278a2ccd45c1b6a69f8a9dd14 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:04 +0100 Subject: [PATCH 076/172] io_uring: clean up io_try_cancel Get rid of an unnecessary extra goto in io_try_cancel() and simplify the function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/48cf5417b43a8386c6c364dba1ad9b4c7382d158.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 6f2888388a40ed..a253e2ad22ebd0 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -95,12 +95,12 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) - goto out; + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); -out: return ret; } From 4a07723fb4bb2568a31d43709904ab0d4c33d6c8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:05 +0100 Subject: [PATCH 077/172] io_uring: limit the number of cancellation buckets Don't allocate too many hash/cancellation buckets; clamp it to 8 bits, or 256 buckets * 64B = 16KB. We don't usually have too many requests, and 256 buckets should be enough, especially since we do hash search only in the cancellation path. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9620c8072ba61a2d50eba894b89bd93a94a9abd.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ac6946e3f174b5..aafdf1330ec67d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -254,12 +254,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* * Use 5 bits less than the max cq entries, that should give us around - * 32 entries per hash list if totally full and uniformly spread. + * 32 entries per hash list if totally full and uniformly spread, but + * don't keep too many buckets to not overconsume memory.
*/ - hash_bits = ilog2(p->cq_entries); - hash_bits -= 5; - if (hash_bits <= 0) - hash_bits = 1; + hash_bits = ilog2(p->cq_entries) - 5; + hash_bits = clamp(hash_bits, 1, 8); + ctx->cancel_hash_bits = hash_bits; ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), From 8b1dfd343ae6a64ca2b4741ab47a162fa59f43be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:06 +0100 Subject: [PATCH 078/172] io_uring: clean up io_ring_ctx_alloc Add a variable for the number of hash buckets in io_ring_ctx_alloc(), which makes it more readable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/993926ed0d614ba9a76b2a85bebae2babcb13983.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index aafdf1330ec67d..85a479594b05aa 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -244,6 +244,8 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + unsigned hash_buckets; + size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -259,15 +261,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); + hash_buckets = 1U << hash_bits; + hash_size = hash_buckets * sizeof(struct io_hash_bucket); ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = - kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), - GFP_KERNEL); + ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); if (!ctx->cancel_hash) goto err; - init_hash_table(ctx->cancel_hash, 1U << hash_bits); + init_hash_table(ctx->cancel_hash, hash_buckets); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) From 0ec6dca223195aca2f7df6e432c4205b5f63305b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:07 +0100 Subject: [PATCH 079/172] io_uring: use state completion infra for poll reqs Use io_req_task_complete() for poll request completions, so it can utilise state completions and save lots of unnecessary locking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ced94cb5a728d8e386c640d052fd3da3f5d6891a.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 5b2c6ce26a5547..7f2ebb56f7dbf9 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -234,12 +234,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_req_delete(req, ctx); - spin_lock(&ctx->completion_lock); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_req_set_res(req, req->cqe.res, 0); + io_req_task_complete(req, locked); } static void io_apoll_task_func(struct io_kiocb *req, bool *locked) From 97bbdc06a4446bc69d8ba71d722abae542a6b70c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:08 +0100 Subject: [PATCH 080/172] io_uring: add IORING_SETUP_SINGLE_ISSUER Add a new IORING_SETUP_SINGLE_ISSUER flag and the userspace visible part of it, i.e. put limitations on submitters.
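From userspace this is just another setup flag; e.g. with liburing it would be requested like this (illustrative sketch, assuming a liburing build that already defines the constant):

	struct io_uring ring;
	struct io_uring_params p = { };
	int ret;

	p.flags = IORING_SETUP_SINGLE_ISSUER;
	/*
	 * The first task to submit becomes the ring's only allowed
	 * submitter; any other task's submission fails with -EEXIST.
	 */
	ret = io_uring_queue_init_params(8, &ring, &p);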
Also, don't allow it together with IOPOLL as we're not going to put it to good use. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4bcc41ee467fdf04c8aab8baf6ce3ba21858c3d4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++++- io_uring/io_uring.c | 7 +++++-- io_uring/io_uring_types.h | 1 + io_uring/tctx.c | 27 ++++++++++++++++++++++++--- io_uring/tctx.h | 4 ++-- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4927bb69387a29..d7ae81b10893e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -140,9 +140,12 @@ enum { * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) - #define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ #define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ +/* + * Only one task is allowed to submit requests + */ +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) enum io_uring_op { IORING_OP_NOP, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85a479594b05aa..06772139b7dad8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2457,6 +2457,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); + if (ctx->submitter_task) + put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) @@ -3189,7 +3191,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); if (ret) { put_unused_fd(fd); return ret; @@ -3409,7 +3411,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | + IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1f8db2dd7af752..8b00243abf65fa 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -243,6 +243,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct io_restriction restrictions; + struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5a5d4f908529a2..a819da8fc85cd2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -94,12 +94,32 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return 0; } -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static int io_register_submitter(struct io_ring_ctx *ctx) +{ + int ret = 0; + + mutex_lock(&ctx->uring_lock); + if (!ctx->submitter_task) + ctx->submitter_task = get_task_struct(current); + else if (ctx->submitter_task != current) + ret = -EEXIST; + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; + if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) { + ret = io_register_submitter(ctx); + if (ret) + return 
ret; + } + if (unlikely(!tctx)) { ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) @@ -133,7 +153,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = ctx; + if (submitter) + tctx->last = ctx; return 0; } @@ -241,7 +262,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, return -EINVAL; mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); mutex_lock(&ctx->uring_lock); if (ret) return ret; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 7684713e950f4f..dde82ce4d8e29b 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -34,7 +34,7 @@ struct io_tctx_node { int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); @@ -52,5 +52,5 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx); + return __io_uring_add_tctx_node(ctx, true); } From a2cdd5193218c557b4ee7828017cce83f251e99a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:09 +0100 Subject: [PATCH 081/172] io_uring: pass hash table into poll_find In preparation for having multiple cancellation hash tables, pass a table pointer into io_poll_find() and other poll cancel functions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a31c88502463dce09254240fa037352927d7ecc3.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 7f2ebb56f7dbf9..96199c999fe60c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -562,11 +562,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_bucket *hb = &hash_table[index]; *out_bucket = NULL; @@ -590,6 +591,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; @@ -598,7 +600,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, *out_bucket = NULL; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &hash_table[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -625,15 +627,16 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[]) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & 
(IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, &bucket); + req = io_poll_file_find(ctx, cd, ctx->cancel_hash, &bucket); else - req = io_poll_find(ctx, false, cd, &bucket); + req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); if (req) io_poll_cancel_req(req); @@ -642,6 +645,11 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return req ? 0 : -ENOENT; } +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +{ + return __io_poll_cancel(ctx, cd, ctx->cancel_hash); +} + static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { @@ -737,7 +745,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, &bucket); + preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From e6f89be61410ff5a0e690d87d7a308e81f0f5a71 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:10 +0100 Subject: [PATCH 082/172] io_uring: introduce a struct for hash table Instead of passing around a pointer to hash buckets, add a bit of type safety and wrap it into a structure. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d65bc3faba537ec2aca9eabf334394936d44bd28.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 6 +++--- io_uring/cancel.h | 7 +------ io_uring/fdinfo.c | 4 ++-- io_uring/io_uring.c | 29 ++++++++++++++++------------ io_uring/io_uring_types.h | 13 +++++++++++-- io_uring/poll.c | 40 +++++++++++++++++++++------------------ 6 files changed, 56 insertions(+), 43 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index a253e2ad22ebd0..f28f0a7d127240 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -193,12 +193,12 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size) +void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { - spin_lock_init(&hash_table[i].lock); - INIT_HLIST_HEAD(&hash_table[i].list); + spin_lock_init(&table->hbs[i].lock); + INIT_HLIST_HEAD(&table->hbs[i].list); } } diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 556a7dcf160e47..fd4cb1a2595de5 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -4,9 +4,4 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size); - -struct io_hash_bucket { - spinlock_t lock; - struct hlist_head list; -} ____cacheline_aligned_in_smp; +void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f941c73f550259..344e7d90d55756 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -158,8 +158,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); seq_puts(m, "PollList:\n"); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; spin_lock(&hb->lock); diff --git a/io_uring/io_uring.c 
b/io_uring/io_uring.c index 06772139b7dad8..0b3851a0db2ba1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -241,11 +241,23 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_put(&ctx->refs); } +static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) +{ + unsigned hash_buckets = 1U << bits; + size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + + table->hbs = kmalloc(hash_size, GFP_KERNEL); + if (!table->hbs) + return -ENOMEM; + + table->hash_bits = bits; + init_hash_table(table, hash_buckets); + return 0; +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - unsigned hash_buckets; - size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -261,16 +273,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); - hash_buckets = 1U << hash_bits; - hash_size = hash_buckets * sizeof(struct io_hash_bucket); - - ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); - if (!ctx->cancel_hash) + if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - init_hash_table(ctx->cancel_hash, hash_buckets); - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) goto err; @@ -311,7 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2487,7 +2492,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 8b00243abf65fa..d3b9bde9c702c6 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -9,6 +9,16 @@ #include "io-wq.h" #include "filetable.h" +struct io_hash_bucket { + spinlock_t lock; + struct hlist_head list; +} ____cacheline_aligned_in_smp; + +struct io_hash_table { + struct io_hash_bucket *hbs; + unsigned hash_bits; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -224,8 +234,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. 
*/ struct io_wq_work_list iopoll_list; - struct io_hash_bucket *cancel_hash; - unsigned cancel_hash_bits; + struct io_hash_table cancel_table; bool poll_multi_queue; struct list_head io_buffers_comp; diff --git a/io_uring/poll.c b/io_uring/poll.c index 96199c999fe60c..ea6466388ed9ef 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -73,9 +73,9 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; spin_lock(&hb->lock); hlist_add_head(&req->hash_node, &hb->list); @@ -84,8 +84,9 @@ static void io_poll_req_insert(struct io_kiocb *req) static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) { - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - spinlock_t *lock = &ctx->cancel_hash[index].lock; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + spinlock_t *lock = &table->hbs[index].lock; spin_lock(lock); hash_del(&req->hash_node); @@ -539,13 +540,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { + struct io_hash_table *table = &ctx->cancel_table; + unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < nr_buckets; i++) { + struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { @@ -562,12 +565,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { struct io_kiocb *req; - u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &hash_table[index]; + u32 index = hash_long(cd->data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; *out_bucket = NULL; @@ -591,16 +594,17 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { + unsigned nr_buckets = 1U << table->hash_bits; struct io_kiocb *req; int i; *out_bucket = NULL; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &hash_table[i]; + for (i = 0; i < nr_buckets; i++) { + struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -628,15 +632,15 @@ static bool io_poll_disarm(struct io_kiocb *req) } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[]) + struct io_hash_table *table) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - 
req = io_poll_file_find(ctx, cd, ctx->cancel_hash, &bucket); + req = io_poll_file_find(ctx, cd, table, &bucket); else - req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); + req = io_poll_find(ctx, false, cd, table, &bucket); if (req) io_poll_cancel_req(req); @@ -647,7 +651,7 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { - return __io_poll_cancel(ctx, cd, ctx->cancel_hash); + return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -745,7 +749,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From 5d7943d99df9326c7b02f773b2d6f09709c30594 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:11 +0100 Subject: [PATCH 083/172] io_uring: propagate locking state to poll cancel Poll cancellation will soon need to grab ->uring_lock internally, so pass the locking state, i.e. issue_flags, into the cancellation functions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b86781d047727c07163443b57551a3fa57c7c5e1.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 7 ++++--- io_uring/cancel.h | 3 ++- io_uring/poll.c | 3 ++- io_uring/poll.h | 3 ++- io_uring/timeout.c | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f28f0a7d127240..f07bfd27c98ac2 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -78,7 +78,8 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -93,7 +94,7 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) if (!ret) return 0; - ret = io_poll_cancel(ctx, cd); + ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; @@ -136,7 +137,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd); + ret = io_try_cancel(req, cd, issue_flags); if (ret == -ENOENT) break; if (!all) diff --git a/io_uring/cancel.h b/io_uring/cancel.h index fd4cb1a2595de5..8dd259dc383e2c 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -3,5 +3,6 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/poll.c b/io_uring/poll.c index ea6466388ed9ef..c4edf8794538c8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -649,7 +649,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return req ?
0 : -ENOENT; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags) { return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } diff --git a/io_uring/poll.h b/io_uring/poll.h index cc75c1567a84ae..fa3e19790281b2 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,7 +24,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 69cca42d6835b4..526fc8b2e3b65d 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -262,6 +262,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -273,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd); + ret = io_try_cancel(req, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From 9ca9fb24d5febccea354089c41f96a8ad0d853f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:12 +0100 Subject: [PATCH 084/172] io_uring: mutex locked poll hashing Currently we do two extra spin lock/unlock pairs to add a poll/apoll request to the cancellation hash table and remove it from there. On the submission side we often already hold ->uring_lock and tw completion is likely to hold it as well. Add a second cancellation hash table protected by ->uring_lock. Out of concern for latency due to the need to have the mutex locked on the completion side, use the new table only in the following cases: 1) IORING_SETUP_SINGLE_ISSUER: only one task grabs uring_lock, so there is little to no contention and so the main tw handler will almost always end up grabbing it before calling callbacks. 2) IORING_SETUP_SQPOLL: same as with single issuer, only one task is a major user of ->uring_lock. 3) apoll: we normally grab the lock on the completion side anyway to execute the request, so it's free.
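Condensed, the table selection implemented below boils down to something like this (illustrative sketch, not the patch's literal code):

	static bool io_poll_use_locked_table(struct io_kiocb *req,
					     unsigned issue_flags, bool apoll)
	{
		/* the completion side must be able to take ->uring_lock cheaply */
		if (issue_flags & IO_URING_F_UNLOCKED)
			return false;
		/* apoll already completes under the mutex, so it's free */
		if (apoll)
			return true;
		/* otherwise only when a single task dominates ->uring_lock */
		return req->ctx->flags & (IORING_SETUP_SQPOLL |
					  IORING_SETUP_SINGLE_ISSUER);
	}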
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1bbad9c78c454b7b92f100bbf46730a37df7194f.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++- io_uring/io_uring_types.h | 4 ++ io_uring/poll.c | 117 +++++++++++++++++++++++++++++++------- 3 files changed, 108 insertions(+), 22 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b3851a0db2ba1..eeda1673179544 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -275,6 +275,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; + if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) + goto err; ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) @@ -317,6 +319,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2493,6 +2496,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); @@ -2654,12 +2658,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); + if (ctx->rings) + io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ if (ctx->rings) { io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); } @@ -2784,7 +2789,9 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work(); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index d3b9bde9c702c6..65ac7cdaaa73f4 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -191,6 +191,7 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; + struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct list_head apoll_cache; struct xarray personalities; @@ -323,6 +324,7 @@ enum { REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, + REQ_F_HASH_LOCKED_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -393,6 +395,8 @@ enum { REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ + REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); diff --git a/io_uring/poll.c b/io_uring/poll.c index c4edf8794538c8..9ae2982aef7c6c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,6 +93,32 @@ static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx 
*ctx) spin_unlock(lock); } +static void io_poll_req_insert_locked(struct io_kiocb *req) +{ + struct io_hash_table *table = &req->ctx->cancel_table_locked; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + + hlist_add_head(&req->hash_node, &table->hbs[index].list); +} + +static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & REQ_F_HASH_LOCKED) { + /* + * ->cancel_table_locked is protected by ->uring_lock in + * contrast to per bucket spinlocks. Likely, tctx_task_work() + * already grabbed the mutex for us, but there is a chance it + * failed. + */ + io_tw_lock(ctx, locked); + hash_del(&req->hash_node); + } else { + io_poll_req_delete(req, ctx); + } +} + static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { @@ -217,7 +243,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) static void io_poll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -234,7 +259,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); - io_poll_req_delete(req, ctx); + io_poll_tw_hash_eject(req, locked); + io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, locked); } @@ -248,7 +274,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - io_poll_req_delete(req, req->ctx); + io_poll_tw_hash_eject(req, locked); if (!ret) io_req_task_submit(req, locked); @@ -444,7 +470,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - io_poll_req_insert(req); + if (req->flags & REQ_F_HASH_LOCKED) + io_poll_req_insert_locked(req); + else + io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -485,6 +514,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; + /* + * apoll requests already grab the mutex to complete in the tw handler, + * so removal from the mutex-backed hash is free, use it by default. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + else + req->flags |= REQ_F_HASH_LOCKED; + if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!file_can_poll(req->file)) @@ -534,13 +572,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } -/* - * Returns true if we found and killed one or more poll requests - */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all_table(struct task_struct *tsk, + struct io_hash_table *table, + bool cancel_all) { - struct io_hash_table *table = &ctx->cancel_table; unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; @@ -563,6 +598,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return found; } +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) + __must_hold(&ctx->uring_lock) +{ + return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | + io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); +} + static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, struct io_hash_table *table, @@ -622,13 +668,15 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, return NULL; } -static bool io_poll_disarm(struct io_kiocb *req) +static int io_poll_disarm(struct io_kiocb *req) { + if (!req) + return -ENOENT; if (!io_poll_get_ownership(req)) - return false; + return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); - return true; + return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, @@ -652,7 +700,16 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags) { - return __io_poll_cancel(ctx, cd, &ctx->cancel_table); + int ret; + + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); + if (ret != -ENOENT) + return ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); + io_ring_submit_unlock(ctx, issue_flags); + return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -727,6 +784,16 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; + /* + * If sqpoll or single issuer, there is no contention for ->uring_lock + * and we'll end up holding it in tw handlers anyway. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + req->flags |= REQ_F_HASH_LOCKED; + else + req->flags &= ~REQ_F_HASH_LOCKED; + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (ret) { io_req_set_res(req, ret, 0); @@ -751,20 +818,28 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) bool locked; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); - if (preq) - ret2 = io_poll_disarm(preq); + ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - - if (!preq) { - ret = -ENOENT; + if (!ret2) + goto found; + if (ret2 != -ENOENT) { + ret = ret2; goto out; } - if (!ret2) { - ret = -EALREADY; + + io_ring_submit_lock(ctx, issue_flags); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + io_ring_submit_unlock(ctx, issue_flags); + if (ret2) { + ret = ret2; goto out; } +found: if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From f09c8643f0fad0e287b9f737955276000fd76a5d Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 17 Jun 2022 13:04:29 +0800 Subject: [PATCH 085/172] io_uring: kbuf: add comments for some tricky code Add comments to explain why the head increment in __io_kbuf_recycle() always happens under the uring lock, and rectify one comment about kbuf consumption in the io-wq case. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220617050429.94293-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index b9c7f6e87cc9a3..59e4fafeb28ce4 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -52,6 +52,13 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) { if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ req->buf_list->head++; req->buf_list = NULL; } else { @@ -163,12 +170,13 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { /* * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). + * buffer here, otherwise nothing ensures that the buffer won't + * get used by others. This does mean it'll be pinned until the + * IO completes, coming in unlocked means we're being called from + * io-wq context and there may be further retries in async hybrid + * mode. For the locked case, the caller must call commit when + * the transfer completes (or if we get -EAGAIN and must poll of + * retry).
*/ req->buf_list = NULL; bl->head++; From d245bca6375bccfd589a6a7d5007df28575bb626 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:00 +0100 Subject: [PATCH 086/172] io_uring: don't expose io_fill_cqe_aux() Deduplicate some code and add a helper that fills an aux CQE and handles the locking and notification. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7c6557c8f9dc5c4cfb01292116c682a0ff61081.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++++++-- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 11 +---------- io_uring/net.c | 20 +++++--------------- io_uring/poll.c | 24 ++++++++----------------- io_uring/rsrc.c | 14 +++++--------- 6 files changed, 36 insertions(+), 54 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eeda1673179544..8c1b0e0ce5bb8a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -676,8 +676,8 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -704,6 +704,20 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) + io_cqring_ev_posted(ctx); + return filled; +} + static void __io_req_complete_put(struct io_kiocb *req) { /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3f06fbae0ee9e5..18754fb790255e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -239,8 +239,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 3b89f9a0a0b459..7c3c5f3ab06b57 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -34,7 +34,6 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - bool filled; int ret; ret = -EBADFD; @@ -43,16 +42,8 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = -EOVERFLOW; target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = 0; - } done: if (ret < 0) diff --git a/io_uring/net.c b/io_uring/net.c index fe1fe920b9291f..35d0183fe75817 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -644,22 +644,12 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return
IOU_OK; } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - return ret; + if (ret < 0) + return ret; + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + goto retry; + return -ECANCELED; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/poll.c b/io_uring/poll.c index 9ae2982aef7c6c..e0c181fe6264c0 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -214,23 +214,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - ret = io_poll_issue(req, locked); - if (ret) - return ret; + if (!io_post_aux_cqe(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE)) + return -ECANCELED; + } else { + ret = io_poll_issue(req, locked); + if (ret) + return ret; + } /* * Release all references, retry if someone tried to restart diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 214ff0dfa6a48e..7fed3105152a95 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -174,17 +174,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) + if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); + } else { + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + } } rsrc_data->do_put(ctx, prsrc); From faf88dde060f74117b3a86a62cb32a20f27fd636 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:01 +0100 Subject: [PATCH 087/172] io_uring: don't inline __io_get_cqe() __io_get_cqe() is not as hot as io_get_cqe(), so there is no need to inline it; uninlining sheds ~500B from the binary.
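For intuition, the contiguous-range maths in the moved function works out as follows for an 8-entry CQ ring with cached tail 6 and head 3 (CQE32 disabled, so shift == 0):

	queued = min(6 - 3, 8) = 3;	/* CQEs posted but not yet consumed */
	free   = 8 - 3 = 5;		/* slots userspace has freed up */
	off    = 6 & (8 - 1) = 6;
	len    = min(5, 8 - 6) = 2;	/* limited by the wrap-around point */

so cqe_cached/cqe_sentinel cover the two slots up to the wrap, and once those are used the next call starts a fresh range at offset 0.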
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1ac829198a881b7af8710926f99a3559b9f24c0.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 36 +----------------------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8c1b0e0ce5bb8a..df6a9abdd966ad 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -166,6 +166,11 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -676,6 +681,36 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 18754fb790255e..94bd6732f55873 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -14,44 +14,10 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, u64 extra1, u64 extra2); -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < 
ctx->cqe_sentinel)) {
From 68494a65d0e2de3d99b28ae050971b6161eabac0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 17 Jun 2022 09:48:02 +0100
Subject: [PATCH 088/172] io_uring: introduce io_req_cqe_overflow()

__io_fill_cqe_req() is hot and inlined, we want it to be as small as
possible. Add io_req_cqe_overflow() accepting only a request and doing
all overflow accounting, and use it to replace the two calls to the
six-argument io_cqring_event_overflow().

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/048b9fbcce56814d77a1a540409c98c3d383edcb.1655455613.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 15 +++++++++++++--
 io_uring/io_uring.h | 12 ++----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index df6a9abdd966ad..7acb94c180b8e8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -643,8 +643,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 	}
 }

-bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
-			      u32 cflags, u64 extra1, u64 extra2)
+static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+				     s32 res, u32 cflags, u64 extra1, u64 extra2)
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
@@ -681,6 +681,17 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	return true;
 }

+bool io_req_cqe_overflow(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_CQE32_INIT)) {
+		req->extra1 = 0;
+		req->extra2 = 0;
+	}
+	return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
+					req->cqe.res, req->cqe.flags,
+					req->extra1, req->extra2);
+}
+
 /*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 94bd6732f55873..88c64a69beaaeb 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -15,8 +15,7 @@ enum {
 };

 struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
-bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
-			      u32 cflags, u64 extra1, u64 extra2);
+bool io_req_cqe_overflow(struct io_kiocb *req);

 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 {
@@ -56,10 +55,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 			memcpy(cqe, &req->cqe, sizeof(*cqe));
 			return true;
 		}
-
-		return io_cqring_event_overflow(ctx, req->cqe.user_data,
-						req->cqe.res, req->cqe.flags,
-						0, 0);
 	} else {
 		u64 extra1 = 0, extra2 = 0;

@@ -83,11 +78,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 			WRITE_ONCE(cqe->big_cqe[1], extra2);
 			return true;
 		}
-
-		return io_cqring_event_overflow(ctx, req->cqe.user_data,
-						req->cqe.res, req->cqe.flags,
-						extra1, extra2);
 	}
+	return io_req_cqe_overflow(req);
 }

 static inline void req_set_fail(struct io_kiocb *req)
From ae5735c69bf28fea0b8260a03caeb906319228a2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 17 Jun 2022 09:48:03 +0100
Subject: [PATCH 089/172] io_uring: deduplicate __io_fill_cqe_req tracing

Deduplicate two trace_io_uring_complete() calls in __io_fill_cqe_req().
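The end state is a single unconditional trace call at the top of __io_fill_cqe_req(), with the CQE32 extras passed conditionally. In sketch form, matching the hunks below:

	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);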
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/277ed85dba5189ab7d932164b314013a0f0b0fdc.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 88c64a69beaaeb..763915c665933f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -41,10 +41,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (!(ctx->flags & IORING_SETUP_CQE32)) { /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -63,9 +65,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in From e8c328c3913d381bf60f2aecdf350c04b5b7e67d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:04 +0100 Subject: [PATCH 090/172] io_uring: deduplicate io_get_cqe() calls Deduplicate calls to io_get_cqe() from __io_fill_cqe_req(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4fa077986cc3abab7c59ff4e7c390c783885465f.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 763915c665933f..dfb490e7cf4593 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -45,19 +45,17 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, req->cqe.res, req->cqe.flags, (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (unlikely(!cqe)) + return io_req_cqe_overflow(req); + memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (!(ctx->flags & IORING_SETUP_CQE32)) { - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - } else { + if (ctx->flags & IORING_SETUP_CQE32) { u64 extra1 = 0, extra2 = 0; if (req->flags & REQ_F_CQE32_INIT) { @@ -65,20 +63,10 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); } - return io_req_cqe_overflow(req); + return true; } static inline void req_set_fail(struct io_kiocb *req) From b3659a65be70eb68d9fc9802c4ce81e0f943abfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:05 +0100 Subject: [PATCH 091/172] io_uring: change ->cqe_cached invariant for CQE32 With IORING_SETUP_CQE32 ->cqe_cached doesn't store a real address but rather an implicit offset into cqes. Store the real cqe pointer and increment it accordingly if CQE32. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ee1838cba16bed96381a006950b36ba640d998c.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++++----- io_uring/io_uring.h | 8 ++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7acb94c180b8e8..0dbf6a74f9f3a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -701,11 +701,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; unsigned int free, queued, len; - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -715,11 +712,19 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) if (!len) return NULL; - ctx->cached_cq_tail++; + if (ctx->flags & IORING_SETUP_CQE32) { + off <<= 1; + len <<= 1; + } + ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; + + ctx->cached_cq_tail++; ctx->cqe_cached++; - return &rings->cqes[off << shift]; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return &rings->cqes[off]; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dfb490e7cf4593..558a860a93fcd6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -22,14 +22,10 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { struct io_uring_cqe *cqe = ctx->cqe_cached; - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - ctx->cached_cq_tail++; ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; return cqe; } From 27a9d66fec77cff0e32d2ecd5d0ac7ef878a7bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:18 +0100 Subject: [PATCH 092/172] io_uring: kill extra io_uring_types.h includes io_uring/io_uring.h already includes io_uring_types.h, no need to include it every time. Kill it in a bunch of places, it prepares us for following patches. 
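The change itself is mechanical; in each affected file the pair of includes collapses to one, since io_uring.h pulls the types in transitively (sketch):

	/* before */
	#include "io_uring_types.h"
	#include "io_uring.h"

	/* after: io_uring.h already includes io_uring_types.h */
	#include "io_uring.h"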
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94d8c943fbe0ef949981c508ddcee7fc1c18850f.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/advise.c | 1 - io_uring/cancel.c | 1 - io_uring/epoll.c | 1 - io_uring/fdinfo.c | 1 - io_uring/filetable.c | 1 - io_uring/fs.c | 1 - io_uring/io_uring.c | 1 - io_uring/kbuf.c | 1 - io_uring/msg_ring.c | 1 - io_uring/net.c | 1 - io_uring/nop.c | 1 - io_uring/opdef.c | 1 - io_uring/openclose.c | 1 - io_uring/poll.c | 1 - io_uring/rsrc.c | 1 - io_uring/rw.c | 1 - io_uring/splice.c | 1 - io_uring/sqpoll.c | 1 - io_uring/statx.c | 1 - io_uring/sync.c | 1 - io_uring/tctx.c | 1 - io_uring/timeout.c | 1 - io_uring/uring_cmd.c | 1 - io_uring/xattr.c | 1 - 24 files changed, 24 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index 8870fdf66ffbbd..581956934c0bf0 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -11,7 +11,6 @@ #include #include -#include "io_uring_types.h" #include "io_uring.h" #include "advise.h" diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f07bfd27c98ac2..d1e7f5a955ab20 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "tctx.h" #include "poll.h" diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 10853e8ed07887..a8b794471d6b83 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "epoll.h" diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 344e7d90d55756..61c35707a6cfc2 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "sqpoll.h" #include "fdinfo.h" diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e449ceb9a848e6..534e1a3c625d9f 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "rsrc.h" #include "filetable.h" diff --git a/io_uring/fs.c b/io_uring/fs.c index aac1bc5255b07f..0de4f549bb7df6 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -12,7 +12,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "fs.h" diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0dbf6a74f9f3a1..0a1f83a936b753 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include "io-wq.h" -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 59e4fafeb28ce4..62de0dda24bf6e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -11,7 +11,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7c3c5f3ab06b57..b02be23496521c 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "msg_ring.h" diff --git a/io_uring/net.c b/io_uring/net.c index 35d0183fe75817..b77bfbfb081673 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "kbuf.h" #include "net.h" diff --git a/io_uring/nop.c b/io_uring/nop.c index d363d8ce70a3b7..d956599a3c1b8f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "nop.h" diff --git a/io_uring/opdef.c 
b/io_uring/opdef.c
index d687d33f9c0c03..a7b84b43e6c235 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -8,7 +8,6 @@
 #include
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "refs.h"
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 1cbf3903097053..099a5ec84dfdb7 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -12,7 +12,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "rsrc.h"
 #include "openclose.h"
diff --git a/io_uring/poll.c b/io_uring/poll.c
index e0c181fe6264c0..63aca920543b31 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -13,7 +13,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "refs.h"
 #include "opdef.h"
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7fed3105152a95..68629eba413265 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -12,7 +12,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e5ca23d0783e2c..f8b42f2265df98 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -14,7 +14,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "kbuf.h"
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 0e19d63303452a..b013ba34bffa58 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -11,7 +11,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "splice.h"
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 149d5c976f1467..76d4d70c733a99 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -14,7 +14,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "sqpoll.h"
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 83b15687e9c5ee..6056cd7f48761c 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -8,7 +8,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "statx.h"
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 9ee8ff865521f3..f2102afa79ca63 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -11,7 +11,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "sync.h"
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index a819da8fc85cd2..9b30fb0d360307 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -9,7 +9,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "tctx.h"
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 526fc8b2e3b65d..f9df359813c93e 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -8,7 +8,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "refs.h"
 #include "cancel.h"
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index abf78918a0995f..0a421ed51e7e16 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -6,7 +6,6 @@
 #include
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "uring_cmd.h"
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 79adf4efba0184..b179f9acd5acc5 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -13,7 +13,6 @@
 #include "../fs/internal.h"
-#include "io_uring_types.h"
 #include "io_uring.h"
 #include "xattr.h"
From ab1c84d855cf2c284fa0f4b17fc04063659c54a1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Jun 2022 13:57:19 +0100
Subject: [PATCH 093/172] io_uring: make io_uring_types.h public

Move the io_uring types to include/linux; they need to be public so
tracing can see the definitions and we can clean up
trace/events/io_uring.h Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a15f12e8cb7289b2de0deaddcc7518d98a132d17.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- {io_uring => include/linux}/io_uring_types.h | 28 ++++++++++++++++++-- io_uring/filetable.h | 11 -------- io_uring/io-wq.h | 17 +----------- io_uring/io_uring.h | 4 ++- io_uring/refs.h | 2 +- 5 files changed, 31 insertions(+), 31 deletions(-) rename {io_uring => include/linux}/io_uring_types.h (96%) diff --git a/io_uring/io_uring_types.h b/include/linux/io_uring_types.h similarity index 96% rename from io_uring/io_uring_types.h rename to include/linux/io_uring_types.h index 65ac7cdaaa73f4..779c72da5b8faf 100644 --- a/io_uring/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -6,8 +6,32 @@ #include #include -#include "io-wq.h" -#include "filetable.h" +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +struct io_wq_work { + struct io_wq_work_node list; + unsigned flags; + /* place it here instead of io_kiocb as it fills padding and saves 4B */ + int cancel_seq; +}; + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; struct io_hash_bucket { spinlock_t lock; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index c404360f709053..6b58aa48bc45d3 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -22,17 +22,6 @@ struct io_kiocb; #endif #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 3f54ee2a8eebd3..10b80ef78bb817 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -2,6 +2,7 @@ #define INTERNAL_IO_WQ_H #include +#include struct io_wq; @@ -20,15 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) @@ -152,13 +144,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) return node; } -struct io_wq_work { - struct io_wq_work_node list; - unsigned flags; - /* place it here instead of io_kiocb as it fills padding and saves 4B */ - int cancel_seq; -}; - static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) { if (!work->list.next) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 558a860a93fcd6..5eaa01c4697c37 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,7 +3,9 @@ #include #include -#include "io_uring_types.h" +#include +#include "io-wq.h" +#include "filetable.h" #ifndef CREATE_TRACE_POINTS #include diff --git a/io_uring/refs.h b/io_uring/refs.h index 334c5ead4c43d9..1336de3f2a30aa 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -2,7 +2,7 @@ #define IOU_REQ_REF_H #include -#include "io_uring_types.h" +#include /* * Shamelessly stolen from the mm implementation 
of page reference checking, From 48863ffd3e81b6ec98606d3479c50aa77b7a98a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:20 +0100 Subject: [PATCH 094/172] io_uring: clean up tracing events We have lots of trace events accepting an io_uring request and wanting to print some of its fields like user_data, opcode, flags and so on. However, as trace points were unaware of io_uring structures, we had to pass all the fields as arguments. Teach trace/events/io_uring.h about struct io_kiocb and stop the misery of passing a horde of arguments to trace helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40ff72f92798114e56d400f2b003beb6cde6ef53.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 142 +++++++++++++------------------- io_uring/io_uring.c | 16 ++-- io_uring/poll.c | 5 +- io_uring/timeout.c | 3 +- 4 files changed, 66 insertions(+), 100 deletions(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index aa2f951b07cdff..3bc8dec9acaacf 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,7 @@ #include #include +#include #include struct io_wq_work; @@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register, /** * io_uring_file_get - called before getting references to an SQE file * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can @@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register, */ TRACE_EVENT(io_uring_file_get, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd), + TP_PROTO(struct io_kiocb *req, int fd), - TP_ARGS(ctx, req, user_data, fd), + TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) @@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; + __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), @@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get, /** * io_uring_queue_async_work - called before submitting a new async work * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags - * @work: pointer to a submitted io_wq_work * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. 
*/ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode, - unsigned int flags, struct io_wq_work *work, int rw), + TP_PROTO(struct io_kiocb *req, int rw), - TP_ARGS(ctx, req, user_data, opcode, flags, work, rw), + TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) @@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work, __field( struct io_wq_work *, work ) __field( int, rw ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->flags = flags; - __entry->opcode = opcode; - __entry->work = work; + __entry->user_data = req->cqe.user_data; + __entry->flags = req->flags; + __entry->opcode = req->opcode; + __entry->work = &req->work; __entry->rw = rw; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", @@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work, /** * io_uring_defer - called when an io_uring request is deferred * - * @ctx: pointer to a ring context structure * @req: pointer to a deferred request - * @user_data: user data associated with the request - * @opcode: opcode of request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode), + TP_PROTO(struct io_kiocb *req), - TP_ARGS(ctx, req, user_data, opcode), + TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer, __field( unsigned long long, data ) __field( u8, opcode ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->data = user_data; - __entry->opcode = opcode; + __entry->data = req->cqe.user_data; + __entry->opcode = req->opcode; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", @@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer, * io_uring_link - called before the io_uring request added into link_list of * another request * - * @ctx: pointer to a ring context structure * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * @@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer, */ TRACE_EVENT(io_uring_link, - TP_PROTO(void *ctx, void *req, void *target_req), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), - TP_ARGS(ctx, req, target_req), + TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), @@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait, /** * io_uring_fail_link - called before failing a linked request * - * @ctx: pointer to a ring context structure * @req: request, which links were cancelled - * @user_data: user data associated with the request - * @opcode: opcode of request * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work @@ -296,9 +282,9 
@@ TRACE_EVENT(io_uring_cqring_wait, */ TRACE_EVENT(io_uring_fail_link, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), - TP_ARGS(ctx, req, user_data, opcode, link), + TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) @@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link, __field( u8, opcode ) __field( void *, link ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->link = link; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", @@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete, /** * io_uring_submit_sqe - called before submitting one SQE * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags * @force_nonblock: whether a context blocking or not - * @sq_thread: true if sq_thread has submitted this SQE * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags, - bool force_nonblock, bool sq_thread), + TP_PROTO(struct io_kiocb *req, bool force_nonblock), - TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread), + TP_ARGS(req, force_nonblock), TP_STRUCT__entry ( __field( void *, ctx ) @@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe, __field( bool, force_nonblock ) __field( bool, sq_thread ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; - __entry->flags = flags; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; + __entry->flags = req->flags; __entry->force_nonblock = force_nonblock; - __entry->sq_thread = sq_thread; + __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " @@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe, /* * io_uring_poll_arm - called after arming a poll wait if successful * - * @ctx: pointer to a ring context structure * @req: pointer to the armed request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * @events: registered events of interest * @@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe, */ TRACE_EVENT(io_uring_poll_arm, - TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode, - int mask, int events), + TP_PROTO(struct io_kiocb *req, int mask, int events), - TP_ARGS(ctx, req, user_data, opcode, mask, events), + TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) @@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm, __field( int, mask ) __field( int, events ) - __string( op_str, 
io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", @@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm, /* * io_uring_task_add - called after adding a task * - * @ctx: pointer to a ring context structure * @req: pointer to request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask), + TP_PROTO(struct io_kiocb *req, int mask), - TP_ARGS(ctx, req, user_data, opcode, mask), + TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) @@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add, __field( u8, opcode ) __field( int, mask ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", @@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add, * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed - * @ctx: pointer to a ring context structure * @req: pointer to request * @error: error it failed with * @@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add, */ TRACE_EVENT(io_uring_req_failed, - TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error), + TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), - TP_ARGS(sqe, ctx, req, error), + TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) @@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a1f83a936b753..ef4371790aaa5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -452,9 +452,7 @@ void io_queue_iowq(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, - req->opcode, req->flags, &req->work, - io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1583,7 +1581,7 @@ static __cold void io_drain_req(struct io_kiocb *req) goto queue; } - trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); + trace_io_uring_defer(req); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -1783,7 +1781,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) { 
struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); + trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) @@ -2006,7 +2004,7 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; - trace_io_uring_req_failed(sqe, ctx, req, ret); + trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL @@ -2048,9 +2046,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, - req->flags, true, - ctx->flags & IORING_SETUP_SQPOLL); + trace_io_uring_submit_sqe(req, true); /* * If we already have a head request, queue this one for async @@ -2064,7 +2060,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(ctx, req, link->head); + trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; diff --git a/io_uring/poll.c b/io_uring/poll.c index 63aca920543b31..b2659b56c702f3 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -288,7 +288,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + trace_io_uring_task_add(req, mask); io_req_task_work_add(req); } @@ -558,8 +558,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); + trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f9df359813c93e..557c637af158be 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -115,8 +115,7 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); + trace_io_uring_fail_link(req, link); if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; From ad163a7e2562230c77102c60f668bac440e60cce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 19:44:33 -0600 Subject: [PATCH 095/172] io_uring: move a few private types to local headers Commit 3a3d47fa9cfd ("io_uring: make io_uring_types.h public") moved a bunch of io_uring types to a kernel wide header, so we could make tracing a bit saner rather than pass in a ton of arguments. However, there are a few types in there that are not really needed to be system wide. Move the cancel data and mapped buffers back to the appropriate io_uring local headers. 
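Note the technique that keeps the types private: a header that only passes the type around by pointer gets a bare forward declaration instead of the full definition, e.g. in poll.h (taken from the hunks below):

	struct io_cancel_data;
	int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
			   unsigned issue_flags);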
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 18 ------------------ io_uring/cancel.h | 13 +++++++++++++ io_uring/fdinfo.c | 1 + io_uring/poll.h | 1 + io_uring/rsrc.h | 8 ++++++++ io_uring/timeout.h | 1 + 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 779c72da5b8faf..2015f3ea7cb76b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -528,27 +528,9 @@ struct io_kiocb { struct io_wq_work work; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_overflow_cqe { struct list_head list; struct io_uring_cqe cqe; }; -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - #endif diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 8dd259dc383e2c..2338012a5b06fb 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -1,5 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include + +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + + int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 61c35707a6cfc2..b29e2d02216f22 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -13,6 +13,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" +#include "rsrc.h" #ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, diff --git a/io_uring/poll.h b/io_uring/poll.h index fa3e19790281b2..c40673d7da0199 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,6 +24,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); +struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 872c86312cbc2d..03f26516e99463 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -45,6 +45,14 @@ struct io_rsrc_node { bool done; }; +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index dd7cfb0d936671..858c62644897a5 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -22,6 +22,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) } __cold void io_flush_timeouts(struct io_ring_ctx *ctx); +struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); From d142c3ec8d160bea9801f0d727e92007787df8c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:04 +0100 Subject: [PATCH 096/172] io_uring: remove extra io_commit_cqring() We don't post events in __io_commit_cqring_flush() anymore but send all requests to tw, so no need to do 
io_commit_cqring() there.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/f2481e32375e749be89c42e4804268b608722cef.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ef4371790aaa5d..efad2d9b7b42c5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -480,7 +480,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 			io_flush_timeouts(ctx);
 		if (ctx->drain_active)
 			io_queue_deferred(ctx);
-		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
 	}
 	if (ctx->has_evfd)
From 9046c6415be60f51f60f8b771a74ac4e72e3599d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 19 Jun 2022 12:26:05 +0100
Subject: [PATCH 097/172] io_uring: reshuffle io_uring/io_uring.h

It's a good idea to first do forward declarations and then inline
helpers, otherwise we will keep stumbling on dependencies between them.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/1d7fa6672ed43f20ccc0c54ae201369ebc3ebfab.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 95 ++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 48 deletions(-)

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 5eaa01c4697c37..7b2055b342dfab 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -18,6 +18,53 @@ enum {
 struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
 bool io_req_cqe_overflow(struct io_kiocb *req);
+int io_run_task_work_sig(void);
+void io_req_complete_failed(struct io_kiocb *req, s32 res);
+void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
+void io_req_complete_post(struct io_kiocb *req);
+void __io_req_complete_post(struct io_kiocb *req);
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
+void io_cqring_ev_posted(struct io_ring_ctx *ctx);
+void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
+
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+
+struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+			       unsigned issue_flags);
+
+bool io_is_uring_fops(struct file *file);
+bool io_alloc_async_data(struct io_kiocb *req);
+void io_req_task_work_add(struct io_kiocb *req);
+void io_req_task_prio_work_add(struct io_kiocb *req);
+void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
+void io_req_task_queue(struct io_kiocb *req);
+void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
+void io_req_task_complete(struct io_kiocb *req, bool *locked);
+void io_req_task_queue_fail(struct io_kiocb *req, int ret);
+void io_req_task_submit(struct io_kiocb *req, bool *locked);
+void tctx_task_work(struct callback_head *cb);
+__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
+int io_uring_alloc_task_context(struct task_struct *task,
+				struct io_ring_ctx *ctx);
+
+int io_poll_issue(struct io_kiocb *req, bool *locked);
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
+void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
+int io_req_prep_async(struct io_kiocb *req);
+
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+void io_wq_submit_work(struct io_wq_work *work);
+
+void io_free_req(struct io_kiocb *req);
+void io_queue_next(struct io_kiocb *req);
+
+bool
io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { @@ -177,52 +224,4 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -int io_run_task_work_sig(void); -void io_req_complete_failed(struct io_kiocb *req, s32 res); -void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __io_commit_cqring_flush(struct io_ring_ctx *ctx); - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - -struct file *io_file_get_normal(struct io_kiocb *req, int fd); -struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); - -bool io_is_uring_fops(struct file *file); -bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); -void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); -void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); -void tctx_task_work(struct callback_head *cb); -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); - -int io_poll_issue(struct io_kiocb *req, bool *locked); -int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); -int io_req_prep_async(struct io_kiocb *req); - -struct io_wq_work *io_wq_free_work(struct io_wq_work *work); -void io_wq_submit_work(struct io_wq_work *work); - -void io_free_req(struct io_kiocb *req); -void io_queue_next(struct io_kiocb *req); - -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all); - -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - #endif From a830ffd28780627b6287bcd5b84e9fe2dd795935 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:06 +0100 Subject: [PATCH 098/172] io_uring: move io_eventfd_signal() Move io_eventfd_signal() in the sources without any changes and kill its forward declaration. 
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/9ebebb3f6f56f5a5448a621e0b6a537720c43334.1655637157.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index efad2d9b7b42c5..61d4e6d0731a20 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -142,8 +142,6 @@ static void io_queue_sqe(struct io_kiocb *req);

 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);

-static void io_eventfd_signal(struct io_ring_ctx *ctx);
-
 static struct kmem_cache *req_cachep;

 struct sock *io_uring_get_socket(struct file *file)
@@ -472,20 +470,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 	}
 }

-void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
-	if (ctx->off_timeout_used || ctx->drain_active) {
-		spin_lock(&ctx->completion_lock);
-		if (ctx->off_timeout_used)
-			io_flush_timeouts(ctx);
-		if (ctx->drain_active)
-			io_queue_deferred(ctx);
-		spin_unlock(&ctx->completion_lock);
-	}
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
-}
-
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd;
@@ -513,6 +497,20 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 	rcu_read_unlock();
 }

+void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+	if (ctx->off_timeout_used || ctx->drain_active) {
+		spin_lock(&ctx->completion_lock);
+		if (ctx->off_timeout_used)
+			io_flush_timeouts(ctx);
+		if (ctx->drain_active)
+			io_queue_deferred(ctx);
+		spin_unlock(&ctx->completion_lock);
+	}
+	if (ctx->has_evfd)
+		io_eventfd_signal(ctx);
+}
+
 /*
  * This should only get called when at least one event has been posted.
  * Some applications rely on the eventfd notification count only changing
From d9dee4302a7cbd6c0142dbdf6d150acc7459de0d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 19 Jun 2022 12:26:08 +0100
Subject: [PATCH 099/172] io_uring: remove ->flush_cqes optimisation

It's not clear how widely used IOSQE_CQE_SKIP_SUCCESS is, and how often
the ->flush_cqes flag actually prevents a completion flush. Sometimes a
high level of concurrency enables it for at least one CQE, but sometimes
it doesn't save much because nobody is waiting on the CQ.

Remove the ->flush_cqes flag and the optimisation; it should benefit the
normal use case. Note that there is no spurious eventfd problem with
this, as the checks for spuriousness were incorporated into
io_eventfd_signal().
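Both halves of the optimisation disappear: the producer side no longer tracks whether any queued completion actually needs a CQE, and the flush runs unconditionally. The producer side being removed looked like this (sketch, taken from the hunks below):

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	/* this flag setting is what gets removed */
	if (!(req->flags & REQ_F_CQE_SKIP))
		state->flush_cqes = true;
	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}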
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/692e81eeddccc096f449a7960365fa7b4a18f8e6.1655637157.git.asml.silence@gmail.com
[axboe: remove now dead state->flush_cqes variable]
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  1 -
 io_uring/io_uring.c            | 23 ++++++++++-------------
 io_uring/io_uring.h            |  2 --
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2015f3ea7cb76b..6bcd7bff6479be 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -148,7 +148,6 @@ struct io_submit_state {

 	bool			plug_started;
 	bool			need_plug;
-	bool			flush_cqes;
 	unsigned short		submit_nr;
 	struct blk_plug		plug;
 };
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 61d4e6d0731a20..16a625e854ec33 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1250,22 +1250,19 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;

-	if (state->flush_cqes) {
-		spin_lock(&ctx->completion_lock);
-		wq_list_for_each(node, prev, &state->compl_reqs) {
-			struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    comp_list);
-
-			if (!(req->flags & REQ_F_CQE_SKIP))
-				__io_fill_cqe_req(ctx, req);
-		}
+	spin_lock(&ctx->completion_lock);
+	wq_list_for_each(node, prev, &state->compl_reqs) {
+		struct io_kiocb *req = container_of(node, struct io_kiocb,
+					    comp_list);

-		io_commit_cqring(ctx);
-		spin_unlock(&ctx->completion_lock);
-		io_cqring_ev_posted(ctx);
-		state->flush_cqes = false;
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe_req(ctx, req);
 	}
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_cqring_ev_posted(ctx);

 	io_free_batch_list(ctx, state->compl_reqs.first);
 	INIT_WQ_LIST(&state->compl_reqs);
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7b2055b342dfab..bdc62727638be8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -219,8 +219,6 @@ static inline void io_req_add_compl_list(struct io_kiocb *req)
 {
 	struct io_submit_state *state = &req->ctx->submit_state;

-	if (!(req->flags & REQ_F_CQE_SKIP))
-		state->flush_cqes = true;
 	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 }
From affa87db90108d9f017f927bcdab536e32c3915e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 20 Jun 2022 01:25:52 +0100
Subject: [PATCH 100/172] io_uring: fix multi ctx cancellation

io_uring_try_cancel_requests() loops until there is nothing left to do
with the ring; however, there might be several rings with dependencies
between them, e.g. via poll requests. Instead of cancelling rings one
by one, try to cancel them all at once and only loop again if there is
still potentially some work to do.
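In sketch form, the retry moves out of io_uring_try_cancel_requests() and into its callers, which now sweep all rings before deciding whether to go around again (abbreviated from the hunks below; the SQPOLL handling is omitted):

	do {
		bool loop = false;

		xa_for_each(&tctx->xa, index, node)
			loop |= io_uring_try_cancel_requests(node->ctx,
							     current, cancel_all);
		if (!loop)
			break;
		cond_resched();
	} while (1);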
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8d491fe02d8ac4c77ff38061cf86b9a827e8845c.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 87 ++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 16a625e854ec33..707b599b9224a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -132,7 +132,7 @@ struct io_defer_entry { #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -2648,7 +2648,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_uring_try_cancel_requests(ctx, NULL, true); + while (io_uring_try_cancel_requests(ctx, NULL, true)) + cond_resched(); + if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; @@ -2806,53 +2808,48 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + enum io_wq_cancel cret; + bool ret = false; /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) - return; - - while (1) { - enum io_wq_cancel cret; - bool ret = false; + return false; - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. - */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. 
+ */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + (ctx->sq_data && ctx->sq_data->thread == current)) { + while (!wq_list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; } - - ret |= io_cancel_defer_files(ctx, task, cancel_all); - mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); } + + ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); + ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); + ret |= io_kill_timeouts(ctx, task, cancel_all); + if (task) + ret |= io_run_task_work(); + return ret; } static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) @@ -2882,6 +2879,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) atomic_inc(&tctx->in_idle); do { + bool loop = false; + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); @@ -2896,13 +2895,19 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) continue; - io_uring_try_cancel_requests(node->ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(node->ctx, + current, cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(ctx, + current, + cancel_all); + } + + if (loop) { + cond_resched(); + continue; } prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); From ba3cdb6fbb6e8eb525c868c60e103c5711edc068 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:53 +0100 Subject: [PATCH 101/172] io_uring: improve task exit timeout cancellations Don't spin trying to cancel timeouts that are reachable but not cancellable, e.g. already executing. 
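The shape of the fix can be modelled in plain C (a sketch under assumed names, not the kernel code): the kill helper reports whether it actually cancelled anything, and the caller counts, and hence re-spins on, only real cancellations.

#include <stdbool.h>
#include <stdio.h>

enum { T_IDLE, T_RUNNING };

struct timeout { int state; };

/* returns true only if the timeout was actually cancelled */
static bool kill_timeout(struct timeout *t)
{
    if (t->state == T_RUNNING)  /* reachable, but already executing */
        return false;
    t->state = T_RUNNING;       /* claimed for cancellation */
    return true;
}

int main(void)
{
    struct timeout ts[3] = { { T_IDLE }, { T_RUNNING }, { T_IDLE } };
    int canceled = 0;

    for (int i = 0; i < 3; i++)
        if (kill_timeout(&ts[i]))
            canceled++;         /* busy timeouts are no longer counted */

    printf("canceled %d of 3\n", canceled);     /* prints 2 */
    return 0;
}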
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ab8a7440a60bbdf69ae514f672ad050e43dd1b03.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 557c637af158be..a79a7d6ef1b37a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -49,7 +49,7 @@ static inline void io_put_req(struct io_kiocb *req) } } -static void io_kill_timeout(struct io_kiocb *req, int status) +static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -64,7 +64,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); + return true; } + return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) @@ -620,10 +622,9 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); + if (io_match_task(req, tsk, cancel_all) && + io_kill_timeout(req, -ECANCELED)) canceled++; - } } spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); From b321823a03dc81a5b83b063e6e1904d612b53266 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:54 +0100 Subject: [PATCH 102/172] io_uring: fix io_poll_remove_all clang warnings clang complains about bitwise operations on bools; add a bit more verbosity to better show that we want to call io_poll_remove_all_table() twice but with different arguments. Reported-by: Nathan Chancellor Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f11d21dcdf9233e0eeb15fa13b858a05a78eb310.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index b2659b56c702f3..cbf44c38efd9ee 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -595,8 +595,11 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) __must_hold(&ctx->uring_lock) { - return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | - io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + bool ret; + + ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); + ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + return ret; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, From 305bef98870816ae58357d647521891ec558a92e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:55 +0100 Subject: [PATCH 103/172] io_uring: hide eventfd assumptions in eventfd paths Some io_uring-eventfd users assume that there won't be spurious wakeups. That assumption has to be honoured by all io_cqring_ev_posted() callers, which is inconvenient and from time to time leads to problems, but should be maintained so as not to break userspace. Instead of making the callers track whether a CQE was posted or not, hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw last time and triggers the eventfd only when ->cached_cq_tail has changed since then.
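The dedup logic is small enough to model in userspace (a sketch with made-up names; the real code also takes ->completion_lock around the check):

#include <stdio.h>

struct ctx {
    unsigned int cached_cq_tail;    /* CQEs posted so far */
    unsigned int evfd_last_cq_tail; /* tail seen at the last signal */
};

/* fire the (stand-in) eventfd only when the CQ tail has moved */
static void eventfd_signal(struct ctx *c)
{
    if (c->cached_cq_tail == c->evfd_last_cq_tail)
        return;                     /* nothing new posted: skip */
    c->evfd_last_cq_tail = c->cached_cq_tail;
    printf("signal at tail %u\n", c->cached_cq_tail);
}

int main(void)
{
    struct ctx c = { 0, 0 };

    eventfd_signal(&c);     /* spurious call: suppressed */
    c.cached_cq_tail++;     /* a CQE was posted */
    eventfd_signal(&c);     /* fires once */
    eventfd_signal(&c);     /* duplicate: suppressed */
    return 0;
}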
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ffc66bae37a2513080b601e4370e147faaa72c5.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 44 ++++++++++++++++++++-------------- io_uring/timeout.c | 3 +-- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6bcd7bff6479be..5987f8acca3838 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -314,6 +314,8 @@ struct io_ring_ctx { struct list_head defer_list; unsigned sq_thread_idle; + /* protected by ->completion_lock */ + unsigned evfd_last_cq_tail; }; enum { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707b599b9224a4..84f92362521670 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -473,6 +473,22 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; + bool skip; + + spin_lock(&ctx->completion_lock); + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count only + * changing IFF a new CQE has been added to the CQ ring. There's no + * depedency on 1:1 relationship between how many times this function is + * called (and hence the eventfd count) and number of CQEs posted to the + * CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; rcu_read_lock(); /* @@ -511,13 +527,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. 
- */ void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || @@ -530,7 +539,7 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - bool all_flushed, posted; + bool all_flushed; size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) @@ -539,7 +548,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); @@ -554,7 +562,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) else io_account_cq_overflow(ctx); - posted = true; list_del(&ocqe->list); kfree(ocqe); } @@ -567,8 +574,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return all_flushed; } @@ -758,8 +764,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, filled = io_fill_cqe_aux(ctx, user_data, res, cflags); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (filled) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return filled; } @@ -940,14 +945,12 @@ __cold void io_free_req(struct io_kiocb *req) static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool posted; spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); + io_disarm_next(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -2428,6 +2431,11 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, kfree(ev_fd); return ret; } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a79a7d6ef1b37a..424b2fc858b8b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -629,7 +629,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return canceled != 0; } From 253993210bd8aa3b39a392807c03c8ef1cd7dc3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:56 +0100 Subject: [PATCH 104/172] io_uring: introduce locking helpers for CQE posting spin_lock(&ctx->completion_lock); /* post CQEs */ io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); We have many places repeating this sequence, and the three-function unlock section is not ideal from a maintenance perspective and also makes it harder to add new locking/sync tricks. Introduce two helpers: io_cq_lock(), which is simple and only grabs ->completion_lock, and io_cq_unlock_post(), which encapsulates the three-call section.
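A minimal userspace model of the two helpers (a pthread mutex standing in for the completion spinlock; names are illustrative): the wakeup is deliberately issued after the unlock, matching the order the patch preserves.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

static void commit_cqring(void)    { printf("commit tail\n"); }
static void cqring_ev_posted(void) { printf("wake waiters\n"); }

static void cq_lock(void)
{
    pthread_mutex_lock(&completion_lock);
}

/* one helper owns the commit + unlock + wakeup ordering */
static void cq_unlock_post(void)
{
    commit_cqring();
    pthread_mutex_unlock(&completion_lock);
    cqring_ev_posted();             /* deliberately after unlock */
}

int main(void)
{
    cq_lock();
    printf("post CQEs\n");
    cq_unlock_post();
    return 0;
}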
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fe0c682bf7f7b55d9be55b0d034be9c1949277dc.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 57 +++++++++++++++++++++------------------------ io_uring/io_uring.h | 9 ++++++- io_uring/timeout.c | 6 ++--- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 84f92362521670..0db73d01455d10 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,7 +527,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -536,6 +536,19 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx) +{ + __io_cq_unlock_post(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -548,7 +561,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -572,9 +585,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return all_flushed; } @@ -760,11 +771,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, { bool filled; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return filled; } @@ -810,11 +819,9 @@ void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) @@ -946,11 +953,9 @@ static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); io_disarm_next(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -984,13 +989,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *uring_locked) { @@ -1006,7 +1004,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (req->ctx != *ctx) { if 
(unlikely(!*uring_locked && *ctx)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); ctx_flush_and_put(*ctx, uring_locked); *ctx = req->ctx; @@ -1014,7 +1012,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, *uring_locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); if (unlikely(!*uring_locked)) - spin_lock(&(*ctx)->completion_lock); + io_cq_lock(*ctx); } if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); @@ -1026,7 +1024,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, } while (node); if (unlikely(!*uring_locked)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); } static void handle_tw_list(struct io_wq_work_node *node, @@ -1261,10 +1259,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + __io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index bdc62727638be8..738fb96575ab7c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -24,7 +24,6 @@ void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -66,6 +65,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +static inline void io_cq_lock(struct io_ring_ctx *ctx) + __acquires(ctx->completion_lock) +{ + spin_lock(&ctx->completion_lock); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx); + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 424b2fc858b8b4..7e2c341f976258 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -617,7 +617,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_timeout *timeout, *tmp; int canceled = 0; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -627,8 +627,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, canceled++; } spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return canceled != 0; } From 46929b086886ade8302cff6e85ccea66bce0cf98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:57 +0100 Subject: [PATCH 105/172] io_uring: add io_commit_cqring_flush() Since __io_commit_cqring_flush users moved to different files, introduce io_commit_cqring_flush() helper and encapsulate all flags testing details inside. 
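The helper boils down to a cheap inline flag test in front of the out-of-line slow path; a hedged sketch with illustrative names:

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool off_timeout_used, drain_active, has_evfd; };

/* the rare slow path stays out of line */
static void __commit_cqring_flush(struct ctx *c)
{
    printf("slow flush of ctx %p\n", (void *)c);
}

/* cheap inline test; most calls fall straight through */
static inline void commit_cqring_flush(struct ctx *c)
{
    if (c->off_timeout_used || c->drain_active || c->has_evfd)
        __commit_cqring_flush(c);
}

int main(void)
{
    struct ctx c = { .has_evfd = true };

    commit_cqring_flush(&c);    /* takes the slow path */
    c.has_evfd = false;
    commit_cqring_flush(&c);    /* no-op */
    return 0;
}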
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0da03887435dd9869ffe46dcd3962bf104afcca3.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- io_uring/io_uring.h | 6 ++++++ io_uring/rw.c | 5 +---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0db73d01455d10..3e65e04915a782 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -529,10 +529,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 738fb96575ab7c..afca7ff8956c32 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -229,4 +229,10 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + __io_commit_cqring_flush(ctx); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index f8b42f2265df98..0028e95e6633af 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1016,10 +1016,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_wake(ctx); } From f337a84d39527a2c46b1230470748d7bd0672ce0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:58 +0100 Subject: [PATCH 106/172] io_uring: opcode independent fixed buf import Fixed buffers are generic infrastructure, make io_import_fixed() opcode agnostic. 
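At its core the now-generic helper validates that the request's range lies inside the registered buffer before building the iterator; a standalone sketch of just that check (field names assumed from the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* models imu->ubuf / imu->ubuf_end from the patch */
struct mapped_buf { uint64_t ubuf, ubuf_end; };

static bool in_mapped_region(const struct mapped_buf *imu,
                             uint64_t buf_addr, uint64_t len)
{
    uint64_t buf_end;

    /* overflow-checked addition, like check_add_overflow() */
    if (__builtin_add_overflow(buf_addr, len, &buf_end))
        return false;
    /* reject anything not fully inside the mapped region */
    return buf_addr >= imu->ubuf && buf_end <= imu->ubuf_end;
}

int main(void)
{
    struct mapped_buf imu = { .ubuf = 0x1000, .ubuf_end = 0x2000 };

    printf("%d\n", in_mapped_region(&imu, 0x1800, 0x100));  /* 1 */
    printf("%d\n", in_mapped_region(&imu, 0x1f00, 0x200));  /* 0 */
    return 0;
}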
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b1e765c8a1c2c913a05a28d2399fc53e1d3cf37a.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0028e95e6633af..ded8ef01165cdd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,14 +273,15 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) { - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; + u64 buf_end; size_t offset; + if (WARN_ON_ONCE(!imu)) + return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ @@ -332,14 +333,6 @@ static int __io_import_fixed(struct io_kiocb *req, int ddir, return 0; } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -426,7 +419,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); + ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); if (ret) return ERR_PTR(ret); return NULL; From c059f785840807ed5e1f2810420c1555969b6246 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:59 +0100 Subject: [PATCH 107/172] io_uring: move io_import_fixed() Move io_import_fixed() into rsrc.c where it belongs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d5becb21f332b4fef6a7cedd6a50e65e2371630.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 3 +++ io_uring/rw.c | 60 ------------------------------------------------- 3 files changed, 63 insertions(+), 60 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 68629eba413265..1106089551595e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1310,3 +1310,63 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, io_rsrc_node_switch(ctx, NULL); return ret; } + +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + u64 buf_end; + size_t offset; + + if (WARN_ON_ONCE(!imu)) + return -EFAULT; + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. 
We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 03f26516e99463..87f58315b247bc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -64,6 +64,9 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index ded8ef01165cdd..e07f2670dfa815 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,66 +273,6 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) -{ - u64 buf_end; - size_t offset; - - if (WARN_ON_ONCE(!imu)) - return -EFAULT; - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. 
- */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) From 9da070b142820031a0e273097f2b39982f45bbfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:26:00 +0100 Subject: [PATCH 108/172] io_uring: consistent naming for inline completion Improve naming of the inline/deferred completion helper so it's consistent with its *_post counterpart. Add some comments and extra lockdep annotations to ensure the locking is done right.
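The locking contract can be sketched in userspace (illustrative names only; the assert stands in for lockdep_assert_held()): the deferred-completion helper may only run with the submission mutex held.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;
static int uring_locked;            /* poor man's lockdep state */

static void req_complete_defer(int req)
{
    assert(uring_locked);           /* models lockdep_assert_held() */
    printf("deferred completion of req %d\n", req);
}

int main(void)
{
    pthread_mutex_lock(&uring_lock);
    uring_locked = 1;
    req_complete_defer(1);          /* legal: mutex held */
    uring_locked = 0;
    pthread_mutex_unlock(&uring_lock);
    return 0;
}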
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec9a66f1e22f99dcd02288d4e42f3cc6bb357804.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index cbf44c38efd9ee..bd3110750cfada 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -833,6 +833,11 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } found: + if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { + ret = -EFAULT; + goto out; + } + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From 4a0fef62788b69df09267c8e3f3f11d4bb9d50e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 15:27:35 +0100 Subject: [PATCH 110/172] io_uring: optimize io_uring_task layout The task_work bits of io_uring_task are split across two cache lines, causing extra cache bouncing; place them into a separate cache line. Also move the most used submission path fields closer together, so they are hot. Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/tctx.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dde82ce4d8e29b..dead0ed0042984 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -7,22 +7,24 @@ struct io_uring_task { /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - bool task_running; - - struct file *registered_rings[IO_RINGFD_REG_MAX]; + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + spinlock_t task_lock; + bool task_running; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; }; struct io_tctx_node { From 625d38b3fd34c58afb969810c4b3105eabb3b143 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:00 +0100 Subject: [PATCH 111/172] io_uring: improve io_run_task_work() Since SQPOLL now uses TWA_SIGNAL_NO_IPI, there won't be task work items without TIF_NOTIFY_SIGNAL. Simplify io_run_task_work() by removing the task->task_works check. Even though it doesn't look like it causes extra cache bouncing, it's still nice not to touch it an extra time when it might not be cached.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/75d4f34b0c671075892821a409e28da6cb1d64fe.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7a00bbe85d35dd..4c4d38ffc5ec5e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -203,7 +203,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_run_task_work(void) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) { __set_current_state(TASK_RUNNING); clear_notify_signal(); if (task_work_pending(current)) From a6b21fbb4ce3c4976ba478a9f0f10d4163038478 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:01 +0100 Subject: [PATCH 112/172] io_uring: move list helpers to a separate file It's annoying to have io-wq.h as a dependency every time we want some of struct io_wq_work_list helpers, move them into a separate file. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1d891ce12b30767d1d2a3b7db2ca3abc1ecc4a2.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 1 + io_uring/io-wq.h | 131 ----------------------------------------- io_uring/io_uring.h | 1 + io_uring/slist.h | 138 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 131 deletions(-) create mode 100644 io_uring/slist.h diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 824623bcf1a53f..3e34dfbdf9466a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -18,6 +18,7 @@ #include #include "io-wq.h" +#include "slist.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 10b80ef78bb817..31228426d19241 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,137 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) - -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ -} while (0) - -static inline void wq_list_add_after(struct io_wq_work_node *node, - struct io_wq_work_node *pos, - struct io_wq_work_list *list) -{ - struct io_wq_work_node *next = pos->next; - - pos->next = node; - node->next = next; - if (!next) - list->last = node; -} - -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. 
- */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - -static inline void wq_list_add_tail(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = NULL; - if (!list->first) { - list->last = node; - WRITE_ONCE(list->first, node); - } else { - list->last->next = node; - list->last = node; - } -} - -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - -static inline void wq_list_cut(struct io_wq_work_list *list, - struct io_wq_work_node *last, - struct io_wq_work_node *prev) -{ - /* first in the list, if prev==NULL */ - if (!prev) - WRITE_ONCE(list->first, last->next); - else - prev->next = last->next; - - if (last == list->last) - list->last = prev; - last->next = NULL; -} - -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - -static inline void wq_stack_add_head(struct io_wq_work_node *node, - struct io_wq_work_node *stack) -{ - node->next = stack->next; - stack->next = node; -} - -static inline void wq_list_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) -{ - wq_list_cut(list, node, prev); -} - -static inline -struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) -{ - struct io_wq_work_node *node = stack->next; - - stack->next = node->next; - return node; -} - -static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) -{ - if (!work->list.next) - return NULL; - - return container_of(work->list.next, struct io_wq_work, list); -} - typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4c4d38ffc5ec5e..f026d2670959e6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include #include #include "io-wq.h" +#include "slist.h" #include "filetable.h" #ifndef CREATE_TRACE_POINTS diff --git a/io_uring/slist.h b/io_uring/slist.h new file mode 100644 index 00000000000000..f27601fa46607b --- /dev/null +++ b/io_uring/slist.h @@ -0,0 +1,138 @@ +#ifndef INTERNAL_IO_SLIST_H +#define INTERNAL_IO_SLIST_H + +#include + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) + +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + +/** + * wq_list_merge - merge the second list to the first one. 
+ * @list0: the first list + * @list1: the second list + * Return the first node after mergence. + */ +static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, + struct io_wq_work_list *list1) +{ + struct io_wq_work_node *ret; + + if (!list0->first) { + ret = list1->first; + } else { + ret = list0->first; + list0->last->next = list1->first; + } + INIT_WQ_LIST(list0); + INIT_WQ_LIST(list1); + return ret; +} + +static inline void wq_list_add_tail(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = NULL; + if (!list->first) { + list->last = node; + WRITE_ONCE(list->first, node); + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, + struct io_wq_work_node *prev) +{ + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) + list->last = prev; + last->next = NULL; +} + +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + +static inline void wq_list_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + wq_list_cut(list, node, prev); +} + +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; + + stack->next = node->next; + return node; +} + +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + +#endif // INTERNAL_IO_SLIST_H \ No newline at end of file From 024f15e033a52660a045947ee56c7e842180fa81 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:02 +0100 Subject: [PATCH 113/172] io_uring: dedup io_run_task_work We have an identical copy of io_run_task_work() for io-wq called io_flush_signals(), deduplicate them. 
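After the dedup, both the io-wq worker and the submission path share one helper; a rough userspace model of its logic (the flags are stand-ins for TIF_NOTIFY_SIGNAL and task_work_pending(), not the kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool notify_signal;      /* stands in for TIF_NOTIFY_SIGNAL */
static bool work_pending;       /* stands in for task_work_pending() */

/* the one shared helper both call sites now use */
static bool run_task_work(void)
{
    if (notify_signal) {
        notify_signal = false;  /* models clear_notify_signal() */
        if (work_pending) {
            work_pending = false;
            printf("ran pending task work\n");
        }
        return true;
    }
    return false;
}

int main(void)
{
    notify_signal = true;
    work_pending = true;
    while (run_task_work())
        ;                       /* loops until the flag is clear */
    return 0;
}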
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a157a4df5fa217b8bd03c73494f2fd0e24e44fbc.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.h | 2 ++ io_uring/io-wq.c | 17 +++-------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6b58aa48bc45d3..fb5a274c08ffcc 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -2,6 +2,8 @@ #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H +#include + struct io_ring_ctx; struct io_kiocb; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 3e34dfbdf9466a..77df5b43bf5239 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -19,6 +19,7 @@ #include "io-wq.h" #include "slist.h" +#include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -519,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, return NULL; } -static bool io_flush_signals(void) -{ - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - return false; -} - static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - io_flush_signals(); + io_run_task_work(); cond_resched(); } @@ -655,7 +644,7 @@ static int io_wqe_worker(void *data) last_timeout = false; __io_worker_idle(wqe, worker); raw_spin_unlock(&wqe->lock); - if (io_flush_signals()) + if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); if (signal_pending(current)) { From ed5ccb3beeba0cadb0fcf353ae192021dfecf252 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:21 -0700 Subject: [PATCH 114/172] io_uring: remove priority tw list optimisation This optimisation has some built in assumptions that make it easy to introduce bugs. It also does not have clear wins that make it worth keeping. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 77 +++++++-------------------------------------- io_uring/io_uring.h | 1 - io_uring/rw.c | 2 +- io_uring/tctx.c | 1 - io_uring/tctx.h | 1 - 5 files changed, 12 insertions(+), 70 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8bc63413fc547e..d21d0fc3645b12 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_prev_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *uring_locked) -{ - if (*ctx && !*uring_locked) - spin_lock(&(*ctx)->completion_lock); - - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - if (unlikely(!*uring_locked && *ctx)) - io_cq_unlock_post(*ctx); - - ctx_flush_and_put(*ctx, uring_locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *uring_locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - if (unlikely(!*uring_locked)) - io_cq_lock(*ctx); - } - if (likely(*uring_locked)) { - req->io_task_work.func(req, uring_locked); - } else { - req->cqe.flags = io_put_kbuf_comp(req); - __io_req_complete_post(req); - } - node = next; - } while (node); - - if (unlikely(!*uring_locked)) - io_cq_unlock_post(*ctx); -} - static void handle_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *locked) { @@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb) task_work); while (1) { - struct io_wq_work_node *node1, *node2; + struct io_wq_work_node *node; spin_lock_irq(&tctx->task_lock); - node1 = tctx->prio_task_list.first; - node2 = tctx->task_list.first; + node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - if (!node2 && !node1) + if (!node) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); - if (!node2 && !node1) + if (!node) break; - - if (node1) - handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) - handle_tw_list(node2, &ctx, &uring_locked); + handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - if (data_race(!tctx->task_list.first) && - data_race(!tctx->prio_task_list.first) && uring_locked) + if (data_race(!tctx->task_list.first) && uring_locked) io_submit_flush_completions(ctx); } @@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb) } static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx, - struct io_wq_work_list *list) + struct io_uring_task *tctx) { struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; @@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, bool running; spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, list); + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req, spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, 
flags); while (node) { @@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -void io_req_task_prio_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - if (req->ctx->flags & IORING_SETUP_SQPOLL) - __io_req_task_work_add(req, tctx, &tctx->prio_task_list); - else - __io_req_task_work_add(req, tctx, &tctx->task_list); + __io_req_task_work_add(req, tctx); } static void io_req_tw_post(struct io_kiocb *req, bool *locked) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f026d2670959e6..f77e4a5403e4e5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,7 +36,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); diff --git a/io_uring/rw.c b/io_uring/rw.c index e07f2670dfa815..ade3e235f2770c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 9b30fb0d360307..7a68ba9beec3ea 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dead0ed0042984..c8566ea5dca477 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -22,7 +22,6 @@ struct io_uring_task { spinlock_t task_lock; bool task_running; struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From c34398a8c018e0d3d2d30b718d03c7290c696f51 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:22 -0700 Subject: [PATCH 115/172] io_uring: remove __io_req_task_work_add this is no longer needed as there is only one caller Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d21d0fc3645b12..bf7ca2b279d3de 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx) +void io_req_task_work_add(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; @@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -void io_req_task_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - 
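The core trick is that llist_add() reports whether the list was previously empty, so only the first producer pays for task_work_add(); a standalone C11 model of that semantic (not the kernel llist implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct lnode { struct lnode *next; };

static _Atomic(struct lnode *) head;

/* returns true if the list was empty, i.e. this caller must kick */
static bool lockless_add(struct lnode *n)
{
    struct lnode *first = atomic_load(&head);

    do {
        n->next = first;        /* re-linked on every CAS retry */
    } while (!atomic_compare_exchange_weak(&head, &first, n));

    return first == NULL;
}

int main(void)
{
    struct lnode a, b;

    printf("kick: %d\n", lockless_add(&a));     /* 1: first producer */
    printf("kick: %d\n", lockless_add(&b));     /* 0: already pending */
    return 0;
}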
- __io_req_task_work_add(req, tctx); -} - static void io_req_tw_post(struct io_kiocb *req, bool *locked) { io_req_complete_post(req); From f88262e60bb9cb5740891672ce9f405e7f9393e5 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:23 -0700 Subject: [PATCH 116/172] io_uring: lockless task list With networking use cases we see contention on the spinlock used to protect the task_list when multiple threads try and add completions at once. Instead we can use a lockless list, and assume that the first caller to add to the list is responsible for kicking off task work. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-4-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 38 ++++++++-------------------------- io_uring/tctx.c | 3 +-- io_uring/tctx.h | 6 +++--- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5987f8acca3838..918165a200533f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { - struct io_wq_work_node node; + struct llist_node node; struct llist_node fallback_node; }; io_req_tw_func_t func; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bf7ca2b279d3de..0124335c6d092a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct io_wq_work_node *node, + +static void handle_tw_list(struct llist_node *node, struct io_ring_ctx **ctx, bool *locked) { do { - struct io_wq_work_node *next = node->next; + struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); + struct llist_node *node = llist_del_all(&tctx->task_list); - while (1) { - struct io_wq_work_node *node; - - spin_lock_irq(&tctx->task_lock); - node = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); - if (!node) - tctx->task_running = false; - spin_unlock_irq(&tctx->task_lock); - if (!node) - break; + if (node) { handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - - if (data_race(!tctx->task_list.first) && uring_locked) - io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct io_wq_work_node *node; - unsigned long flags; + struct llist_node *node; bool running; - spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); - running = tctx->task_running; - if (!running) - tctx->task_running = true; - spin_unlock_irqrestore(&tctx->task_lock, flags); + running = !llist_add(&req->io_task_work.node, &tctx->task_list); /* task_work already pending, we're done */ if (running) @@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - spin_lock_irqsave(&tctx->task_lock, flags); - tctx->task_running = false; - node = tctx->task_list.first; - 
INIT_WQ_LIST(&tctx->task_list); - spin_unlock_irqrestore(&tctx->task_lock, flags); + + node = llist_del_all(&tctx->task_list); while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7a68ba9beec3ea..7f97d97fef0a96 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); + init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index c8566ea5dca477..8a33ff6e5d9138 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + /* * Arbitrary limit, can be raised if need be */ @@ -19,9 +21,7 @@ struct io_uring_task { struct percpu_counter inflight; struct { /* task_work */ - spinlock_t task_lock; - bool task_running; - struct io_wq_work_list task_list; + struct llist_head task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From 923d159247b732885b176b24e4bafad8eda5a477 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:24 -0700 Subject: [PATCH 117/172] io_uring: introduce llist helpers Introduce helpers to atomically switch llist. Will later move this into common code Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0124335c6d092a..356000255211fb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node, } while (node); } +/** + * io_llist_xchg - swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @new: new entry as the head of the list + * + * If list is empty, return NULL, otherwise, return the pointer to the first entry. + * The order of entries returned is from the newest to the oldest added one. + */ +static inline struct llist_node *io_llist_xchg(struct llist_head *head, + struct llist_node *new) +{ + return xchg(&head->first, new); +} + +/** + * io_llist_cmpxchg - possibly swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @old: expected old value of the first entry of the list + * @new: new entry as the head of the list + * + * perform a cmpxchg on the first entry of the list. + */ + +static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, + struct llist_node *old, + struct llist_node *new) +{ + return cmpxchg(&head->first, old, new); +} + void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; From 3a0c037b0e16e38cf5d36d2ebc259e0ae644aaf4 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:25 -0700 Subject: [PATCH 118/172] io_uring: batch task_work Batching task work up is an important performance optimisation, as task_work_add is expensive. In order to keep the semantics replace the task_list with a fake node while processing the old list, and then do a cmpxchg at the end to see if there is more work. 
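A hedged standalone model of the sentinel dance (C11 atomics standing in for io_llist_xchg()/io_llist_cmpxchg(); single-threaded here, so the final cmpxchg succeeds on the first try):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct lnode { struct lnode *next; };

static _Atomic(struct lnode *) head;

static void run_until(struct lnode *node, struct lnode *last)
{
    while (node != last) {
        printf("work item %p\n", (void *)node);
        node = node->next;
    }
}

int main(void)
{
    struct lnode fake = { NULL }, a = { NULL };

    atomic_store(&head, &a);                    /* one queued item */

    /* grab pending work, leaving the sentinel behind */
    struct lnode *node = atomic_exchange(&head, &fake);
    run_until(node, NULL);                      /* old entries end in NULL */

    /* if the head is still just our sentinel, swap in NULL and stop */
    struct lnode *expected = &fake;
    while (!atomic_compare_exchange_strong(&head, &expected, NULL)) {
        node = atomic_exchange(&head, &fake);   /* new arrivals chain to fake */
        run_until(node, &fake);
        expected = &fake;
    }
    return 0;
}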
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 356000255211fb..9d523fafacb72d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } - static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked) + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { - do { + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1006,7 +1006,7 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; - } while (node); + } } /** @@ -1045,11 +1045,15 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); - struct llist_node *node = llist_del_all(&tctx->task_list); - - if (node) { - handle_tw_list(node, &ctx, &uring_locked); - cond_resched(); + struct llist_node fake = {}; + struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + + handle_tw_list(node, &ctx, &uring_locked, NULL); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + while (node != &fake) { + node = io_llist_xchg(&tctx->task_list, &fake); + handle_tw_list(node, &ctx, &uring_locked, &fake); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } ctx_flush_and_put(ctx, &uring_locked); From eccd8801858f03a19a4d1f8fef27e3d6e17b21fd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:27 -0700 Subject: [PATCH 119/172] io_uring: add trace event for running task work This is useful for investigating if task_work is batching Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-8-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 3bc8dec9acaacf..918e3a43e4b280 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->cflags, __entry->ocqe) ); +/* + * io_uring_task_work_run - ran task work + * + * @tctx: pointer to a io_uring_task + * @count: how many functions it ran + * @loops: how many loops it ran + * + */ +TRACE_EVENT(io_uring_task_work_run, + + TP_PROTO(void *tctx, unsigned int count, unsigned int loops), + + TP_ARGS(tctx, count, loops), + + TP_STRUCT__entry ( + __field( void *, tctx ) + __field( unsigned int, count ) + __field( unsigned int, loops ) + ), + + TP_fast_assign( + __entry->tctx = tctx; + __entry->count = count; + __entry->loops = loops; + ), + + TP_printk("tctx %p, count %u, loops %u", + __entry->tctx, __entry->count, __entry->loops) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ From c6dd763c246024ed0dd603db4284cbd0bd164e27 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:28 -0700 Subject: [PATCH 120/172] io_uring: trace task_work_run trace task_work_run to help provide stats on how often task work is run and what batch sizes are coming through. 
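As a usage note (editorial addition; assumes tracefs is mounted at
/sys/kernel/tracing), the event can be enabled like any other trace
event:

	echo 1 > /sys/kernel/tracing/events/io_uring/io_uring_task_work_run/enable
	cat /sys/kernel/tracing/trace_pipe

Per the TP_printk() format added in the previous patch, each run then
logs a line of the form "tctx 00000000a31dc10d, count 16, loops 2"
(values illustrative).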
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9d523fafacb72d..997b915a1ff78a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,10 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, - struct llist_node *last) +static unsigned int handle_tw_list(struct llist_node *node, + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { + unsigned int count = 0; + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, @@ -1006,7 +1008,10 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; + count++; } + + return count; } /** @@ -1047,12 +1052,14 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + unsigned int loops = 1; + unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL); - handle_tw_list(node, &ctx, &uring_locked, NULL); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); while (node != &fake) { + loops++; node = io_llist_xchg(&tctx->task_list, &fake); - handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &uring_locked, &fake); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } @@ -1061,6 +1068,8 @@ void tctx_task_work(struct callback_head *cb) /* relaxed read is enough as only the task itself sets ->in_idle */ if (unlikely(atomic_read(&tctx->in_idle))) io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, count, loops); } void io_req_task_work_add(struct io_kiocb *req) From 024b8fde3320ea34d7a5a3fc9dbc47ec736cd8eb Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Jun 2022 13:55:51 +0800 Subject: [PATCH 121/172] io_uring: kbuf: kill __io_kbuf_recycle() __io_kbuf_recycle() is only called in io_kbuf_recycle(). Kill it and tweak the code so that the legacy pbuf and ring pbuf code become clear Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220622055551.642370-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 71 +++++++++++++++++++++++++++++-------------------- io_uring/kbuf.h | 21 +++++---------- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 62de0dda24bf6e..8bf47e49ea5bc7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -37,36 +37,30 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return xa_load(&ctx->io_bl_xa, bgid); } -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. 
- * If the tail has already been incremented, hang on to it. + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. - */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } + if (req->flags & REQ_F_PARTIAL_IO) return; - } io_ring_submit_lock(ctx, issue_flags); @@ -77,16 +71,35 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); + return; } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) +void io_kbuf_recycle_ring(struct io_kiocb *req) { - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; } unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 304e7139d83562..721465c5d809fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -35,7 +35,6 @@ struct io_buffer { void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -49,6 +48,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_ring(struct io_kiocb *req); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -58,18 +60,6 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. 
- */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - /* * READV uses fields in `struct io_rw` (len/addr) to stash the selected * buffer data. However if that buffer is recycled the original request @@ -78,7 +68,10 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->opcode == IORING_OP_READV) return; - __io_kbuf_recycle(req, issue_flags); + if (req->flags & REQ_F_BUFFER_SELECTED) + io_kbuf_recycle_legacy(req, issue_flags); + if (req->flags & REQ_F_BUFFER_RING) + io_kbuf_recycle_ring(req); } static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, From 88f52eaad2df2cb5ab49b864d79398c9cb9a57f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:23:54 -0600 Subject: [PATCH 122/172] io_uring: have cancelation API accept io_uring_task directly We just use the io_kiocb passed in to find the io_uring_task, and we already pass in the ctx via cd->ctx anyway. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 17 +++++++++-------- io_uring/cancel.h | 2 +- io_uring/timeout.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index d1e7f5a955ab20..500ee5f5fd23ca 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -77,15 +77,15 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + struct io_ring_ctx *ctx = cd->ctx; int ret; - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); - ret = io_async_cancel_one(req->task->io_uring, cd); + ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. 
@@ -104,7 +104,6 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, return ret; } - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req); @@ -127,7 +126,8 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, +static int __io_async_cancel(struct io_cancel_data *cd, + struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); @@ -136,7 +136,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd, issue_flags); + ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) @@ -170,6 +170,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; + struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { @@ -185,7 +186,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) cd.file = req->file; } - ret = __io_async_cancel(&cd, req, issue_flags); + ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 2338012a5b06fb..1bc7e917ce94e6 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -16,6 +16,6 @@ struct io_cancel_data { int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7e2c341f976258..4af074b8f6b7d7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -274,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From 7d8ca7250197096bfa9f432c1d99b0555504bbba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:47:04 -0600 Subject: [PATCH 123/172] io_uring: add IORING_ASYNC_CANCEL_FD_FIXED cancel flag In preparation for not having a request to pass in that carries this state, add a separate cancelation flag that allows the caller to ask for a fixed file for cancelation. 
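From userspace the new flag travels in sqe->cancel_flags alongside
IORING_ASYNC_CANCEL_FD; a minimal sketch (editorial example; liburing's
io_uring_get_sqe() is used only to obtain the SQE, error handling
omitted):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	/* with FD_FIXED set, 'fd' is an index into the registered file table */
	sqe->fd = fixed_file_slot;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD |
			    IORING_ASYNC_CANCEL_FD_FIXED;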
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 2 ++
 io_uring/cancel.c             | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7ae81b10893e9..a09a78bd75566e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -247,10 +247,12 @@ enum io_uring_op {
  * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
  *				request 'user_data'
  * IORING_ASYNC_CANCEL_ANY	Match any request
+ * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
 #define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+#define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)
 
 /*
  * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 500ee5f5fd23ca..da486de07029a5 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -24,7 +24,7 @@ struct io_cancel {
 };
 
 #define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
-			 IORING_ASYNC_CANCEL_ANY)
+			 IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
 
 static bool io_cancel_cb(struct io_wq_work *work, void *data)
 {
@@ -174,11 +174,14 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
-		if (req->flags & REQ_F_FIXED_FILE)
+		if (req->flags & REQ_F_FIXED_FILE ||
+		    cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
+			req->flags |= REQ_F_FIXED_FILE;
 			req->file = io_file_get_fixed(req, cancel->fd,
 							issue_flags);
-		else
+		} else {
 			req->file = io_file_get_normal(req, cancel->fd);
+		}
 		if (!req->file) {
 			ret = -EBADF;
 			goto done;

From 78a861b9495920f8609dee5b670dacbff09d359f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 18 Jun 2022 10:00:50 -0600
Subject: [PATCH 124/172] io_uring: add sync cancelation API through
 io_uring_register()

The io_uring cancelation API is async, like any other API that we expose
there. For the case of finding a request to cancel, or not finding one,
it is fully sync in that when submission returns, the CQE for both the
cancelation request and the targeted request have been posted to the
CQ ring. However, if the targeted work is being executed by io-wq, the
API can only start the act of canceling it. This makes it difficult to
use in some circumstances, as the caller then has to wait for the CQEs
to come in and match on the same cancelation data there.

Provide an IORING_REGISTER_SYNC_CANCEL command for io_uring_register()
that does sync cancelations, always. For the io-wq case, it'll wait for
the cancelation to come in before returning. The only expected returns
from this API are:

0	Request found and canceled fine.
> 0	Requests found and canceled. Only happens if asked to cancel
	multiple requests, and if the work wasn't in progress.
-ENOENT	Request not found.
-ETIME	A timeout on the operation was requested, but the timeout
	expired before we could cancel.

and we won't get -EALREADY via this API.

If the timeout value passed in is -1 (tv_sec and tv_nsec), then that
means that no timeout is requested. Otherwise, the timespec passed in
is the amount of time the sync cancel will wait for a successful
cancelation.
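A minimal sketch of driving the new command from userspace (editorial
example using the raw syscall; error handling and the function name are
just for illustration):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sync_cancel_by_data(int ring_fd, __u64 user_data)
	{
		struct io_uring_sync_cancel_reg sc = {
			.addr = user_data,
			/* -1/-1 == no timeout: wait until the cancelation lands */
			.timeout = { .tv_sec = -1, .tv_nsec = -1 },
		};

		/* nr_args must be 1 for this register opcode */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_SYNC_CANCEL, &sc, 1);
	}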
Link: https://github.com/axboe/liburing/discussions/608
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  15 +++++
 io_uring/cancel.c             | 107 ++++++++++++++++++++++++++++++++++
 io_uring/cancel.h             |   2 +
 io_uring/io_uring.c           |   6 ++
 4 files changed, 130 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a09a78bd75566e..094f706c93e0b9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/types.h>
+#include <linux/time_types.h>
 
 /*
  * IO submission data structure (Submission Queue Entry)
@@ -428,6 +429,9 @@ enum {
 	IORING_REGISTER_PBUF_RING		= 22,
 	IORING_UNREGISTER_PBUF_RING		= 23,
 
+	/* sync cancelation API */
+	IORING_REGISTER_SYNC_CANCEL		= 24,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -563,4 +567,15 @@ struct io_uring_getevents_arg {
 	__u64	ts;
 };
 
+/*
+ * Argument for IORING_REGISTER_SYNC_CANCEL
+ */
+struct io_uring_sync_cancel_reg {
+	__u64				addr;
+	__s32				fd;
+	__u32				flags;
+	struct __kernel_timespec	timeout;
+	__u64				pad[4];
+};
+
 #endif
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index da486de07029a5..8435a1eba59acc 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -206,3 +207,109 @@ void init_hash_table(struct io_hash_table *table, unsigned size)
 		INIT_HLIST_HEAD(&table->hbs[i].list);
 	}
 }
+
+static int __io_sync_cancel(struct io_uring_task *tctx,
+			    struct io_cancel_data *cd, int fd)
+{
+	struct io_ring_ctx *ctx = cd->ctx;
+
+	/* fixed must be grabbed every time since we drop the uring_lock */
+	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
+	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		unsigned long file_ptr;
+
+		if (unlikely(fd > ctx->nr_user_files))
+			return -EBADF;
+		fd = array_index_nospec(fd, ctx->nr_user_files);
+		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+		cd->file = (struct file *) (file_ptr & FFS_MASK);
+		if (!cd->file)
+			return -EBADF;
+	}
+
+	return __io_async_cancel(cd, tctx, 0);
+}
+
+int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_cancel_data cd = {
+		.ctx	= ctx,
+		.seq	= atomic_inc_return(&ctx->cancel_seq),
+	};
+	ktime_t timeout = KTIME_MAX;
+	struct io_uring_sync_cancel_reg sc;
+	struct fd f = { };
+	DEFINE_WAIT(wait);
+	int ret;
+
+	if (copy_from_user(&sc, arg, sizeof(sc)))
+		return -EFAULT;
+	if (sc.flags & ~CANCEL_FLAGS)
+		return -EINVAL;
+	if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
+		return -EINVAL;
+
+	cd.data = sc.addr;
+	cd.flags = sc.flags;
+
+	/* we can grab a normal file descriptor upfront */
+	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
+	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
+		f = fdget(sc.fd);
+		if (!f.file)
+			return -EBADF;
+		cd.file = f.file;
+	}
+
+	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
+
+	/* found something, done! */
+	if (ret != -EALREADY)
+		goto out;
+
+	if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
+		struct timespec64 ts = {
+			.tv_sec		= sc.timeout.tv_sec,
+			.tv_nsec	= sc.timeout.tv_nsec
+		};
+
+		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+	}
+
+	/*
+	 * Keep looking until we get -ENOENT. we'll get woken every time
+	 * a request completes and will retry the cancelation.
+ */ + do { + cd.seq = atomic_inc_return(&ctx->cancel_seq); + + prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + if (ret != -EALREADY) + break; + + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(); + if (ret < 0) { + mutex_lock(&ctx->uring_lock); + break; + } + ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); + mutex_lock(&ctx->uring_lock); + if (!ret) { + ret = -ETIME; + break; + } + } while (1); + + finish_wait(&ctx->cq_wait, &wait); + + if (ret == -ENOENT || ret > 0) + ret = 0; +out: + fdput(f); + return ret; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 1bc7e917ce94e6..6a59ee484d0cca 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -19,3 +19,5 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 997b915a1ff78a..45538b3c3a764e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3871,6 +3871,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_pbuf_ring(ctx, arg); break; + case IORING_REGISTER_SYNC_CANCEL: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_sync_cancel(ctx, arg); + break; default: ret = -EINVAL; break; From 0638cd7be21221c7b583a4901e25e63d96112395 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:44 +0100 Subject: [PATCH 125/172] io_uring: clean poll ->private flagging We store a req pointer in wqe->private but also take one bit to mark double poll entries. Replace macro helpers with inline functions for better type checking and also name the double flag. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a61240555c64ac0b7a9b0eb59a9efeb638a35a4.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index bd3110750cfada..210b174b155b61 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -39,6 +39,22 @@ struct io_poll_table { #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_REF_MASK GENMASK(30, 0) +#define IO_WQE_F_DOUBLE 1 + +static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); +} + +static inline bool wqe_is_double(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return priv & IO_WQE_F_DOUBLE; +} + /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. 
It's disallowed to modify requests while not @@ -306,8 +322,6 @@ static void io_poll_cancel_req(struct io_kiocb *req) io_poll_execute(req, 0, 0); } -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -392,7 +406,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } /* mark as double wq entry */ - wqe_private |= 1; + wqe_private |= IO_WQE_F_DOUBLE; req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; From 13a99017ff19f179b51e1c51559a9d7005df1830 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:45 +0100 Subject: [PATCH 126/172] io_uring: remove events caching atavisms Remove events argument from *io_poll_execute(), it's not needed and not used. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/12efd4e15c6a90cf9e5b59807cfcb57852b51dc7.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 210b174b155b61..7de8c52793cd3b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -289,8 +289,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) +static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); /* @@ -308,18 +307,17 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) +static inline void io_poll_execute(struct io_kiocb *req, int res) { if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); + __io_poll_execute(req, res); } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); + io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) @@ -334,7 +332,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (unlikely(mask & POLLFREE)) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); + io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already @@ -369,7 +367,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, else req->flags &= ~REQ_F_SINGLE_POLL; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); } return 1; } @@ -487,7 +485,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->apoll_events |= EPOLLONESHOT; ipt->error = 0; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); return 0; } @@ -497,7 +495,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, */ v = atomic_dec_return(&req->poll_refs); if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); + __io_poll_execute(req, 0); return 0; } From 5204aa8c43bd1c3428b8979229183ae8269a8c09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:46 +0100 Subject: [PATCH 127/172] io_uring: add a helper for apoll 
alloc

Extract a helper function for apoll allocation, which makes the code
easier to read.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/2f93282b47dd678e805dd0d7097f66968ced495c.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 7de8c52793cd3b..aef77f2a8a9a76 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -508,10 +508,33 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
 
+static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
+					     unsigned issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct async_poll *apoll;
+
+	if (req->flags & REQ_F_POLLED) {
+		apoll = req->apoll;
+		kfree(apoll->double_poll);
+	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+		   !list_empty(&ctx->apoll_cache)) {
+		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+					 poll.wait.entry);
+		list_del_init(&apoll->poll.wait.entry);
+	} else {
+		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+		if (unlikely(!apoll))
+			return NULL;
+	}
+	apoll->double_poll = NULL;
+	req->apoll = apoll;
+	return apoll;
+}
+
 int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
-	struct io_ring_ctx *ctx = req->ctx;
 	struct async_poll *apoll;
 	struct io_poll_table ipt;
 	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
@@ -546,21 +569,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	}
 	if (def->poll_exclusive)
 		mask |= EPOLLEXCLUSIVE;
-	if (req->flags & REQ_F_POLLED) {
-		apoll = req->apoll;
-		kfree(apoll->double_poll);
-	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
-		   !list_empty(&ctx->apoll_cache)) {
-		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
-					 poll.wait.entry);
-		list_del_init(&apoll->poll.wait.entry);
-	} else {
-		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
-		if (unlikely(!apoll))
-			return IO_APOLL_ABORTED;
-	}
-	apoll->double_poll = NULL;
-	req->apoll = apoll;
+
+	apoll = io_req_alloc_apoll(req, issue_flags);
+	if (!apoll)
+		return IO_APOLL_ABORTED;
 	req->flags |= REQ_F_POLLED;
 	ipt.pt._qproc = io_async_queue_proc;

From 063a007996bf725ba4c7d8741701670be9858300 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 23 Jun 2022 14:24:47 +0100
Subject: [PATCH 128/172] io_uring: change arm poll return values

The rules for __io_arm_poll_handler()'s result parsing are complicated:
as a first step, don't pass back a mask but a positive return code, and
fill ipt->result_mask.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/529e29e9f97f2e6e383ccd44234d8b576a83a921.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index aef77f2a8a9a76..80113b036c88d8 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -34,6 +34,8 @@ struct io_poll_table {
 	struct io_kiocb *req;
 	int nr_entries;
 	int error;
+	/* output value, set only if arm poll returns >0 */
+	__poll_t result_mask;
 };
 
 #define IO_POLL_CANCEL_FLAG	BIT(31)
@@ -462,8 +464,9 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	if (mask &&
 	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
 		io_poll_remove_entries(req);
+		ipt->result_mask = mask;
 		/* no one else has access to the req, forget about the ref */
-		return mask;
+		return 1;
 	}
 
 	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
@@ -813,7 +816,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events);
 	if (ret) {
-		io_req_set_res(req, ret, 0);
+		io_req_set_res(req, ipt.result_mask, 0);
 		return IOU_OK;
 	}
 	if (ipt.error) {

From de08356f4858628fdffb8bd7e9cceb60c7e08ead Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 23 Jun 2022 14:24:48 +0100
Subject: [PATCH 129/172] io_uring: refactor poll arm error handling

__io_arm_poll_handler()'s error parsing is a horror: when it fails it
returns 0 and the caller is expected to look at ipt.error, which has
already led us to a number of problems before.

When it returns a valid mask, leave that as it is now, i.e. return 1
and store the mask in ipt.result_mask. In case of a failure that can be
handled inline, return an error code (negative value); return 0 if
__io_arm_poll_handler() took ownership of the request and will complete
it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/018cacdaef5fe95d7dc56b32e85d752cab7607f6.1655990418.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 80113b036c88d8..3f3ae3b1505fa4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -435,6 +435,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 					(struct io_poll **) &pt->req->async_data);
 }
 
+/*
+ * Returns 0 when it's handed over for polling. The caller owns the request if
+ * it returns non-zero, but otherwise should not touch it. Negative values
+ * contain an error code. When the result is >0, the polling has completed
+ * inline and ipt.result_mask is set to the mask.
+ */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) @@ -461,6 +467,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; + if (unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + + if (mask && (poll->events & EPOLLET)) { + ipt->result_mask = mask; + return 1; + } else { + return ipt->error ?: -EINVAL; + } + } + if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); @@ -469,25 +486,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 1; } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - if (req->flags & REQ_F_HASH_LOCKED) io_poll_req_insert_locked(req); else io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } __io_poll_execute(req, mask); return 0; } @@ -582,9 +586,8 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - + if (ret) + return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -815,16 +818,11 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { + if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; + return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) From 49f1c68e048f1706b71c8255faf8110113d1cc48 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:49 +0100 Subject: [PATCH 130/172] io_uring: optimise submission side poll_refs The final poll_refs put in __io_arm_poll_handler() takes quite some cycles. When we're arming from the original task context task_work won't be run, so in this case we can assume that we won't race with task_works and so not take the initial ownership ref. One caveat is that after arming a poll we may race with it, so we have to add a bunch of io_poll_get_ownership() hidden inside of io_poll_can_finish_inline() whenever we want to complete arming inline. For the same reason we can't just set REQ_F_DOUBLE_POLL in __io_queue_proc() and so need to sync with the first poll entry by taking its wq head lock. 
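For reference, poll_refs doubles as a reference count and an ownership
token: whoever bumps the masked count from zero owns the request. The
helper this patch keeps calling looks roughly like the following
(editorial paraphrase, not part of this diff):

	static inline bool io_poll_get_ownership(struct io_kiocb *req)
	{
		/*
		 * True iff the masked refcount was zero before the increment,
		 * i.e. the caller took ownership; otherwise the current owner
		 * will spot the extra reference and act on the caller's behalf.
		 */
		return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
	}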
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8825315d7f5e182ac1578a031e546f79b1c97d01.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 88 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3f3ae3b1505fa4..eba767594deee5 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -34,6 +34,7 @@ struct io_poll_table { struct io_kiocb *req; int nr_entries; int error; + bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; @@ -374,6 +375,27 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } +static void io_poll_double_prepare(struct io_kiocb *req) +{ + struct wait_queue_head *head; + struct io_poll *poll = io_poll_get_single(req); + + /* head is RCU protected, see io_poll_remove_entries() comments */ + rcu_read_lock(); + head = smp_load_acquire(&poll->head); + if (head) { + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. + */ + spin_lock_irq(&head->lock); + req->flags |= REQ_F_DOUBLE_POLL; + spin_unlock_irq(&head->lock); + } + rcu_read_unlock(); +} + static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) @@ -405,16 +427,19 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + + io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; - req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; + } else { + /* fine to modify, there is no poll queued to race with us */ + req->flags |= REQ_F_SINGLE_POLL; } - req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; @@ -435,6 +460,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, (struct io_poll **) &pt->req->async_data); } +static bool io_poll_can_finish_inline(struct io_kiocb *req, + struct io_poll_table *pt) +{ + return pt->owning || io_poll_get_ownership(req); +} + /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. Negative values @@ -443,7 +474,8 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) + struct io_poll_table *ipt, __poll_t mask, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int v; @@ -452,34 +484,45 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). + * Polling is either completed here or via task_work, so if we're in the + * task context we're naturally serialised with tw by merit of running + * the same task. When it's io-wq, take the ownership to prevent tw + * from running. 
However, when we're in the task context, skip taking + * it as an optimisation. + * + * Note: even though the request won't be completed/freed, without + * ownership we still can race with io_poll_wake(). + * io_poll_can_finish_inline() tries to deal with that. */ - atomic_set(&req->poll_refs, 1); + ipt->owning = issue_flags & IO_URING_F_UNLOCKED; + + atomic_set(&req->poll_refs, (int)ipt->owning); mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); - if (mask && (poll->events & EPOLLET)) { + if (!io_poll_can_finish_inline(req, ipt)) { + io_poll_mark_cancelled(req); + return 0; + } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; - } else { - return ipt->error ?: -EINVAL; } + return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { + if (!io_poll_can_finish_inline(req, ipt)) + return 0; io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ @@ -491,18 +534,21 @@ static int __io_arm_poll_handler(struct io_kiocb *req, else io_poll_req_insert(req); - if (mask && (poll->events & EPOLLET)) { + if (mask && (poll->events & EPOLLET) && + io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0); + if (ipt->owning) { + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + } return 0; } @@ -585,7 +631,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); @@ -817,7 +863,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) else req->flags &= ~REQ_F_HASH_LOCKED; - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; From 795bbbc8a9a1bbbafce762c706bfb5733c9d0426 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 23 Jun 2022 21:01:26 +0800 Subject: [PATCH 131/172] io_uring: kbuf: inline io_kbuf_recycle_ring() Make io_kbuf_recycle_ring() inline since it is the fast path of provided buffer. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220623130126.179232-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 28 ---------------------------- io_uring/kbuf.h | 28 +++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8bf47e49ea5bc7..5e00f16e89b866 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -74,34 +74,6 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return; } -void io_kbuf_recycle_ring(struct io_kiocb *req) -{ - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. 
- * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. - */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; -} - unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { unsigned int cflags; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 721465c5d809fc..b3e8c6c5fee143 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -49,7 +49,33 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void io_kbuf_recycle_ring(struct io_kiocb *req); + +static inline void io_kbuf_recycle_ring(struct io_kiocb *req) +{ + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } +} static inline bool io_do_buffer_select(struct io_kiocb *req) { From fe991a7688f894a74a6f6b4933bf6cd5fa086c1b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Jun 2022 14:34:15 -0600 Subject: [PATCH 132/172] io_uring: move POLLFREE handling to separate function We really don't care about this at all in terms of performance. Outside of having it already be marked unlikely(), shove it into a separate __cold function. Signed-off-by: Jens Axboe --- io_uring/poll.c | 50 ++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index eba767594deee5..fa25b88a7b9336 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -325,6 +325,31 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) +static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) +{ + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. 
+ */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -332,29 +357,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. - */ - smp_store_release(&poll->head, NULL); - return 1; - } + if (unlikely(mask & POLLFREE)) + return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) From 37c7bd31b3e9e4b6aee3c5227f789c0b586a33a2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:58 +0100 Subject: [PATCH 133/172] io_uring: improve io_fail_links() io_fail_links() is called with ->completion_lock held and for that reason we'd want to keep it as small as we can. Instead of doing __io_req_complete_post() for each linked request under the lock, fail them in a task_work handler under ->uring_lock. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a2f68708b970a21f4e84ddfa7b3abd67a8fffb27.1656153285.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4af074b8f6b7d7..2f9e5693547931 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,32 +101,44 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) +static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) { - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; + io_tw_lock(link->ctx, locked); while (link) { + struct io_kiocb *nxt = link->link; long res = -ECANCELED; if (link->flags & REQ_F_FAIL) res = link->cqe.res; - - nxt = link->link; link->link = NULL; + io_req_set_res(link, res, 0); + io_req_task_complete(link, locked); + link = nxt; + } +} - trace_io_uring_fail_link(req, link); +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + if (!link) + return; + while (link) { if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; + trace_io_uring_fail_link(req, link); + link = link->link; } + + link = req->link; + link->io_task_work.func = io_req_tw_fail_links; + io_req_task_work_add(link); + req->link = NULL; } static inline void io_remove_next_linked(struct io_kiocb *req) From 3218e5d32dbcf1b9c6dc589eca21deebb14215fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:59 +0100 Subject: [PATCH 134/172] io_uring: fuse fallback_node and normal tw node Now as both normal and fallback paths use llist, just keep one node head in struct io_task_work and kill off ->fallback_node. 
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d04ebde409f7b162fe247b361b4486b193293e46.1656153285.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 5 +----
 io_uring/io_uring.c            | 5 ++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 918165a200533f..3ca8f363f5046c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -427,10 +427,7 @@ enum {
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
-	union {
-		struct llist_node	node;
-		struct llist_node	fallback_node;
-	};
+	struct llist_node		node;
 	io_req_tw_func_t		func;
 };
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 45538b3c3a764e..86a0b0c6f5bf77 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -233,7 +233,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 	bool locked = false;
 
 	percpu_ref_get(&ctx->refs);
-	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &locked);
 
 	if (locked) {
@@ -1091,13 +1091,12 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
-
 	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
 		node = node->next;
-		if (llist_add(&req->io_task_work.fallback_node,
+		if (llist_add(&req->io_task_work.node,
 			      &req->ctx->fallback_llist))
 			schedule_delayed_work(&req->ctx->fallback_work, 1);
 	}

From ad8b261d837400cb7ccc339e81d7c35ab018acd8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 25 Jun 2022 11:53:00 +0100
Subject: [PATCH 135/172] io_uring: remove extra TIF_NOTIFY_SIGNAL check

io_run_task_work() accounts for TIF_NOTIFY_SIGNAL, so no need to have a
second check in io_run_task_work_sig().

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/52ce41a592ad904511697f432141e5690fd4b968.1656153285.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 86a0b0c6f5bf77..f40526426db813 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2205,8 +2205,6 @@ int io_run_task_work_sig(void)
 {
 	if (io_run_task_work())
 		return 1;
-	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
-		return -ERESTARTSYS;
 	if (task_sigpending(current))
 		return -EINTR;
 	return 0;

From 3273c4407acd4348b1531e1f860fbf1da942893b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 25 Jun 2022 11:53:01 +0100
Subject: [PATCH 136/172] io_uring: don't check file ops of registered rings

Registered rings are by definition io_uring files, so we don't need to
additionally verify them.
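On the user side this pairs with IORING_ENTER_REGISTERED_RING, where the
fd argument of io_uring_enter() is an index into the registered-ring
table rather than a normal descriptor; roughly (editorial sketch,
assuming the ring fd was registered at index 0 beforehand, e.g. with
liburing's io_uring_register_ring_fd()):

	/* first argument is the registered index, not a file descriptor */
	ret = syscall(__NR_io_uring_enter, 0, to_submit, min_complete,
		      flags | IORING_ENTER_REGISTERED_RING, NULL, 0);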
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/425cd64fd885b8e329a46c205ee811987691baaf.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f40526426db813..e1e8dcd17df354 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3036,22 +3036,22 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; - if (!tctx || fd >= IO_RINGFD_REG_MAX) + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; } else { f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (unlikely(!io_is_uring_fops(f.file))) + goto out_fput; } - if (unlikely(!f.file)) - return -EBADF; - - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(f.file))) - goto out_fput; - ret = -ENXIO; ctx = f.file->private_data; if (unlikely(!percpu_ref_tryget(&ctx->refs))) From fbb8bb02911790147ea936f3b1c5ceb1be54bf34 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:53:02 +0100 Subject: [PATCH 137/172] io_uring: remove ctx->refs pinning on enter io_uring_enter() takes ctx->refs, which was previously preventing racing with register quiesce. However, as register now doesn't touch the refs, we can freely kill extra ctx pinning and rely on the fact that we're holding a file reference preventing the ring from being destroyed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a11c57ad33a1be53541fce90669c1b79cf4d8940.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e1e8dcd17df354..070ee9ec9ee721 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3049,14 +3049,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(f.file))) - goto out_fput; + goto out; } - ret = -ENXIO; ctx = f.file->private_data; - if (unlikely(!percpu_ref_tryget(&ctx->refs))) - goto out_fput; - ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; @@ -3141,10 +3137,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, &ctx->check_cq); } } - out: - percpu_ref_put(&ctx->refs); -out_fput: fdput(f); return ret; } @@ -3730,11 +3723,10 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, int ret; /* - * We're inside the ring mutex, if the ref is already dying, then - * someone else killed the ctx or is already going through - * io_uring_register(). + * We don't quiesce the refs for register anymore and so it can't be + * dying as we're holding a file ref here. */ - if (percpu_ref_is_dying(&ctx->refs)) + if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->restricted) { From 8fcf4c48f44bd7b1b75db139f56ff1ad6477379e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Tue, 28 Jun 2022 21:33:20 +0200 Subject: [PATCH 138/172] io_uring: replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 094f706c93e0b9..8fe0275cdaf311 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -495,7 +495,7 @@ struct io_uring_probe { __u8 ops_len; /* length of ops[] array below */ __u16 resv; __u32 resv2[3]; - struct io_uring_probe_op ops[0]; + struct io_uring_probe_op ops[]; }; struct io_uring_restriction { From f110ed8498afa6ff8e9a8c08fb26880e02117616 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:42:56 -0600 Subject: [PATCH 139/172] io_uring: split out fixed file installation and removal Put it with the filetable code, which is where it belongs. While doing so, have the helpers take a ctx rather than an io_kiocb. It doesn't make sense to use a request, as it's not an operation on the request itself. It applies to the ring itself. Signed-off-by: Jens Axboe --- io_uring/filetable.c | 66 ++++++++++++++++++++++++++++++++++---------- io_uring/filetable.h | 3 ++ io_uring/openclose.c | 35 +++-------------------- io_uring/openclose.h | 2 +- io_uring/rsrc.c | 2 +- 5 files changed, 60 insertions(+), 48 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 534e1a3c625d9f..abaa5ba7f6552a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -58,11 +58,10 @@ void io_free_file_tables(struct io_file_table *table) table->bitmap = NULL; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, + u32 slot_index) __must_hold(&req->ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -108,34 +107,71 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. 
- */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; int ret; - io_ring_submit_lock(ctx, issue_flags); - if (alloc_slot) { ret = io_file_bitmap_get(ctx); if (unlikely(ret < 0)) - goto err; + return ret; file_slot = ret; } else { file_slot--; } - ret = io_install_fixed_file(req, file, issue_flags, file_slot); + ret = io_install_fixed_file(ctx, file, file_slot); if (!ret && alloc_slot) ret = file_slot; -err: + return ret; +} +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_fixed_fd_install(ctx, file, file_slot); io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) fput(file); return ret; } + +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) +{ + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + if (unlikely(!ctx->file_data)) + return -ENXIO; + if (offset >= ctx->nr_user_files) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + if (!file_slot->file_ptr) + return -EBADF; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + return ret; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); + io_rsrc_node_switch(ctx, ctx->file_data); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fb5a274c08ffcc..79eb50c1980e84 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -29,6 +29,9 @@ void io_free_file_tables(struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot); +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 099a5ec84dfdb7..d1818ec9169ba2 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -173,42 +173,15 @@ void io_open_cleanup(struct io_kiocb *req) putname(open->filename); } -int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; int ret; io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, 
ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: + ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); + return ret; } @@ -216,7 +189,7 @@ static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req); - return __io_close_fixed(req, issue_flags, close->file_slot - 1); + return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 9f578f3fad870d..4b1c28d3a66c3f 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset); int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1106089551595e..706fa020505b1a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -703,7 +703,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); + __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } From e6130eba8a848a7a6ba6c534bd8f6d60749ae1a9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:47:02 -0600 Subject: [PATCH 140/172] io_uring: add support for passing fixed file descriptors With IORING_OP_MSG_RING, one ring can send a message to another ring. Extend that support to also allow sending a fixed file descriptor to that ring, enabling one ring to pass a registered descriptor to another one. Arguments are extended to pass in: sqe->addr3 fixed file slot in source ring sqe->file_index fixed file slot in destination ring IORING_OP_MSG_RING is extended to take a command argument in sqe->addr. If set to zero (or IORING_MSG_DATA), it sends just a message like before. If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according to the above arguments. Two common use cases for this are: 1) A server needs to be shut down or restarted; pass its file descriptors to another one 2) The backend is split, and one ring accepts connections, while others then get the fd passed and handle the actual connection. Both of those are classic SCM_RIGHTS use cases, and it's not possible to support them with direct descriptors today. By default, this will post a CQE to the target ring, similarly to how IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message is posted to the target ring. The issuer is expected to notify the receiver side separately.
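For illustration, a minimal sketch of preparing such an SQE (not part of the patch; raw SQE fields, assuming liburing's io_uring_get_sqe() and an already set up target ring):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_ring_fd;		/* fd of the receiving ring */
	sqe->addr = IORING_MSG_SEND_FD;		/* command */
	sqe->addr3 = 3;				/* source fixed file slot */
	sqe->file_index = 7 + 1;		/* destination slot 7; as with other
						 * fixed-file installs, file_index is slot + 1 */
	sqe->off = 0xcafe;			/* user_data of the CQE posted to the target */
	sqe->user_data = 1;			/* completion tag on the sending ring */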
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 +++++ io_uring/msg_ring.c | 130 ++++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8fe0275cdaf311..f378eabbff21b6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -51,6 +51,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; + __u32 msg_ring_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -270,6 +271,22 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +/* + * IORING_OP_MSG_RING command types, stored in sqe->addr + */ +enum { + IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ + IORING_MSG_SEND_FD, /* send a registered fd to another ring */ +}; + +/* + * IORING_OP_MSG_RING flags (sqe->msg_ring_flags) + * + * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not + * applicable for IORING_MSG_DATA, obviously. + */ +#define IORING_MSG_RING_CQE_SKIP (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index b02be23496521c..939205b30c8b62 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -3,46 +3,162 @@ #include #include #include +#include #include #include #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" #include "msg_ring.h" struct io_msg { struct file *file; u64 user_data; u32 len; + u32 cmd; + u32 src_fd; + u32 dst_fd; + u32 flags; }; +static int io_msg_ring_data(struct io_kiocb *req) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (msg->src_fd || msg->dst_fd || msg->flags) + return -EINVAL; + + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + return 0; + + return -EOVERFLOW; +} + +static void io_double_unlock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); + mutex_unlock(&octx->uring_lock); +} + +static int io_double_lock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + /* + * To ensure proper ordering between the two ctxs, we can only + * attempt a trylock on the target. If that fails and we already have + * the source ctx lock, punt to io-wq. + */ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + if (!mutex_trylock(&octx->uring_lock)) + return -EAGAIN; + return 0; + } + + /* Always grab smallest value ctx first. We know ctx != octx. 
*/ + if (ctx < octx) { + mutex_lock(&ctx->uring_lock); + mutex_lock(&octx->uring_lock); + } else { + mutex_lock(&octx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + + return 0; +} + +static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long file_ptr; + struct file *src_file; + int ret; + + if (target_ctx == ctx) + return -EINVAL; + + ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); + if (unlikely(ret)) + return ret; + + ret = -EBADF; + if (unlikely(msg->src_fd >= ctx->nr_user_files)) + goto out_unlock; + + msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + src_file = (struct file *) (file_ptr & FFS_MASK); + get_file(src_file); + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); + if (ret < 0) { + fput(src_file); + goto out_unlock; + } + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; + + /* + * If this fails, the target still received the file descriptor but + * wasn't notified of the fact. This means that if this request + * completes with -EOVERFLOW, then the sender must ensure that a + * later IORING_OP_MSG_RING delivers the message. + */ + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + ret = -EOVERFLOW; +out_unlock: + io_double_unlock_ctx(ctx, target_ctx, issue_flags); + return ret; +} + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req); - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) + if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); + msg->cmd = READ_ONCE(sqe->addr); + msg->src_fd = READ_ONCE(sqe->addr3); + msg->dst_fd = READ_ONCE(sqe->file_index); + msg->flags = READ_ONCE(sqe->msg_ring_flags); + if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + return -EINVAL; + return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) - ret = 0; + switch (msg->cmd) { + case IORING_MSG_DATA: + ret = io_msg_ring_data(req); + break; + case IORING_MSG_SEND_FD: + ret = io_msg_send_fd(req, issue_flags); + break; + default: + ret = -EINVAL; + break; + } done: if (ret < 0) From 6e73dffbb93cb8797cd4e42e98d837edf0f1a967 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:55:38 +0100 Subject: [PATCH 141/172] io_uring: allow setting a range for file slot allocation io_uring recently gained an option to allocate a file index for operations registering fixed files. However, it's unusable with mixed approaches where userspace knows better where to place some of the files: allocation may race with explicit placement, and users have no sane way to pick a slot and hope it will not be taken. Let userspace register a range of fixed file slots in which the auto-allocation happens. The use case is splitting the fixed table in two parts, where one of them is used for auto-allocation and the other for slot-specified operations.
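For illustration, a minimal sketch of the new registration (not part of the patch; raw syscall, error handling elided), reserving slots [0, 64) for explicit placement and leaving [64, 192) to the allocator:

	struct io_uring_file_index_range range = {
		.off = 64,	/* first slot the allocator may hand out */
		.len = 128,	/* range is [off, off + len) */
	};

	/* nr_args must be 0 for this opcode */
	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILE_ALLOC_RANGE,
		&range, 0);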
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66ab0394e436f38437cf7c44676e1920d09687ad.1656154403.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 13 +++++++++++++ io_uring/filetable.c | 24 ++++++++++++++++++++---- io_uring/filetable.h | 20 +++++++++++++++++--- io_uring/io_uring.c | 6 ++++++ io_uring/rsrc.c | 2 ++ 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3ca8f363f5046c..26ef11e978d403 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -233,6 +233,9 @@ struct io_ring_ctx { unsigned long check_cq; + unsigned int file_alloc_start; + unsigned int file_alloc_end; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f378eabbff21b6..cf95354198a393 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,9 @@ enum { /* sync cancelation API */ IORING_REGISTER_SYNC_CANCEL = 24, + /* register a range of fixed file slots for automatic slot allocation */ + IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* this goes last */ IORING_REGISTER_LAST }; @@ -595,4 +598,14 @@ struct io_uring_sync_cancel_reg { __u64 pad[4]; }; +/* + * Argument for IORING_REGISTER_FILE_ALLOC_RANGE + * The range is specified as [off, off + len) + */ +struct io_uring_file_index_range { + __u32 off; + __u32 len; + __u64 resv; +}; + #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index abaa5ba7f6552a..7b473259f3f45a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -16,7 +16,7 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; + unsigned long nr = ctx->file_alloc_end; int ret; do { @@ -24,11 +24,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (ret != nr) return ret; - if (!table->alloc_hint) + if (table->alloc_hint == ctx->file_alloc_start) break; - nr = table->alloc_hint; - table->alloc_hint = 0; + table->alloc_hint = ctx->file_alloc_start; } while (1); return -ENFILE; @@ -175,3 +174,20 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) io_rsrc_node_switch(ctx, ctx->file_data); return 0; } + +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg) +{ + struct io_uring_file_index_range range; + u32 end; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (check_add_overflow(range.off, range.len, &end)) + return -EOVERFLOW; + if (range.resv || end > ctx->nr_user_files) + return -EINVAL; + + io_file_table_set_alloc_range(ctx, range.off, range.len); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 79eb50c1980e84..ff3a712e11bf33 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,9 +3,7 @@ #define IOU_FILE_TABLE_H #include - -struct io_ring_ctx; -struct io_kiocb; +#include /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -33,6 +31,9 @@ int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg); + unsigned int io_file_get_flags(struct file *file); static inline void 
io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -71,4 +72,17 @@ static inline void io_fixed_file_set(struct io_fixed_file *file_slot, file_slot->file_ptr = file_ptr; } +static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) +{ + ctx->file_table.alloc_hint = ctx->file_alloc_start; +} + +static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, + unsigned off, unsigned len) +{ + ctx->file_alloc_start = off; + ctx->file_alloc_end = off + len; + io_reset_alloc_hint(ctx); +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 070ee9ec9ee721..745264938a489f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3866,6 +3866,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sync_cancel(ctx, arg); break; + case IORING_REGISTER_FILE_ALLOC_RANGE: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_file_alloc_range(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 706fa020505b1a..d2e589c703d063 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1012,6 +1012,8 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_bitmap_set(&ctx->file_table, i); } + /* default it to the whole table */ + io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); io_rsrc_node_switch(ctx, NULL); return 0; fail: From b8c015598c8ef9195b8a2a5089e275c4f64ca999 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:20 -0700 Subject: [PATCH 142/172] io_uring: allow 0 length for buffer select If user gives 0 for length, we can set it from the available buffer size. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 5e00f16e89b866..e538fa7cb727cf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -115,7 +115,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); - if (*len > kbuf->len) + if (*len == 0 || *len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; @@ -145,7 +145,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, buf = page_address(bl->buf_pages[index]); buf += off; } - if (*len > buf->len) + if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING; req->buf_list = bl; From 32f3c434b14238a2eee0c726a1918de58c07faf6 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:21 -0700 Subject: [PATCH 143/172] io_uring: restore bgid in io_put_kbuf Attempt to restore bgid. This is needed when recycling unused buffers as the next time around it will want the correct bgid. 
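In effect, the helper has to compute the CQE flags from the buffer ID before buf_index is repurposed to hold the group ID, roughly (a sketch of the ordering, not the full diff):

	/* buf_index currently holds the buffer ID (bid) for the CQE */
	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
	/* restore the group ID (bgid) so a later recycle finds the right list */
	req->buf_index = req->kbuf->bgid;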
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b3e8c6c5fee143..d6af208d109ffe 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -103,16 +103,21 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { + unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; req->buf_list->head++; + } req->flags &= ~REQ_F_BUFFER_RING; } else { + req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); req->flags &= ~REQ_F_BUFFER_SELECTED; } - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + return ret; } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) From 5702196e7d9d1232c41769e197eb33ba78a9b463 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:22 -0700 Subject: [PATCH 144/172] io_uring: allow iov_len = 0 for recvmsg and buffer select When using BUFFER_SELECT there is no technical requirement that the user actually provides iov, and this removes one copy_from_user call. So allow iov_len to be 0. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-4-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b77bfbfb081673..06eaef9f97bec5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -300,12 +300,18 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) + if (iov_len == 0) { + sr->len = iomsg->fast_iov[0].iov_len = 0; + iomsg->fast_iov[0].iov_base = NULL; + iomsg->free_iov = NULL; + } else if (iov_len > 1) { return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + } else { + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, From d4e097dae29c24bf33a5056a2a43cff2e45c4978 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:23 -0700 Subject: [PATCH 145/172] io_uring: recycle buffers on error Rather than passing an error back to the user with a buffer attached, recycle the buffer immediately. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 06eaef9f97bec5..e4422dff07048a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -481,10 +481,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; @@ -557,10 +560,13 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; From 2ba69707d9153eeca1ee8ac1bc55376b88978842 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:24 -0700 Subject: [PATCH 146/172] io_uring: clean up io_poll_check_events return values The values returned are a bit confusing, where 0 and 1 have implied meaning, so add some definitions for them. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index fa25b88a7b9336..922a3d1b2e31d1 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -192,13 +192,18 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +enum { + IOU_POLL_DONE = 0, + IOU_POLL_NO_ACTION = 1, +}; + /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. + * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, + * which is either spurious wakeup or multishot CQE is served. + * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. 
*/ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -214,10 +219,11 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) /* tw handler should be the owner, and so have some references */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; + return IOU_POLL_DONE; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; + /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; @@ -226,7 +232,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if ((unlikely(!req->cqe.res))) continue; if (req->apoll_events & EPOLLONESHOT) - return 0; + return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { @@ -238,7 +244,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); - if (ret) + if (ret < 0) return ret; } @@ -248,7 +254,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) */ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - return 1; + return IOU_POLL_NO_ACTION; } static void io_poll_task_func(struct io_kiocb *req, bool *locked) @@ -256,12 +262,11 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == IOU_POLL_NO_ACTION) return; - if (!ret) { + if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); - req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; @@ -280,7 +285,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); From 114eccdf0e368893b3d92e06e9788d9d94876853 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:25 -0700 Subject: [PATCH 147/172] io_uring: add IOU_STOP_MULTISHOT return code For multishot we want a way to signal the caller that multishot has ended, even though this might not be an error return. For example sockets return 0 when closed, which should end a multishot recv, but still have a CQE with result 0. Introduce IOU_STOP_MULTISHOT, which does this and indicates that the return code is stored inside req->cqe. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-7-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 7 +++++++ io_uring/poll.c | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f77e4a5403e4e5..e8da70781fa34b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,6 +15,13 @@ enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, + + /* + * Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT + * are set to indicate to the poll runner that multishot should be + * removed and the result is set on req->cqe.res.
+ */ + IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 922a3d1b2e31d1..64d426d696abc8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -195,6 +195,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, + IOU_POLL_REMOVE_POLL_USE_RES = 2, }; /* @@ -204,6 +205,8 @@ enum { * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, * which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. + * IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result + * is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -244,6 +247,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); + if (ret == IOU_STOP_MULTISHOT) + return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) return ret; } @@ -268,7 +273,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { + } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } @@ -291,7 +296,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); - if (!ret) + if (ret == IOU_POLL_REMOVE_POLL_USE_RES) + io_req_complete_post(req); + else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else io_req_complete_failed(req, ret); From 52120f0fadcbdaaa981c19327f1865a714e85268 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:26 -0700 Subject: [PATCH 148/172] io_uring: add allow_overflow to io_post_aux_cqe Some use cases of io_post_aux_cqe would not want to overflow as is, but might want to change the flags/result. For example multishot receive requires in order CQE, and so if there is an overflow it would need to stop receiving until the overflow is taken care of. 
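Conceptually, callers that need strictly ordered CQEs can now opt out of overflow and handle a full CQ ring themselves, a pattern the following patches adopt (sketch):

	if (!io_post_aux_cqe(ctx, user_data, res, IORING_CQE_F_MORE, false)) {
		/* CQ ring full: stop the multishot rather than reorder via overflow */
	}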
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-8-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++++++---- io_uring/io_uring.h | 3 ++- io_uring/msg_ring.c | 4 ++-- io_uring/net.c | 2 +- io_uring/poll.c | 2 +- io_uring/rsrc.c | 4 ++-- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 745264938a489f..523b6ebad15a30 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -736,7 +736,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -760,16 +761,21 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, } return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + if (allow_overflow) + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + return false; } bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; io_cq_lock(ctx); - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); io_cq_unlock_post(ctx); return filled; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e8da70781fa34b..e022d71c177a46 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,7 +31,8 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 939205b30c8b62..753d16734319a4 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) return 0; return -EOVERFLOW; @@ -113,7 +113,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. 
*/ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/net.c b/io_uring/net.c index e4422dff07048a..601955fdb124f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -658,7 +658,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; } diff --git a/io_uring/poll.c b/io_uring/poll.c index 64d426d696abc8..e8f922a4f6d754 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,7 +243,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE)) + mask, IORING_CQE_F_MORE, true)) return -ECANCELED; } else { ret = io_poll_issue(req, locked); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d2e589c703d063..0250c13ae1cdd0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -175,10 +175,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); } } From a2da676376feb79224eacb9ac1f554bb3232b5de Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:27 -0700 Subject: [PATCH 149/172] io_uring: fix multishot poll on overflow On overflow, multishot poll can still complete with the IORING_CQE_F_MORE flag set. If in the meantime the user clears a CQE and the poll is cancelled, then the poll will post a CQE without IORING_CQE_F_MORE (and likely with result -ECANCELED). However, when processing completions, the application will encounter the non-overflow CQE first, which indicates that there will be no more events posted. Typical userspace applications would free memory associated with the poll in this case. It will then subsequently receive the earlier CQE which has overflowed, which breaks the contract given by the IORING_CQE_F_MORE flag. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index e8f922a4f6d754..57747d92bba427 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,8 +243,10 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, true)) - return -ECANCELED; + mask, IORING_CQE_F_MORE, false)) { + io_req_set_res(req, mask, 0); + return IOU_POLL_REMOVE_POLL_USE_RES; + } } else { ret = io_poll_issue(req, locked); if (ret == IOU_STOP_MULTISHOT) From cbd25748545c709d35734deb9220e0af0a69e5d2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:28 -0700 Subject: [PATCH 150/172] io_uring: fix multishot accept ordering Similar to multishot poll, drop multishot accept when CQE overflow occurs.
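For illustration, the resulting userspace contract for multishot accept (not part of the patch; assumes liburing's multishot accept helper): one SQE produces a CQE per connection until the kernel drops the request, signalled by a CQE without IORING_CQE_F_MORE.

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_submit(&ring);

	/* per completion: */
	if (!(cqe->flags & IORING_CQE_F_MORE)) {
		/* multishot ended (error or dropped on CQ overflow): re-arm */
	}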
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-10-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 601955fdb124f7..e1eaf902f3b201 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -656,11 +656,14 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } - if (ret < 0) - return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (ret >= 0 && + io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false)) goto retry; - return -ECANCELED; + + io_req_set_res(req, ret, 0); + if (req->flags & REQ_F_POLLED) + return IOU_STOP_MULTISHOT; + return IOU_OK; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From b3fdea6ecb55c3ceea866ff66486927e51a982b3 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:29 -0700 Subject: [PATCH 151/172] io_uring: multishot recv Support multishot receive for io_uring. Typical server applications will run a loop where for each recv CQE they requeue another recv/recvmsg. This can be simplified by using the existing multishot functionality combined with io_uring's provided buffers. The API is to add the IORING_RECV_MULTISHOT flag to the SQE. CQEs will then be posted (with the IORING_CQE_F_MORE flag set) when data is available and has been read. Once an error occurs or the socket ends, the multishot will be removed and a completion without IORING_CQE_F_MORE will be posted. The benefit to this is that the recv is much more performant: * Subsequent receives are queued up straight away without requiring the application to finish a processing loop. * If there is more data in the socket (say the provided buffer size is smaller than the socket buffer) then the data is immediately returned, improving batching. * Poll is only armed once and reused, saving CPU cycles
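A minimal sketch of the API described above (illustrative only; raw SQE fields, assuming a provided-buffer group bgid has been registered beforehand):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_recv(sqe, sock_fd, NULL, 0, 0);	/* len 0: sized by the buffer */
	sqe->ioprio |= IORING_RECV_MULTISHOT;
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;

	/* each CQE carrying IORING_CQE_F_MORE holds one chunk of data in the
	 * buffer identified by cqe->flags >> IORING_CQE_BUFFER_SHIFT */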
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +#define IORING_RECV_MULTISHOT (1U << 1) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index e1eaf902f3b201..cb08a4b6284096 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -389,6 +389,8 @@ int io_recvmsg_prep_async(struct io_kiocb *req) return ret; } +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) + int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -399,13 +401,22 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~(RECVMSG_FLAGS)) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -415,6 +426,48 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static inline void io_recv_prep_retry(struct io_kiocb *req) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ +} + +/* + * Finishes io_recv and io_recvmsg. + * + * Returns true if it is actually finished, or false if it should run + * again (for multishot). + */ +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +{ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; + } + + if (*ret > 0) { + if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, + cflags | IORING_CQE_F_MORE, false)) { + io_recv_prep_retry(req); + return false; + } + /* + * Otherwise stop multishot but use the current result. 
+ * Probably will end up going into overflow, but this means + * we cannot trust the ordering anymore + */ + } + + io_req_set_res(req, *ret, cflags); + + if (req->flags & REQ_F_POLLED) + *ret = IOU_STOP_MULTISHOT; + return true; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -424,6 +477,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -442,16 +496,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -463,8 +518,15 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -491,8 +553,11 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -505,6 +570,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) @@ -514,16 +580,17 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; } - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -543,8 +610,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) ret = sock_recvmsg(sock, &msg, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -570,8 +643,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = 
io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 9b26e811e934eebda59362c9a03d082852552574 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:30 -0700 Subject: [PATCH 152/172] io_uring: fix io_uring_cqe_overflow trace format Make the trace format consistent with io_uring_complete for cflags Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-12-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 918e3a43e4b280..95a8cfaad15a03 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -594,7 +594,7 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->ocqe = ocqe; ), - TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " + TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) From e0486f3f7c1b6ab7ba7e912bf48d17f04e71d86d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:31 -0700 Subject: [PATCH 153/172] io_uring: only trace one of complete or overflow On overflow we see a duplicate line in the trace, and in some cases 3 lines (if the initial io_post_aux_cqe fails). Instead, just trace once for each CQE. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-13-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/io_uring.h | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 523b6ebad15a30..caf979cd432729 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,7 +742,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe; ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); /* * If we can't get a cq entry, userspace overflowed the @@ -751,6 +750,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, */ cqe = io_get_cqe(ctx); if (likely(cqe)) { + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e022d71c177a46..868f45d55543bc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -101,10 +101,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -113,6 +109,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, cqe = io_get_cqe(ctx); if (unlikely(!cqe)) return io_req_cqe_overflow(req); + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ?
req->extra2 : 0); + memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { From cf0dd9527eee5d6d183fa724fdee6a128cb17b8d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 4 Jul 2022 07:01:06 -0700 Subject: [PATCH 154/172] io_uring: disable multishot recvmsg recvmsg has semantics that do not make it trivial to extend to multishot. Specifically it has user pointers and returns data in the original parameter. In order to make this API useful these will need to be somehow included with the provided buffers. For now remove multishot for recvmsg as it is not useful. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220704140106.200167-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index cb08a4b6284096..6679069eeef162 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -409,6 +409,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -435,7 +437,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv and io_recvmsg. + * Finishes io_recv * * Returns true if it is actually finished, or false if it should run * again (for multishot). @@ -477,7 +479,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -496,17 +497,16 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); -retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &sr->len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; + kmsg->fast_iov[0].iov_len = sr->len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - len); + sr->len); } flags = sr->msg_flags; @@ -518,15 +518,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg); - if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) { - io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return ret; - } + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -554,10 +547,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!io_recv_finish(req, &ret, cflags)) - goto retry_multishot; - - return ret; + io_req_set_res(req, ret, cflags); + return IOU_OK; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) From 7a121ced6e6430d49fb802067b4f020f6df62362 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:14 +0100 Subject: [PATCH 155/172] io_uring: don't miss setting 
REQ_F_DOUBLE_POLL When adding a second poll entry we should set REQ_F_DOUBLE_POLL unconditionally. We might race with the first entry removal but that doesn't change the rule. Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Reported-and-tested-by: syzbot+49950ba66096b1f0209b@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b680d83ded07424db83e8745585e7a6d72826ef.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 57747d92bba427..3710a0a46a8778 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -401,16 +401,18 @@ static void io_poll_double_prepare(struct io_kiocb *req) /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); - if (head) { - /* - * poll arm may not hold ownership and so race with - * io_poll_wake() by modifying req->flags. There is only one - * poll entry queued, serialise with it by taking its head lock. - */ + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. + */ + if (head) spin_lock_irq(&head->lock); - req->flags |= REQ_F_DOUBLE_POLL; + + req->flags |= REQ_F_DOUBLE_POLL; + + if (head) spin_unlock_irq(&head->lock); - } rcu_read_unlock(); } From ceff501790a9789c748a9b851f30f4f7e2fe4d72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:15 +0100 Subject: [PATCH 156/172] io_uring: don't race double poll setting REQ_F_ASYNC_DATA Just as with io_poll_double_prepare() setting REQ_F_DOUBLE_POLL, we can race with the first poll entry when setting REQ_F_ASYNC_DATA. Move it under io_poll_double_prepare(). Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df6920f509c11115aa2bce8b34dc5fdb0eb98920.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3710a0a46a8778..c1359d45a396f2 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -410,6 +410,8 @@ static void io_poll_double_prepare(struct io_kiocb *req) spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; if (head) spin_unlock_irq(&head->lock); @@ -448,13 +450,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } - io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events, first->wait.func); + io_poll_double_prepare(req); *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; From b21a51e26e9aed66159c2c18651da22d585d2998 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:16 +0100 Subject: [PATCH 157/172] io_uring: clear REQ_F_HASH_LOCKED on hash removal Instead of clearing REQ_F_HASH_LOCKED while arming a poll, unset the bit when we're removing the entry from the table in io_poll_tw_hash_eject(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/02e48bb88d6f1480c94ac2924c43ad1fbd48e92a.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index c1359d45a396f2..77b669b0604609 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -132,6 +132,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) */ io_tw_lock(ctx, locked); hash_del(&req->hash_node); + req->flags &= ~REQ_F_HASH_LOCKED; } else { io_poll_req_delete(req, ctx); } @@ -617,9 +618,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (issue_flags & IO_URING_F_UNLOCKED) - req->flags &= ~REQ_F_HASH_LOCKED; - else + if (!(issue_flags & IO_URING_F_UNLOCKED)) req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) @@ -880,8 +879,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (!(issue_flags & IO_URING_F_UNLOCKED) && (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) req->flags |= REQ_F_HASH_LOCKED; - else - req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { From e8375e43ca2d67f31b2408092eee4f91e1e140f4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:17 +0100 Subject: [PATCH 158/172] io_uring: consolidate hash_locked io-wq handling Don't duplicate code disabling REQ_F_HASH_LOCKED for IO_URING_F_UNLOCKED (i.e. io-wq), move the handling into __io_arm_poll_handler(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ff0ffdfaa65b3d536131535c3dad3c63d9b7bb0.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 77b669b0604609..76592063abe7ba 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -523,8 +523,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; - atomic_set(&req->poll_refs, (int)ipt->owning); + + /* io-wq doesn't hold uring_lock */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { @@ -618,8 +622,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) - req->flags |= REQ_F_HASH_LOCKED; + req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; @@ -876,8 +879,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) * If sqpoll or single issuer, there is no contention for ->uring_lock * and we'll end up holding it in tw handlers anyway. 
*/ - if (!(issue_flags & IO_URING_F_UNLOCKED) && - (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) req->flags |= REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); From 9da7471ed10dab52410062be74896a6c0aa1bf3a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 11:18:33 -0600 Subject: [PATCH 159/172] io_uring: move apoll cache to poll.c This is where it's used, move the flush handler in there. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/poll.c | 12 ++++++++++++ io_uring/poll.h | 2 ++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index caf979cd432729..4d1ce58b015e19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2445,18 +2445,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_flush_apoll_cache(struct io_ring_ctx *ctx) -{ - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 76592063abe7ba..052fcb6472083a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -959,3 +959,15 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ + struct async_poll *apoll; + + while (!list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del(&apoll->poll.wait.entry); + kfree(apoll); + } +} diff --git a/io_uring/poll.h b/io_uring/poll.h index c40673d7da0199..95f192c7babbd5 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -30,3 +30,5 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); + +void io_flush_apoll_cache(struct io_ring_ctx *ctx); From 9b797a37c4bd83b03cedcfbd15852b836f5e562c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:16:20 -0600 Subject: [PATCH 160/172] io_uring: add abstraction around apoll cache In preparation for adding limits, and one more user, abstract out the core bits of the allocation+free cache. 
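A rough sketch of the intended usage, assuming a consumer embeds the entry in a union the way async_poll does below (my_obj and my_cache are illustrative names, not part of the patch):

    struct my_obj {
            union {
                    struct io_cache_entry cache;    /* linkage while parked in the cache */
                    struct io_poll poll;            /* live fields reuse the same storage */
            };
    };

    /* allocation side: prefer a cached entry, fall back to the allocator */
    entry = io_alloc_cache_get(&ctx->my_cache);
    if (entry)
            obj = container_of(entry, struct my_obj, cache);
    else
            obj = kmalloc(sizeof(*obj), GFP_ATOMIC);

    /* free side: park the object in the cache instead of kfree() */
    io_alloc_cache_put(&ctx->my_cache, &obj->cache);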
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++++- io_uring/alloc_cache.h | 41 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.c | 8 +++---- io_uring/poll.c | 18 +++++---------- io_uring/poll.h | 9 ++++++-- 5 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 io_uring/alloc_cache.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 26ef11e978d403..b548da03b563cb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -158,6 +158,10 @@ struct io_ev_fd { struct rcu_head rcu; }; +struct io_alloc_cache { + struct hlist_head list; +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -216,7 +220,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; - struct list_head apoll_cache; + struct io_alloc_cache apoll_cache; struct xarray personalities; u32 pers_next; } ____cacheline_aligned_in_smp; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h new file mode 100644 index 00000000000000..98f2374c37c7bb --- /dev/null +++ b/io_uring/alloc_cache.h @@ -0,0 +1,41 @@ +#ifndef IOU_ALLOC_CACHE_H +#define IOU_ALLOC_CACHE_H + +struct io_cache_entry { + struct hlist_node node; +}; + +static inline void io_alloc_cache_put(struct io_alloc_cache *cache, + struct io_cache_entry *entry) +{ + hlist_add_head(&entry->node, &cache->list); +} + +static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +{ + if (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + return container_of(node, struct io_cache_entry, node); + } + + return NULL; +} + +static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +{ + INIT_HLIST_HEAD(&cache->list); +} + +static inline void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(struct io_cache_entry *)) +{ + while (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + free(container_of(node, struct io_cache_entry, node)); + } +} +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d1ce58b015e19..a360a3d390c6ba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "timeout.h" #include "poll.h" +#include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -295,7 +296,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_LIST_HEAD(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->apoll_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -1180,8 +1181,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); + io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -2467,7 +2467,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rings) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); - io_flush_apoll_cache(ctx); + io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/poll.c b/io_uring/poll.c index 
052fcb6472083a..dadd293749b07b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -590,16 +590,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); + (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) { + apoll = container_of(entry, struct async_poll, cache); } else { apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) @@ -960,14 +959,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void io_flush_apoll_cache(struct io_ring_ctx *ctx) +void io_apoll_cache_free(struct io_cache_entry *entry) { - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } + kfree(container_of(entry, struct async_poll, cache)); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 95f192c7babbd5..5f3bae50fc81a0 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "alloc_cache.h" + enum { IO_APOLL_OK, IO_APOLL_ABORTED, @@ -14,7 +16,10 @@ struct io_poll { }; struct async_poll { - struct io_poll poll; + union { + struct io_poll poll; + struct io_cache_entry cache; + }; struct io_poll *double_poll; }; @@ -31,4 +36,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_flush_apoll_cache(struct io_ring_ctx *ctx); +void io_apoll_cache_free(struct io_cache_entry *entry); From 9731bc9855dc169f27433fef3c4d0ff3496c512d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:20:54 -0600 Subject: [PATCH 161/172] io_uring: impose max limit on apoll cache Caches like this tend to grow to the peak size, and then never get any smaller. Impose a max limit on the size, to prevent it from growing too big. A somewhat randomly chosen 512 is the max size we'll allow the cache to get. If a batch of frees come in and would bring it over that, we simply start kfree'ing the surplus. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/alloc_cache.h | 16 ++++++++++++++-- io_uring/io_uring.c | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b548da03b563cb..bf8f95332edae0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -160,6 +160,7 @@ struct io_ev_fd { struct io_alloc_cache { struct hlist_head list; + unsigned int nr_cached; }; struct io_ring_ctx { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 98f2374c37c7bb..729793ae97127a 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,14 +1,24 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +/* + * Don't allow the cache to grow beyond this size. 
+ */ +#define IO_ALLOC_CACHE_MAX 512 + struct io_cache_entry { struct hlist_node node; }; -static inline void io_alloc_cache_put(struct io_alloc_cache *cache, +static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - hlist_add_head(&entry->node, &cache->list); + if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + cache->nr_cached++; + hlist_add_head(&entry->node, &cache->list); + return true; + } + return false; } static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) @@ -26,6 +36,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void io_alloc_cache_init(struct io_alloc_cache *cache) { INIT_HLIST_HEAD(&cache->list); + cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, @@ -37,5 +48,6 @@ static inline void io_alloc_cache_free(struct io_alloc_cache *cache, hlist_del(node); free(container_of(node, struct io_cache_entry, node)); } + cache->nr_cached = 0; } #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a360a3d390c6ba..c9c23e45976635 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1181,7 +1181,8 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); + if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + kfree(apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) From 43e0bbbd0b0e30d232fd8e9e908125b5c49a9fbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:30:09 -0600 Subject: [PATCH 162/172] io_uring: add netmsg cache For recvmsg/sendmsg, if they don't complete inline, we currently need to allocate a struct io_async_msghdr for each request. This is a somewhat large struct. Hook up sendmsg/recvmsg to use the io_alloc_cache. This reduces the alloc + free overhead considerably, yielding 4-5% of extra performance running netbench. 
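The cache is only safe to touch while holding ->uring_lock, so recycling is skipped for unlocked (io-wq) issue and the normal cleanup path frees the msghdr in that case. The recycle side, condensed from the hunk below:

    static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
    {
            struct io_async_msghdr *hdr = req->async_data;

            /* io-wq issue doesn't hold ->uring_lock, can't touch the cache */
            if (!hdr || issue_flags & IO_URING_F_UNLOCKED)
                    return;

            /* on success, the request no longer owns the async data */
            if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
                    req->async_data = NULL;
                    req->flags &= ~REQ_F_ASYNC_DATA;
            }
    }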
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++-- io_uring/io_uring.c | 3 ++ io_uring/net.c | 63 +++++++++++++++++++++++++++++----- io_uring/net.h | 13 ++++++- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bf8f95332edae0..d54b8b7e074629 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,8 +222,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; - struct xarray personalities; - u32 pers_next; + struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -241,6 +240,9 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; + struct xarray personalities; + u32 pers_next; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c9c23e45976635..f697ca4e8f558c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "kbuf.h" #include "rsrc.h" #include "cancel.h" +#include "net.h" #include "timeout.h" #include "poll.h" @@ -297,6 +298,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); io_alloc_cache_init(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->netmsg_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -2469,6 +2471,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/net.c b/io_uring/net.c index 6679069eeef162..185553174437f3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring.h" #include "kbuf.h" +#include "alloc_cache.h" #include "net.h" #if defined(CONFIG_NET) @@ -97,18 +98,55 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_msghdr *hdr = req->async_data; + + if (!hdr || issue_flags & IO_URING_F_UNLOCKED) + return; + + /* Let normal cleanup path reap it if we fail adding to the cache */ + if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { + struct io_async_msghdr *hdr; + + hdr = container_of(entry, struct io_async_msghdr, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; + } + + if (!io_alloc_async_data(req)) + return req->async_data; + + return NULL; +} + static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_async_msghdr *async_msg = req->async_data; if (async_msg) return 
-EAGAIN; - if (io_alloc_async_data(req)) { + async_msg = io_recvmsg_alloc_async(req, issue_flags); + if (!async_msg) { kfree(kmsg->free_iov); return -ENOMEM; } - async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; memcpy(async_msg, kmsg, sizeof(*kmsg)); async_msg->msg.msg_name = &async_msg->addr; @@ -195,7 +233,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -207,13 +245,13 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } @@ -221,6 +259,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -495,7 +534,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (io_do_buffer_select(req)) { void __user *buf; @@ -519,13 +558,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { @@ -535,6 +574,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); + io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; @@ -848,4 +888,9 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_netmsg_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_async_msghdr, cache)); +} #endif diff --git a/io_uring/net.h b/io_uring/net.h index 81d71d1647704d..178a6d8b76e0a8 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,9 +3,14 @@ #include #include +#include "alloc_cache.h" + #if defined(CONFIG_NET) struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; + union { + struct iovec fast_iov[UIO_FASTIOV]; + struct io_cache_entry cache; + }; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; struct sockaddr __user *uaddr; @@ -40,4 +45,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags); int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, 
const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); + +void io_netmsg_cache_free(struct io_cache_entry *entry); +#else +static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +{ +} #endif From e2df2ccb753e3b598a9045760e79f1847f7b31cd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:35 -0700 Subject: [PATCH 163/172] io_uring: fix multishot ending when not polled If multishot is not actually polling then return IOU_OK rather than the result. If the result was > 0 this will confuse things further up the callstack which expect a return <= 0. Fixes: 1300ebb20286 ("io_uring: multishot recv") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 185553174437f3..eb939899e9c5a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -506,6 +506,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c if (req->flags & REQ_F_POLLED) *ret = IOU_STOP_MULTISHOT; + else + *ret = IOU_OK; return true; } From 6d2f75a0cf3048ef38c48493f66a923353bf664b Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:36 -0700 Subject: [PATCH 164/172] io_uring: support 0 length iov in buffer select in compat Match up work done in "io_uring: allow iov_len = 0 for recvmsg and buffer select", but for compat code path. Fixes: a68caad69ce5 ("io_uring: allow iov_len = 0 for recvmsg and buffer select") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index eb939899e9c5a0..dc9190eafbe703 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,16 +382,21 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) + if (len == 0) { + sr->len = 0; + iomsg->free_iov = NULL; + } else if (len > 1) { return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; + } else { + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, len, From 7fa875b8e53c288d616234b9daf417b0650ce1cc Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:56 -0700 Subject: [PATCH 165/172] net: copy from user before calling __copy_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. 
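The expected calling pattern then becomes (a sketch: the caller copies the header once and passes the kernel copy around, keeping the iov fields visible for its own use):

    struct user_msghdr msg;

    if (copy_from_user(&msg, umsg, sizeof(msg)))
            return -EFAULT;

    ret = __copy_msghdr(kmsg, &msg, save_addr);
    if (ret)
            return ret;

    /* msg.msg_iov and msg.msg_iovlen remain available to the caller */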
Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-2-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/socket.h | 7 +++---- io_uring/net.c | 17 +++++++++-------- net/socket.c | 37 ++++++++++++++++--------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 17311ad9f9af24..be24f1c8568a38 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); -extern int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs); +extern int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *umsg, + struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/io_uring/net.c b/io_uring/net.c index dc9190eafbe703..da7667ed36106d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -329,31 +329,32 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; + struct user_msghdr msg; int ret; - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); + if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) + return -EFAULT; + + ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len == 0) { + if (msg.msg_iovlen == 0) { sr->len = iomsg->fast_iov[0].iov_len = 0; iomsg->fast_iov[0].iov_base = NULL; iomsg->free_iov = NULL; - } else if (iov_len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) return -EFAULT; sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, false); if (ret > 0) diff --git a/net/socket.c b/net/socket.c index 96300cdc06251f..843545c21ec235 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2358,25 +2358,20 @@ struct used_address { unsigned int name_len; }; -int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs) +int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *msg, + struct sockaddr __user **save_addr) { - struct user_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; - kmsg->msg_control_user = msg.msg_control; - kmsg->msg_controllen = msg.msg_controllen; - kmsg->msg_flags = msg.msg_flags; + kmsg->msg_control_user = msg->msg_control; + kmsg->msg_controllen = msg->msg_controllen; + kmsg->msg_flags = msg->msg_flags; - kmsg->msg_namelen = msg.msg_namelen; - if (!msg.msg_name) + kmsg->msg_namelen = msg->msg_namelen; + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, 
kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) - *save_addr = msg.msg_name; + *save_addr = msg->msg_name; - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(msg.msg_name, + err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - *uiov = msg.msg_iov; - *nsegs = msg.msg_iovlen; return 0; } @@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr msg; ssize_t err; - err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov, - &msg.msg_iovlen); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; From 72c531f8ef3052c682d39dc21dcb5576afda208c Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:57 -0700 Subject: [PATCH 166/172] net: copy from user before calling __get_compat_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-3-dylany@fb.com Signed-off-by: Jens Axboe --- include/net/compat.h | 5 ++--- io_uring/net.c | 17 +++++++++-------- net/compat.c | 39 +++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/net/compat.h b/include/net/compat.h index 595fee069b8250..84c163f40f38a5 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -46,9 +46,8 @@ struct compat_rtentry { unsigned short rt_irtt; /* Initial RTT */ }; -int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, compat_uptr_t *ptr, - compat_size_t *len); +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg, + struct sockaddr __user **save_addr); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); int put_cmsg_compat(struct msghdr*, int, int, int, void *); diff --git a/io_uring/net.c b/io_uring/net.c index da7667ed36106d..5bc3440a829012 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -369,24 +369,25 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_msghdr msg; struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; int ret; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); + if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) + return -EFAULT; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); if (ret) return ret; - uiov = compat_ptr(ptr); + uiov = compat_ptr(msg.msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len == 0) { + if (msg.msg_iovlen == 0) { sr->len = 0; iomsg->free_iov = NULL; - } else if (len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { if (!access_ok(uiov, sizeof(*uiov))) @@ -400,7 +401,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, true); if (ret < 0) diff --git a/net/compat.c b/net/compat.c index 210fc3b4d0d833..513aa9a3fc6466 100644 --- a/net/compat.c +++ b/net/compat.c @@ -34,20 +34,15 @@ #include int __get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - compat_uptr_t *ptr, compat_size_t *len) + struct compat_msghdr *msg, + struct sockaddr __user **save_addr) { - struct compat_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - - kmsg->msg_flags = msg.msg_flags; - kmsg->msg_namelen = msg.msg_namelen; + kmsg->msg_flags = msg->msg_flags; + kmsg->msg_namelen = msg->msg_namelen; - if (!msg.msg_name) + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control_is_user = true; - kmsg->msg_control_user = compat_ptr(msg.msg_control); - kmsg->msg_controllen = msg.msg_controllen; + kmsg->msg_control_user = compat_ptr(msg->msg_control); + kmsg->msg_controllen = msg->msg_controllen; if (save_addr) - *save_addr = compat_ptr(msg.msg_name); + *save_addr = compat_ptr(msg->msg_name); - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(compat_ptr(msg.msg_name), + err = move_addr_to_kernel(compat_ptr(msg->msg_name), kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - *ptr = msg.msg_iov; - *len = msg.msg_iovlen; return 0; } @@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg, struct sockaddr __user **save_addr, struct iovec **iov) { - compat_uptr_t ptr; - compat_size_t len; + struct compat_msghdr msg; ssize_t err; - err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __get_compat_msghdr(kmsg, umsg, save_addr); if (err) return err; - err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len, + err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } From 9bb66906f23e50d6db1e11f7498b72dfca1982a2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:58 -0700 Subject: [PATCH 167/172] io_uring: support multishot in recvmsg Similar to multishot recv, this will require provided buffers to be used. However recvmsg is much more complex than recv as it has multiple outputs. Specifically flags, name, and control messages. Support this by introducing a new struct io_uring_recvmsg_out with 4 fields. namelen, controllen and flags match the similar out fields in msghdr from standard recvmsg(2), payloadlen is the length of the payload following the header. This struct is placed at the start of the returned buffer. Based on what the user specifies in struct msghdr, the next bytes of the buffer will be name (the next msg_namelen bytes), and then control (the next msg_controllen bytes). The payload will come at the end. The return value in the CQE is the total used size of the provided buffer. 
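From the consumer side, the returned buffer would be carved up along these lines (a rough sketch; buf is assumed to be the provided buffer picked for the CQE, msg the struct msghdr originally submitted):

    struct io_uring_recvmsg_out *out = (struct io_uring_recvmsg_out *) buf;
    /* name area is sized by the *requested* msg_namelen... */
    void *name = (char *) buf + sizeof(*out);
    /* ...control follows, sized by the requested msg_controllen... */
    void *control = (char *) name + msg.msg_namelen;
    /* ...and the payload comes last, out->payloadlen bytes of it */
    void *payload = (char *) control + msg.msg_controllen;

    /*
     * out->namelen and out->controllen report the lengths actually
     * produced, which may be smaller than the reserved areas.
     */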
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-4-dylany@fb.com [axboe: style fixups, see link] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++ io_uring/net.c | 180 ++++++++++++++++++++++++++++++---- io_uring/net.h | 6 ++ 3 files changed, 174 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 499679134961b2..4c9b11e2e99158 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -613,4 +613,11 @@ struct io_uring_file_index_range { __u64 resv; }; +struct io_uring_recvmsg_out { + __u32 namelen; + __u32 controllen; + __u32 payloadlen; + __u32 flags; +}; + #endif diff --git a/io_uring/net.c b/io_uring/net.c index 5bc3440a829012..616d5f04cc7435 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -325,6 +325,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) +{ + unsigned long hdr; + + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + (unsigned long)iomsg->namelen, &hdr)) + return true; + if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + return true; + if (hdr > INT_MAX) + return true; + + return false; +} + static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { @@ -352,6 +367,13 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, @@ -399,6 +421,13 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = clen; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, @@ -455,8 +484,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { - if (req->opcode == IORING_OP_RECVMSG) - return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -483,12 +510,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv + * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). 
*/ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, + unsigned int cflags, bool mshot_finished) { if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); @@ -496,7 +524,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } - if (*ret > 0) { + if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, false)) { io_recv_prep_retry(req); @@ -518,6 +546,90 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } +static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, + struct io_sr_msg *sr, void __user **buf, + size_t *len) +{ + unsigned long ubuf = (unsigned long) *buf; + unsigned long hdr; + + hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen; + if (*len < hdr) + return -EFAULT; + + if (kmsg->controllen) { + unsigned long control = ubuf + hdr - kmsg->controllen; + + kmsg->msg.msg_control_user = (void *) control; + kmsg->msg.msg_controllen = kmsg->controllen; + } + + sr->buf = *buf; /* stash for later copy */ + *buf = (void *) (ubuf + hdr); + kmsg->payloadlen = *len = *len - hdr; + return 0; +} + +struct io_recvmsg_multishot_hdr { + struct io_uring_recvmsg_out msg; + struct sockaddr_storage addr; +}; + +static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, + struct io_async_msghdr *kmsg, + unsigned int flags, bool *finished) +{ + int err; + int copy_len; + struct io_recvmsg_multishot_hdr hdr; + + if (kmsg->namelen) + kmsg->msg.msg_name = &hdr.addr; + kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); + kmsg->msg.msg_namelen = 0; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + + err = sock_recvmsg(sock, &kmsg->msg, flags); + *finished = err <= 0; + if (err < 0) + return err; + + hdr.msg = (struct io_uring_recvmsg_out) { + .controllen = kmsg->controllen - kmsg->msg.msg_controllen, + .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT + }; + + hdr.msg.payloadlen = err; + if (err > kmsg->payloadlen) + err = kmsg->payloadlen; + + copy_len = sizeof(struct io_uring_recvmsg_out); + if (kmsg->msg.msg_namelen > kmsg->namelen) + copy_len += kmsg->namelen; + else + copy_len += kmsg->msg.msg_namelen; + + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + hdr.msg.namelen = kmsg->msg.msg_namelen; + + /* ensure that there is no gap between hdr and sockaddr_storage */ + BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != + sizeof(struct io_uring_recvmsg_out)); + if (copy_to_user(io->buf, &hdr, copy_len)) { + *finished = true; + return -EFAULT; + } + + return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen + err; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -527,6 +639,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool mshot_finished = true; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -545,16 +658,27 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; + size_t len = sr->len; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + if (ret) { + io_kbuf_recycle(req, issue_flags); + return ret; + } + } + kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -564,10 +688,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (req->flags & REQ_F_APOLL_MULTISHOT) + ret = io_recvmsg_multishot(sock, sr, kmsg, flags, + &mshot_finished); + else + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, + kmsg->uaddr, flags); + if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -580,11 +717,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -596,8 +728,18 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + if (!io_recv_finish(req, &ret, cflags, mshot_finished)) + goto retry_multishot; + + if (mshot_finished) { + io_netmsg_recycle(req, issue_flags); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + } + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -684,7 +826,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if 
(!io_recv_finish(req, &ret, cflags)) + if (!io_recv_finish(req, &ret, cflags, ret <= 0)) goto retry_multishot; return ret; diff --git a/io_uring/net.h b/io_uring/net.h index 178a6d8b76e0a8..db20ce9d6546d4 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -9,6 +9,12 @@ struct io_async_msghdr { union { struct iovec fast_iov[UIO_FASTIOV]; + struct { + struct iovec fast_iov_one; + __kernel_size_t controllen; + int namelen; + __kernel_size_t payloadlen; + }; struct io_cache_entry cache; }; /* points to an allocated iov, if NULL we use fast_iov instead */ From 4ccc6db0900fe337212b61650663a5dcedb69f25 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:33:01 +0200 Subject: [PATCH 168/172] io_uring: Use atomic_long_try_cmpxchg in __io_account_mem Use atomic_long_try_cmpxchg() instead of the atomic_long_cmpxchg(*ptr, old, new) == old pattern in __io_account_mem(). The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, atomic_long_try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0250c13ae1cdd0..7f66b0e2567432 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -56,14 +56,13 @@ static inline int __io_account_mem(struct user_struct *user, /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur_pages = atomic_long_read(&user->locked_vm); do { - cur_pages = atomic_long_read(&user->locked_vm); new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - + } while (!atomic_long_try_cmpxchg(&user->locked_vm, + &cur_pages, new_pages)); return 0; } From 9b0fc3c054ff2eb13753104884f1045b5bb3a627 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 15 Jul 2022 06:02:52 -0700 Subject: [PATCH 169/172] io_uring: fix types in io_recvmsg_multishot_overflow io_recvmsg_multishot_overflow() had incorrect types on non-x64 systems. It also had an unnecessary INT_MAX check, which can be dropped by changing the type of the accumulator to int (also simplifying the casts).
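With an int accumulator, the two check_add_overflow() calls subsume the INT_MAX test. The resulting shape, as in the diff below but with the reasoning spelled out in comments:

    int hdr;

    if (iomsg->namelen < 0)
            return true;    /* a negative name length can never be valid */
    if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
                           iomsg->namelen, &hdr))
            return true;    /* header plus namelen overflowed int */
    if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
            return true;    /* adding controllen overflowed int */
    /* here hdr <= INT_MAX by construction, no explicit check needed */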
Reported-by: Stephen Rothwell Fixes: a8b38c4ce724 ("io_uring: support multishot in recvmsg") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220715130252.610639-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 616d5f04cc7435..6b7d5f33e642a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -327,14 +327,14 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) { - unsigned long hdr; + int hdr; - if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), - (unsigned long)iomsg->namelen, &hdr)) + if (iomsg->namelen < 0) return true; - if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), + iomsg->namelen, &hdr)) return true; - if (hdr > INT_MAX) + if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) return true; return false; From 48904229928d941ce1db181b991948387ab463cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 15 Jul 2022 19:45:01 +0200 Subject: [PATCH 170/172] io_uring: Don't require reinitable percpu_ref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") removed the workflow relying on reinit/resurrection of the percpu_ref, hence requesting that capability at initialization is a relic. This is based on code review; it causes no real bug (and theoretically can't). Technically it's a revert of commit 214828962dea ("io_uring: initialize percpu refcounters using PERCU_REF_ALLOW_REINIT") but since the flag omission is now justified, I'm not making this a revert. Fixes: 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") Signed-off-by: Michal Koutný Acked-by: Roman Gushchin Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f697ca4e8f558c..624535c625652f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->dummy_ubuf->ubuf = -1UL; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; From 4f6a94d337408f23e199eee7d35ed6edc201df0c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Jul 2022 15:54:47 -0600 Subject: [PATCH 171/172] net: fix compat pointer in get_compat_msghdr() A previous change enabled external users to copy the data before calling __get_compat_msghdr(), but didn't modify get_compat_msghdr() or __io_compat_recvmsg_copy_hdr() to take that into account. They are both still passing in the __user pointer rather than the copied version. Ensure we pass in the kernel struct, not the pointer to the user data.
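Before and after, for the io_uring caller (condensed from the diff below):

    /* before: the __user pointer slipped through to __get_compat_msghdr() */
    ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr);

    /* after: the struct already copied from userspace is passed instead */
    if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
            return -EFAULT;
    ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);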
Link: https://lore.kernel.org/all/46439555-644d-08a1-7d66-16f8f9a320f0@samsung.com/ Fixes: 1a3e4e94a1b9 ("net: copy from user before calling __get_compat_msghdr") Reported-by: Marek Szyprowski Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- net/compat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 6b7d5f33e642a0..e61efa31c729c6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -398,7 +398,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) return -EFAULT; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); + ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; diff --git a/net/compat.c b/net/compat.c index 513aa9a3fc6466..ed880729d159bd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -89,7 +89,7 @@ int get_compat_msghdr(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - err = __get_compat_msghdr(kmsg, umsg, save_addr); + err = __get_compat_msghdr(kmsg, &msg, save_addr); if (err) return err; From f6b543fd03d347e8bf245cee4f2d54eb6ffd8fcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Jul 2022 09:06:47 -0600 Subject: [PATCH 172/172] io_uring: ensure REQ_F_ISREG is set async offload If we're offloading requests directly to io-wq because IOSQE_ASYNC was set in the sqe, we can miss hashing writes appropriately because we haven't set REQ_F_ISREG yet. This can cause a performance regression with buffered writes, as io-wq then no longer correctly serializes writes to that file. Ensure that we set the flags in io_prep_async_work(), which will cause the io-wq work item to be hashed appropriately. Fixes: 584b0180f0f4 ("io_uring: move read/write file prep state into actual opcode handler") Link: https://lore.kernel.org/io-uring/20220608080054.GB22428@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Yin Fengwei Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +++ io_uring/io_uring.h | 5 +++++ io_uring/rw.c | 5 ----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 624535c625652f..4b3fd645d0231a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -406,6 +406,9 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; + if (req->file && !io_req_ffs_set(req)) + req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 868f45d55543bc..5db0a60dc04efc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -41,6 +41,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return req->flags & REQ_F_FIXED_FILE; +} + bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); diff --git a/io_uring/rw.c b/io_uring/rw.c index ade3e235f2770c..c50a0d66d67a66 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -647,11 +647,6 @@ static bool need_read_all(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static inline bool io_req_ffs_set(struct io_kiocb 
*req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { struct io_rw *rw = io_kiocb_to_cmd(req);