From 04491732fc996305e1de80255d64ed6d1c472df5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Mar 2025 15:02:20 +0000 Subject: [PATCH 01/44] io_uring/net: account memory for zc sendmsg Account pinned pages for IORING_OP_SENDMSG_ZC, just as we do for IORING_OP_SEND_ZC and net/ does for MSG_ZEROCOPY. Fixes: 493108d95f146 ("io_uring/net: zerocopy sendmsg") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4f00f67ca6ac8e8ed62343ae92b5816b1e0c9c4b.1743086313.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 8944eb679024..228b4f13d34c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1303,6 +1303,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; + int ret; zc->done_io = 0; zc->retry = false; @@ -1355,7 +1356,16 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); } - return io_sendmsg_zc_setup(req, sqe); + ret = io_sendmsg_zc_setup(req, sqe); + if (unlikely(ret)) + return ret; + + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { + struct io_async_msghdr *iomsg = req->async_data; + + return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count); + } + return 0; } static int io_sg_from_iter_iovec(struct sk_buff *skb, From 8741d0737921ec1c03cf59aebf4d01400c2b461a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:10 +0800 Subject: [PATCH 02/44] ublk: make sure ubq->canceling is set when queue is frozen Now the ublk driver depends on `ubq->canceling` to decide if a request can be dispatched via uring_cmd & io_uring_cmd_complete_in_task(). Once ubq->canceling is set, the uring_cmd can be completed via ublk_cancel_cmd() and io_uring_cmd_done(). So set ubq->canceling while the queue is frozen; this makes sure that the flag is observed reliably from ublk_queue_rq(), and avoids a use-after-free on the uring_cmd.
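The intended ordering can be sketched as follows (a simplified sketch only; the real code is ublk_abort_requests() in the diff below):

	blk_mq_quiesce_queue(disk->queue);	/* serializes against in-flight ublk_queue_rq() */
	ublk_mark_queue_canceling(ubq);		/* flips ->canceling under ->cancel_lock */
	blk_mq_unquiesce_queue(disk->queue);	/* any later ublk_queue_rq() observes the flag */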
Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 41 +++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c060da409ed8..fbcb7c2ff851 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1446,18 +1446,28 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) } } -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) +/* Must be called when queue is frozen */ +static bool ublk_mark_queue_canceling(struct ublk_queue *ubq) { - struct gendisk *disk; + bool canceled; spin_lock(&ubq->cancel_lock); - if (ubq->canceling) { - spin_unlock(&ubq->cancel_lock); - return false; - } - ubq->canceling = true; + canceled = ubq->canceling; + if (!canceled) + ubq->canceling = true; spin_unlock(&ubq->cancel_lock); + return canceled; +} + +static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) +{ + bool was_canceled = ubq->canceling; + struct gendisk *disk; + + if (was_canceled) + return false; + spin_lock(&ub->lock); disk = ub->ub_disk; if (disk) @@ -1468,14 +1478,23 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) if (!disk) return false; - /* Now we are serialized with ublk_queue_rq() */ + /* + * Now we are serialized with ublk_queue_rq() + * + * Make sure that ubq->canceling is set when queue is frozen, + * because ublk_queue_rq() has to rely on this flag for avoiding to + * touch completed uring_cmd + */ blk_mq_quiesce_queue(disk->queue); - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); + was_canceled = ublk_mark_queue_canceling(ubq); + if (!was_canceled) { + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } blk_mq_unquiesce_queue(disk->queue); put_device(disk_to_dev(disk)); - return true; + return !was_canceled; } static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, From 7e2fe01a69f6be3e284b38cfd2e4e0598a3b0a8f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:11 +0800 Subject: [PATCH 03/44] ublk: comment on ubq->canceling handling in ublk_queue_rq() In ublk_queue_rq(), ubq->canceling has to be handled after ->fail_io and ->force_abort are dealt with, otherwise the request may not be failed when deleting disk. Add comment on this usage. 
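For reference, the resulting order of checks in ublk_queue_rq(), abridged from the code in this series (a simplified sketch, not the verbatim function):

	if (unlikely(ubq->fail_io))
		return BLK_STS_TARGET;
	if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
		return BLK_STS_IOERR;
	/* only now may ->canceling terminate the request */
	if (unlikely(ubq->canceling)) {
		__ublk_abort_rq(ubq, rq);
		return BLK_STS_OK;
	}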
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index fbcb7c2ff851..5b0c885dc38f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1310,6 +1310,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; + /* + * ->canceling has to be handled after ->force_abort and ->fail_io + * is dealt with, otherwise this request may not be failed in case + * of recovery, and cause hang when deleting disk + */ if (unlikely(ubq->canceling)) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; From 705b80841eda212df79a43371f5ccb3bcadbb893 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:12 +0800 Subject: [PATCH 04/44] ublk: remove two unused fields from 'struct ublk_queue' Remove two unused fields(`io_addr` & `max_io_sz`) from `struct ublk_queue`. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5b0c885dc38f..b60f4bd647a1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -143,8 +143,6 @@ struct ublk_queue { struct task_struct *ubq_daemon; char *io_cmd_buf; - unsigned long io_addr; /* mapped vm address */ - unsigned int max_io_sz; bool force_abort; bool timeout; bool canceling; From 1d781c0de08c0b35948ad4aaf609a4cc9995d9f6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:13 +0800 Subject: [PATCH 05/44] ublk: add helper of ublk_need_map_io() ublk_need_map_io() is more readable. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b60f4bd647a1..200504dd67a9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -596,6 +596,11 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } +static inline bool ublk_need_map_io(const struct ublk_queue *ubq) +{ + return !ublk_support_user_copy(ubq); +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { /* @@ -923,7 +928,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; /* @@ -947,7 +952,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; if (ublk_need_unmap_req(req)) { @@ -1867,7 +1872,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * FETCH_RQ has to provide IO buffer if NEED GET * DATA is not enabled */ @@ -1889,7 +1894,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. From b460f328e257db6af0d127fc8a2437f64ad01d3a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:14 +0800 Subject: [PATCH 06/44] ublk: call io_uring_cmd_to_pdu to get uring_cmd pdu Call io_uring_cmd_to_pdu() to get the uring_cmd pdu; one big benefit is the automatic pdu size build-time check. Suggested-by: Uday Shankar Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250327095123.179113-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 200504dd67a9..1e11816d0b90 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1040,7 +1040,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( struct io_uring_cmd *ioucmd) { - return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; + return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) From ebf695f129367ed4b26df6baec2ea7fc50c9e6f0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:15 +0800 Subject: [PATCH 07/44] ublk: add segment parameter IO split is usually bad in the io_uring world, since it causes -EAGAIN and IO handling may have to fall back to io-wq, which hurts performance. ublk has recently started to support zero copy; to avoid unnecessary IO splits, the ublk driver's segment limit should be aligned with the backend device's segment limit.
Another reason is that io_buffer_register_bvec() needs to allocate bvecs, whose number is aligned with the ublk request's segment count, so a big memory allocation can be avoided by setting a reasonable max_segments limit. So add segment parameters to give the ublk server a chance to align its segment limit with the backend's, and to keep it reasonable from an implementation viewpoint. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 20 +++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 25 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1e11816d0b90..a5bcf3aa9d8c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -74,7 +74,7 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) struct ublk_rq_data { struct kref ref; @@ -580,6 +580,18 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + const struct ublk_param_segment *p = &ub->params.seg; + + if (!is_power_of_2(p->seg_boundary_mask + 1)) + return -EINVAL; + + if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + } + return 0; } @@ -2370,6 +2382,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) lim.dma_alignment = ub->params.dma.alignment; + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; + lim.max_segment_size = ub->params.seg.max_segment_size; + lim.max_segments = ub->params.seg.max_segments; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 7255b36b5cf6..583b86681c93 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -410,6 +410,29 @@ struct ublk_param_dma_align { __u8 pad[4]; }; +#define UBLK_MIN_SEGMENT_SIZE 4096 +/* + * If any one of the three segment parameters is set to 0, the behavior is + * undefined. + */ +struct ublk_param_segment { + /* + * seg_boundary_mask + 1 needs to be power_of_2(), and the sum has + * to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u64 seg_boundary_mask; + + /* + * max_segment_size could be overridden by virt_boundary_mask, so be + * careful when setting both.
+ * + * max_segment_size has to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u32 max_segment_size; + __u16 max_segments; + __u8 pad[2]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -423,6 +446,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) +#define UBLK_PARAM_TYPE_SEGMENT (1 << 5) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -430,6 +454,7 @@ struct ublk_params { struct ublk_param_devt devt; struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; + struct ublk_param_segment seg; }; #endif From 17970209167d521da2f48d45a4242a57fd39d223 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:16 +0800 Subject: [PATCH 08/44] ublk: document zero copy feature Add words to explain how zero copy feature works, and why it has to be trusted for handling IO read command. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 1e0e7358e14a..74c57488dc9a 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -309,18 +309,35 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. -Future development -================== - Zero copy --------- -Zero copy is a generic requirement for nbd, fuse or similar drivers. A -problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace -can't be remapped any more in kernel with existing mm interfaces. This can -occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that -big requests (IO size >= 256 KB) may benefit a lot from zero copy. +ublk zero copy relies on io_uring's fixed kernel buffer, which provides +two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`. +ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call +`io_buffer_register_bvec()` for ublk server to register client request +buffer into io_uring buffer table, then ublk server can submit io_uring +IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF` +calls `io_buffer_unregister_bvec()` to unregister the buffer, which is +guaranteed to be live between calling `io_buffer_register_bvec()` and +`io_buffer_unregister_bvec()`. Any io_uring operation which supports this +kind of kernel buffer will grab one reference of the buffer until the +operation is completed. + +ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and +be trusted, because it is ublk server's responsibility to make sure IO buffer +filled with data for handling read command, and ublk server has to return +correct result to ublk driver when handling READ command, and the result +has to match with how many bytes filled to the IO buffer. Otherwise, +uninitialized kernel IO buffer will be exposed to client application. + +ublk server needs to align the parameter of `struct ublk_param_dma_align` +with backend for zero copy to work correctly. + +For reaching best IO performance, ublk server should align its segment +parameter of `struct ublk_param_segment` with backend for avoiding +unnecessary IO split, which usually hurts io_uring performance. 
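A typical zero-copy read served by a ublk server then follows this rough sequence (an illustrative sketch only; the exact SQE encoding of these commands is defined in `ublk_cmd.h` and is not spelled out here):

1. fetch a client request via `UBLK_IO_FETCH_REQ` / `UBLK_IO_COMMIT_AND_FETCH_REQ`
2. issue `UBLK_IO_REGISTER_IO_BUF`, which makes the driver call `io_buffer_register_bvec()` and publish the client request pages at the chosen buffer index
3. submit the backend IO (e.g. a fixed-buffer read) against that registered index
4. issue `UBLK_IO_UNREGISTER_IO_BUF` for the same index
5. commit the result via `UBLK_IO_COMMIT_AND_FETCH_REQ`, reporting exactly how many bytes were filled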
References ========== From d796cea7b9f33b6315362f504b15fcc26d678493 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:17 +0800 Subject: [PATCH 09/44] ublk: implement ->queue_rqs() Implement ->queue_rqs() for improving perf in case of MQ. In this way, we just need to call io_uring_cmd_complete_in_task() once for whole IO batch, then both io_uring and ublk server can get exact batch from ublk frontend. Follows IOPS improvement: - tests tools/testing/selftests/ublk/kublk add -t null -q 2 [-z] fio/t/io_uring -p0 /dev/ublkb0 - results: more than 10% IOPS boost observed Pass all ublk selftests, especially the io dispatch order test. Cc: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 131 +++++++++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 20 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a5bcf3aa9d8c..f97919460515 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -81,6 +81,20 @@ struct ublk_rq_data { }; struct ublk_uring_cmd_pdu { + /* + * Store requests in same batch temporarily for queuing them to + * daemon context. + * + * It should have been stored to request payload, but we do want + * to avoid extra pre-allocation, and uring_cmd payload is always + * free for us + */ + struct request *req_list; + + /* + * The following two are valid in this cmd whole lifetime, and + * setup in ublk uring_cmd handler + */ struct ublk_queue *ubq; u16 tag; }; @@ -1170,14 +1184,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_dispatch_req(struct ublk_queue *ubq, + struct io_uring_cmd *cmd, + struct request *req, + unsigned int issue_flags) { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - struct ublk_queue *ubq = pdu->ubq; - int tag = pdu->tag; - struct request *req = blk_mq_tag_to_rq( - ubq->dev->tag_set.tags[ubq->q_id], tag); + int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; unsigned int mapped_bytes; @@ -1252,6 +1264,18 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + int tag = pdu->tag; + struct request *req = blk_mq_tag_to_rq( + ubq->dev->tag_set.tags[ubq->q_id], tag); + + ublk_dispatch_req(ubq, cmd, req, issue_flags); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct ublk_io *io = &ubq->ios[rq->tag]; @@ -1259,6 +1283,35 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); } +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct request *rq = pdu->req_list; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct request *next; + + while (rq) { + struct ublk_io *io = &ubq->ios[rq->tag]; + + next = rq->rq_next; + rq->rq_next = NULL; + ublk_dispatch_req(ubq, io->cmd, rq, issue_flags); + rq = next; + } +} + +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + struct request *rq = 
rq_list_peek(l); + struct ublk_io *io = &ubq->ios[rq->tag]; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd); + + pdu->req_list = rq; + rq_list_init(l); + io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb); +} + static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; @@ -1297,21 +1350,12 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_RESET_TIMER; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; - if (unlikely(ubq->fail_io)) { + if (unlikely(ubq->fail_io)) return BLK_STS_TARGET; - } - - /* fill iod to slot in io cmd buffer */ - res = ublk_setup_iod(ubq, rq); - if (unlikely(res != BLK_STS_OK)) - return BLK_STS_IOERR; /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to @@ -1325,6 +1369,29 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; + if (unlikely(ubq->canceling)) + return BLK_STS_IOERR; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + blk_mq_start_request(rq); + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; + + res = ublk_prep_req(ubq, rq); + if (res != BLK_STS_OK) + return res; + /* * ->canceling has to be handled after ->force_abort and ->fail_io * is dealt with, otherwise this request may not be failed in case @@ -1335,12 +1402,35 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } - blk_mq_start_request(bd->rq); ublk_queue_cmd(ubq, rq); - return BLK_STS_OK; } +static void ublk_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + + if (ublk_prep_req(ubq, req) == BLK_STS_OK) + rq_list_add_tail(&submit_list, req); + else + rq_list_add_tail(&requeue_list, req); + } + + if (ubq && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1353,6 +1443,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_queue_rqs, .init_hctx = ublk_init_hctx, .timeout = ublk_timeout, }; From daabfb50a56b11a4f15d2bdbfae129e61c08c0ac Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:18 +0800 Subject: [PATCH 10/44] ublk: rename ublk_rq_task_work_cb as ublk_cmd_tw_cb The new name is aligned with ublk_cmd_list_tw_cb(), and looks more readable. 
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-10-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f97919460515..355a59c78539 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1264,8 +1264,8 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; @@ -1280,7 +1280,7 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct ublk_io *io = &ubq->ios[rq->tag]; - io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); + io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_tw_cb); } static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, From 8c778614361f288ef552fd6a52a17460a45b2f4f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:19 +0800 Subject: [PATCH 11/44] selftests: ublk: add more tests for covering MQ Add test test_generic_02.sh for covering IO dispatch order in case of MQ. Especially we just support ->queue_rqs() which may affect IO dispatch order. Add test_loop_05.sh and test_stripe_03.sh for covering MQ. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-11-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 3 ++ tools/testing/selftests/ublk/test_common.sh | 6 +++ .../testing/selftests/ublk/test_generic_02.sh | 44 +++++++++++++++++++ tools/testing/selftests/ublk/test_loop_01.sh | 14 +++--- tools/testing/selftests/ublk/test_loop_03.sh | 14 +++--- tools/testing/selftests/ublk/test_loop_05.sh | 28 ++++++++++++ .../testing/selftests/ublk/test_stress_01.sh | 6 +-- .../testing/selftests/ublk/test_stress_02.sh | 6 +-- .../testing/selftests/ublk/test_stripe_01.sh | 14 +++--- .../testing/selftests/ublk/test_stripe_03.sh | 30 +++++++++++++ 10 files changed, 132 insertions(+), 33 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_generic_02.sh create mode 100755 tools/testing/selftests/ublk/test_loop_05.sh create mode 100755 tools/testing/selftests/ublk/test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 7817afe29005..7a8c994de244 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -4,6 +4,7 @@ CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh +TEST_PROGS += test_generic_02.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh @@ -11,8 +12,10 @@ TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_loop_05.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh +TEST_PROGS += test_stripe_03.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 75f54ac6b1c4..a88b35943227 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -23,6 +23,12 @@ _get_disk_dev_t() { echo $(( (major & 0xfff) << 20 | 
(minor & 0xfffff) )) } +_run_fio_verify_io() { + fio --name=verify --rw=randwrite --direct=1 --ioengine=libaio \ + --bs=8k --iodepth=32 --verify=crc32c --do_verify=1 \ + --verify_state_save=0 "$@" > /dev/null +} + _create_backfile() { local my_size=$1 local my_file diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh new file mode 100755 index 000000000000..3e80121e3bf5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_02" +ERR_CODE=0 + +if ! _have_program bpftrace; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "null" "sequential io order for MQ" + +dev_id=$(_add_ublk_dev -t null -q 2) +_check_add_dev $TID $? + +dev_t=$(_get_disk_dev_t "$dev_id") +bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & +btrace_pid=$! +sleep 2 + +if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# run fio over this ublk disk +fio --name=write_seq \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? +kill "$btrace_pid" +wait +if grep -q "io_out_of_order" "$UBLK_TMP"; then + cat "$UBLK_TMP" + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index c882d2a08e13..1ef8b6044777 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -6,6 +6,10 @@ TID="loop_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "loop" "write and verify test" backfile_0=$(_create_backfile 256M) @@ -14,15 +18,7 @@ dev_id=$(_add_ublk_dev -t loop "$backfile_0") _check_add_dev $TID $? "${backfile_0}" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=256M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M ERR_CODE=$? _cleanup_test "loop" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index 269c96787d7d..e9ca744de8b1 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -6,6 +6,10 @@ TID="loop_03" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "loop" "write and verify over zero copy" backfile_0=$(_create_backfile 256M) @@ -13,15 +17,7 @@ dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") _check_add_dev $TID $? "$backfile_0" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=64 \ - --rw=write \ - --size=256M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M ERR_CODE=$? _cleanup_test "loop" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh new file mode 100755 index 000000000000..2e6e2e6978fc --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_05" +ERR_CODE=0 + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "loop" "write and verify test" + +backfile_0=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -q 2 -t loop "$backfile_0") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7177f6c57bc5..a8be24532b24 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -27,20 +27,20 @@ ublk_io_and_remove() _prep_test "stress" "run IO and remove device" -ublk_io_and_remove 8G -t null +ublk_io_and_remove 8G -t null -q 4 ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi BACK_FILE=$(_create_backfile 256M) -ublk_io_and_remove 256M -t loop "${BACK_FILE}" +ublk_io_and_remove 256M -t loop -q 4 "${BACK_FILE}" ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi -ublk_io_and_remove 256M -t loop -z "${BACK_FILE}" +ublk_io_and_remove 256M -t loop -q 4 -z "${BACK_FILE}" ERR_CODE=$? _cleanup_test "stress" _remove_backfile "${BACK_FILE}" diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 2a8e60579a06..2159e4cc8140 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -27,20 +27,20 @@ ublk_io_and_kill_daemon() _prep_test "stress" "run IO and kill ublk server" -ublk_io_and_kill_daemon 8G -t null +ublk_io_and_kill_daemon 8G -t null -q 4 ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi BACK_FILE=$(_create_backfile 256M) -ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}" +ublk_io_and_kill_daemon 256M -t loop -q 4 "${BACK_FILE}" ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi -ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}" +ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${BACK_FILE}" ERR_CODE=$? _cleanup_test "stress" _remove_backfile "${BACK_FILE}" diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index c01f3dc325ab..7e387ef656ea 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -6,6 +6,10 @@ TID="stripe_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "stripe" "write and verify test" backfile_0=$(_create_backfile 256M) @@ -15,15 +19,7 @@ dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") _check_add_dev $TID $? "${backfile_0}" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=32 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M ERR_CODE=$? _cleanup_test "stripe" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh new file mode 100755 index 000000000000..c1b34af36145 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_03" +ERR_CODE=0 + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stripe" "write and verify test" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -q 2 -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE From c78ae7b71ed66a180708377b45042ef77efc840e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:20 +0800 Subject: [PATCH 12/44] selftests: ublk: add test for checking zero copy related parameter ublk zero copy usually requires to set dma and segment parameter correctly, so hard-code null target's dma & segment parameter in non-default value, and verify if they are setup correctly by ublk driver. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-12-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/null.c | 11 +++++++- .../testing/selftests/ublk/test_generic_03.sh | 28 +++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/ublk/test_generic_03.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 7a8c994de244..d98680d64a2f 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -5,6 +5,7 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh TEST_PROGS += test_generic_02.sh +TEST_PROGS += test_generic_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 899875ff50fe..91fec3690d4b 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -17,7 +17,8 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->tgt.dev_size = dev_size; dev->tgt.params = (struct ublk_params) { - .types = UBLK_PARAM_TYPE_BASIC, + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN | + UBLK_PARAM_TYPE_SEGMENT, .basic = { .logical_bs_shift = 9, .physical_bs_shift = 12, @@ -26,6 +27,14 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_sectors = info->max_io_buf_bytes >> 9, .dev_sectors = dev_size >> 9, }, + .dma = { + .alignment = 4095, + }, + .seg = { + .seg_boundary_mask = 4095, + .max_segment_size = 32 << 10, + .max_segments = 32, + }, }; if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh new file mode 100755 index 000000000000..b551aa76cb0d --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_03" +ERR_CODE=0 + +_prep_test "null" "check dma & segment limits for zero copy" + +dev_id=$(_add_ublk_dev -t null -z) +_check_add_dev $TID $? 
+ +sysfs_path=/sys/block/ublkb"${dev_id}" +dma_align=$(cat "$sysfs_path"/queue/dma_alignment) +max_segments=$(cat "$sysfs_path"/queue/max_segments) +max_segment_size=$(cat "$sysfs_path"/queue/max_segment_size) +if [ "$dma_align" != "4095" ]; then + ERR_CODE=255 +fi +if [ "$max_segments" != "32" ]; then + ERR_CODE=255 +fi +if [ "$max_segment_size" != "32768" ]; then + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE From dfbce8b798fb848a42706e2e544b78b3db22aaae Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 12:04:07 -0600 Subject: [PATCH 13/44] ublk: remove unused cmd argument to ublk_dispatch_req() ublk_dispatch_req() never uses its struct io_uring_cmd *cmd argument. Drop it so callers don't have to pass a value. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250328180411.2696494-2-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 355a59c78539..39efe443e235 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1185,7 +1185,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, } static void ublk_dispatch_req(struct ublk_queue *ubq, - struct io_uring_cmd *cmd, struct request *req, unsigned int issue_flags) { @@ -1273,7 +1272,7 @@ static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, struct request *req = blk_mq_tag_to_rq( ubq->dev->tag_set.tags[ubq->q_id], tag); - ublk_dispatch_req(ubq, cmd, req, issue_flags); + ublk_dispatch_req(ubq, req, issue_flags); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) @@ -1292,11 +1291,9 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, struct request *next; while (rq) { - struct ublk_io *io = &ubq->ios[rq->tag]; - next = rq->rq_next; rq->rq_next = NULL; - ublk_dispatch_req(ubq, io->cmd, rq, issue_flags); + ublk_dispatch_req(ubq, rq, issue_flags); rq = next; } } From 9d7fa99189709b80eb16094aad18f7e492b835de Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 12:04:08 -0600 Subject: [PATCH 14/44] ublk: skip 1 NULL check in ublk_cmd_list_tw_cb() loop ublk_cmd_list_tw_cb() is always performed on a non-empty request list. So don't check whether rq is NULL on the first iteration of the loop, just on subsequent iterations. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250328180411.2696494-3-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 39efe443e235..8b9780c0feab 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1290,12 +1290,12 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, struct ublk_queue *ubq = rq->mq_hctx->driver_data; struct request *next; - while (rq) { + do { next = rq->rq_next; rq->rq_next = NULL; ublk_dispatch_req(ubq, rq, issue_flags); rq = next; - } + } while (rq); } static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) From 6a87fc437a034e4be2a63d8dfd4d2985c6c574bc Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 12:04:09 -0600 Subject: [PATCH 15/44] ublk: get ubq from pdu in ublk_cmd_list_tw_cb() Save a few pointer dereferences by obtaining struct ublk_queue *ubq from the ublk_uring_cmd_pdu instead of the request's mq_hctx. 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250328180411.2696494-4-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8b9780c0feab..9276d1fcc100 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1287,7 +1287,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct request *rq = pdu->req_list; - struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct ublk_queue *ubq = pdu->ubq; struct request *next; do { From 108d8aecaeeb52f5fbe98ac94da534954db1da44 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 12:04:10 -0600 Subject: [PATCH 16/44] ublk: avoid redundant io->cmd in ublk_queue_cmd_list() ublk_queue_cmd_list() loads io->cmd twice. The intervening stores prevent the compiler from combining the loads. Since struct ublk_io *io is only used to compute io->cmd, replace the variable with io->cmd. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250328180411.2696494-5-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9276d1fcc100..23250471562a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1301,12 +1301,12 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) { struct request *rq = rq_list_peek(l); - struct ublk_io *io = &ubq->ios[rq->tag]; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd); + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); pdu->req_list = rq; rq_list_init(l); - io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb); + io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } static enum blk_eh_timer_return ublk_timeout(struct request *rq) From 00cfc05cf81f58b1bc2650e18228350a094b1f6d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 12:04:11 -0600 Subject: [PATCH 17/44] ublk: store req in ublk_uring_cmd_pdu for ublk_cmd_tw_cb() Pass struct request *rq to ublk_cmd_tw_cb() through ublk_uring_cmd_pdu, mirroring how it works for ublk_cmd_list_tw_cb(). This saves some pointer dereferences, as well as the bounds check in blk_mq_tag_to_rq(). 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250328180411.2696494-6-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 23250471562a..466a23b89379 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -89,7 +89,10 @@ struct ublk_uring_cmd_pdu { * to avoid extra pre-allocation, and uring_cmd payload is always * free for us */ - struct request *req_list; + union { + struct request *req; + struct request *req_list; + }; /* * The following two are valid in this cmd whole lifetime, and @@ -1268,18 +1271,17 @@ static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - int tag = pdu->tag; - struct request *req = blk_mq_tag_to_rq( - ubq->dev->tag_set.tags[ubq->q_id], tag); - ublk_dispatch_req(ubq, req, issue_flags); + ublk_dispatch_req(ubq, pdu->req, issue_flags); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct ublk_io *io = &ubq->ios[rq->tag]; + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_tw_cb); + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); } static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, From a20b8631c8885cda45a331a151d29a83dfbfdefb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:54 +0000 Subject: [PATCH 18/44] io_uring/net: open code io_sendmsg_copy_hdr() io_sendmsg_setup() is trivial and io_sendmsg_copy_hdr() doesn't add any good abstraction, so open-code one into the other.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/565318ce585665e88053663eeee5178d2c15692f.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 228b4f13d34c..34d103f2469d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -325,25 +325,6 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, return 0; } -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr msg; - int ret; - - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); - if (unlikely(ret)) - return ret; - - if (!(req->flags & REQ_F_BUFFER_SELECT)) - ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, - ITER_SOURCE); - /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control_user; - return ret; -} - void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { struct io_async_msghdr *io = req->async_data; @@ -392,10 +373,19 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct user_msghdr msg; + int ret; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + /* save msg_control as sys_sendmsg() overwrites it */ + sr->msg_control = kmsg->msg.msg_control_user; - return io_sendmsg_copy_hdr(req, kmsg); + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 5f364117db942c15980111f2e8ff6025c7e5893a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:55 +0000 Subject: [PATCH 19/44] io_uring/net: open code io_net_vec_assign() Get rid of io_net_vec_assign() by open coding it into its only caller. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19191c34b5cfe1161f7eeefa6e785418ea9ad56d.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 34d103f2469d..68f87d7c74df 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -176,16 +176,6 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) return hdr; } -/* assign new iovec to kmsg, if we need to */ -static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, - struct iovec *iov) -{ - if (iov) { - req->flags |= REQ_F_NEED_CLEANUP; - io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs); - } -} - static inline void io_mshot_prep_retry(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -217,7 +207,11 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg &iomsg->msg.msg_iter, io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; - io_net_vec_assign(req, iomsg, iov); + + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs); + } return 0; } From c55e2845dfa72e647ed8d9a7b4c6e11a8ed0fc1e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:56 +0000 Subject: [PATCH 20/44] io_uring/net: combine sendzc flags writes Save an instruction / trip to the cache and assign some of the sendzc flags together. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c519d6f406776c3be3ef988a8339a88e45d1ffd9.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 68f87d7c74df..ef0faa07627f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1291,7 +1291,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->done_io = 0; zc->retry = false; - req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1305,7 +1304,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif->cqe.user_data = req->cqe.user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; - req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; zc->flags = READ_ONCE(sqe->ioprio); if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { From 63b16e4f0b90abad500ecb7bc7a625278febdc2c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:57 +0000 Subject: [PATCH 21/44] io_uring/net: unify sendmsg setup with zc io_sendmsg_zc_setup() duplicates parts of io_sendmsg_setup(), and the only difference between them is that the former supports vectored registered buffers, with nothing zerocopy specific. Merge them together; we want regular sendmsg to eventually support fixed buffers either way.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e5ec40f9dc93355399dc6fa0cbc8b31f0b20ac5.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index ef0faa07627f..6d02c8822cc9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -377,32 +377,16 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = kmsg->msg.msg_control_user; + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, + msg.msg_iovlen); + } if (req->flags & REQ_F_BUFFER_SELECT) return 0; return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; - struct user_msghdr msg; - int ret; - - if (!(sr->flags & IORING_RECVSEND_FIXED_BUF)) - return io_sendmsg_setup(req, sqe); - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - - ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); - if (unlikely(ret)) - return ret; - sr->msg_control = kmsg->msg.msg_control_user; - kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; - - return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen); -} - #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1339,7 +1323,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); } - ret = io_sendmsg_zc_setup(req, sqe); + ret = io_sendmsg_setup(req, sqe); if (unlikely(ret)) return ret; From 49dbce5602dc50343c9794d0ddf05d1f6c9cb592 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:58 +0000 Subject: [PATCH 22/44] io_uring/net: clusterise send vs msghdr branches We have multiple branches at prep for send vs sendmsg handling, put them together so that the variant handling is more localised. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/33abf666d9ded74cba4da2f0d9fe58e88520dffe.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 6d02c8822cc9..9acd8d9f80b2 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -395,12 +395,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->done_io = 0; sr->retry = false; - - if (req->opcode != IORING_OP_SEND) { - if (sqe->addr2 || sqe->file_index) - return -EINVAL; - } - sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -426,6 +420,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) return io_send_setup(req, sqe); + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; return io_sendmsg_setup(req, sqe); } @@ -1303,11 +1299,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } } - if (req->opcode != IORING_OP_SEND_ZC) { - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; - } - zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; req->buf_index = READ_ONCE(sqe->buf_index); @@ -1323,6 +1314,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); } + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; ret = io_sendmsg_setup(req, sqe); if (unlikely(ret)) return ret; From ad3f6cc40084f9adb1a53bf386d966073dc6a4e9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:10:59 +0000 Subject: [PATCH 23/44] io_uring/net: set sg_from_iter in advance In preparation to the next patch, set ->sg_from_iter callback at request prep time. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5fe2972701df3bacdb3d760bce195fa640bee201.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 9acd8d9f80b2..749dd298c502 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -97,6 +97,11 @@ struct io_recvzc { struct io_zcrx_ifq *ifq; }; +static int io_sg_from_iter_iovec(struct sk_buff *skb, + struct iov_iter *from, size_t length); +static int io_sg_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -1266,6 +1271,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; + struct io_async_msghdr *iomsg; struct io_kiocb *notif; int ret; @@ -1308,8 +1314,15 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (io_is_compat(req->ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; - if (unlikely(!io_msg_alloc_async(req))) + iomsg = io_msg_alloc_async(req); + if (unlikely(!iomsg)) return -ENOMEM; + + if (zc->flags & IORING_RECVSEND_FIXED_BUF) + iomsg->msg.sg_from_iter = io_sg_from_iter; + else + iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; + if (req->opcode == IORING_OP_SEND_ZC) { req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); @@ -1320,11 +1333,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ret)) return ret; - if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { - struct io_async_msghdr *iomsg = req->async_data; - + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count); - } return 0; } @@ -1391,7 +1401,6 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) ITER_SOURCE, issue_flags); if (unlikely(ret)) return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; } else { ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); if (unlikely(ret)) @@ -1399,7 +1408,6 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) ret = io_notif_account_mem(sr->notif, sr->len); if (unlikely(ret)) return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; } return ret; @@ -1483,8 +1491,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; - if (req->flags & REQ_F_IMPORT_BUFFER) { unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; int ret; @@ -1493,7 +1499,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) &kmsg->vec, uvec_segs, issue_flags); if (unlikely(ret)) return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; req->flags &= ~REQ_F_IMPORT_BUFFER; } From fbe1a30c5d3e6f184ddd63deded6f30c3ecc4c3f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:11:00 +0000 Subject: [PATCH 24/44] io_uring/net: import zc ubuf earlier io_send_setup() already sets up the iterator for IORING_OP_SEND_ZC, we don't need repeating that at issue time. Move it all together with mem accounting at prep time, which is more consistent with how the non-zc version does that. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/eb54f007c493ad9f4ca89aa8e715baf30d83fb88.1743202294.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 749dd298c502..eaa627eddb4a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1318,23 +1318,23 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(!iomsg)) return -ENOMEM; - if (zc->flags & IORING_RECVSEND_FIXED_BUF) - iomsg->msg.sg_from_iter = io_sg_from_iter; - else - iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; - if (req->opcode == IORING_OP_SEND_ZC) { - req->flags |= REQ_F_IMPORT_BUFFER; - return io_send_setup(req, sqe); + if (zc->flags & IORING_RECVSEND_FIXED_BUF) + req->flags |= REQ_F_IMPORT_BUFFER; + ret = io_send_setup(req, sqe); + } else { + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + ret = io_sendmsg_setup(req, sqe); } - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; - ret = io_sendmsg_setup(req, sqe); if (unlikely(ret)) return ret; - if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { + iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count); + } + iomsg->msg.sg_from_iter = io_sg_from_iter; return 0; } @@ -1392,25 +1392,13 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - int ret; - if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - sr->notif->buf_index = req->buf_index; - ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, - (u64)(uintptr_t)sr->buf, sr->len, - ITER_SOURCE, issue_flags); - if (unlikely(ret)) - return ret; - } else { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - ret = io_notif_account_mem(sr->notif, sr->len); - if (unlikely(ret)) - return ret; - } + WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - return ret; + sr->notif->buf_index = req->buf_index; + return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) From b0e9570a6b19fb0e53090489838dc0de27795eb9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:11:49 +0000 Subject: [PATCH 25/44] io_uring/msg: rename io_double_lock_ctx() io_double_lock_ctx() doesn't lock both rings. Rename it to prevent any future confusion. 
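The ordering concern the helper's comment describes can be sketched with pthreads: while holding our own lock we must not block on another context's lock, so only a trylock is safe there. A hypothetical userspace sketch, not the io_uring code itself:

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

/*
 * Sketch: if we already hold our own context's lock, blocking on the
 * target's lock would create an undefined lock order, so only trylock
 * and ask the caller to retry from a safe (unlocked) context.
 */
static int lock_external_ctx(pthread_mutex_t *target, int may_block)
{
	if (may_block) {
		pthread_mutex_lock(target);
		return 0;
	}
	if (pthread_mutex_trylock(target))
		return -EAGAIN; /* contended: caller retries later */
	return 0;
}

int main(void)
{
	pthread_mutex_t target = PTHREAD_MUTEX_INITIALIZER;

	if (lock_external_ctx(&target, 0) == 0) {
		printf("locked external ctx\n");
		pthread_mutex_unlock(&target);
	}
	return 0;
}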
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9e5defa000efd9b0f5e169cbb6bad4994d46ec5c.1743190078.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 0bbcbbcdebfd..bea5a96587b7 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -38,8 +38,8 @@ static void io_double_unlock_ctx(struct io_ring_ctx *octx) mutex_unlock(&octx->uring_lock); } -static int io_double_lock_ctx(struct io_ring_ctx *octx, - unsigned int issue_flags) +static int io_lock_external_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only @@ -154,7 +154,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; } if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) @@ -199,7 +199,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag struct file *src_file = msg->src_file; int ret; - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); From 9cc0bbdaba2a66ad90bc6ce45163b7745baffe98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:11:50 +0000 Subject: [PATCH 26/44] io_uring/msg: initialise msg request opcode It's risky to have msg request opcode set to garbage, so at least initialise it to nop. Later we might want to add a user inaccessible opcode for such cases. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9afe650fcb348414a4529d89f52eb8969ba06efd.1743190078.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index bea5a96587b7..6c51b942d020 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -93,6 +93,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } + req->opcode = IORING_OP_NOP; req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); From ea9106786e264483312b9b270fca1b506223338d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Mar 2025 23:11:51 +0000 Subject: [PATCH 27/44] io_uring: don't pass ctx to tw add remote helper Unlike earlier versions, io_msg_remote_post() creates a valid request with a proper context, so don't pass a context to io_req_task_work_add_remote() explicitly but derive it from the request. 
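The API change is the classic "derive the context from the object" cleanup: a redundant parameter that must always match a field of the object invites mismatches. A tiny C sketch of the before/after shape (invented names):

#include <assert.h>
#include <stdio.h>

struct ctx { int id; };
struct req { struct ctx *ctx; };

/* old shape: caller had to pass a ctx that must equal req->ctx */
static void work_add_old(struct req *r, struct ctx *c)
{
	assert(r->ctx == c); /* redundant parameter invites mismatch */
	printf("queued on ctx %d\n", c->id);
}

/* new shape: derive the ctx from the request itself */
static void work_add(struct req *r)
{
	printf("queued on ctx %d\n", r->ctx->id);
}

int main(void)
{
	struct ctx c = { .id = 1 };
	struct req r = { .ctx = &c };

	work_add_old(&r, &c);
	work_add(&r);
	return 0;
}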
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/721f51cf34996d98b48f0bfd24ad40aa2730167e.1743190078.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++-------- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3ba49c628337..e4484a03e033 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1141,10 +1141,9 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned flags) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { + struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1239,17 +1238,16 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, req->ctx, flags); + io_req_local_work_add(req, flags); else io_req_normal_work_add(req); } -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags) +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) { - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) return; - io_req_local_work_add(req, ctx, flags); + __io_req_task_work_add(req, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 87f883130286..e4050b2d0821 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -89,8 +89,7 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 6c51b942d020..50a958e9c921 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -100,7 +100,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, req->ctx = ctx; req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; - io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); return 0; } From 9a45714fc51321ea8f5e5567f70e06753a848f62 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 13:42:29 -0600 Subject: [PATCH 28/44] ublk: specify io_cmd_buf pointer type io_cmd_buf points to an array of ublksrv_io_desc structs but its type is char *. Indexing the array requires an explicit multiplication and cast. The compiler also can't check the pointer types. Change io_cmd_buf's type to struct ublksrv_io_desc * so it can be indexed directly and the compiler can type-check the code. 
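The before/after difference is easy to see in a standalone snippet (simplified two-field struct, not the real ublksrv_io_desc layout):

#include <stdio.h>
#include <stdlib.h>

struct iod { unsigned op; unsigned len; };

int main(void)
{
	char *raw = calloc(16, sizeof(struct iod));
	struct iod *typed = (struct iod *)raw;
	int tag = 3;

	if (!raw)
		return 1;

	/* old style: manual scaling plus a cast, opaque to the type checker */
	struct iod *a = (struct iod *)&raw[tag * sizeof(struct iod)];
	/* new style: plain array indexing, type-checked by the compiler */
	struct iod *b = &typed[tag];

	printf("%s\n", a == b ? "same element" : "bug");
	free(raw);
	return 0;
}

With the typed pointer, passing the wrong struct or forgetting the scaling simply fails to compile.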
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20250328194230.2726862-2-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 466a23b89379..2fd05c1bd30b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -158,7 +158,7 @@ struct ublk_queue { unsigned long flags; struct task_struct *ubq_daemon; - char *io_cmd_buf; + struct ublksrv_io_desc *io_cmd_buf; bool force_abort; bool timeout; @@ -706,11 +706,11 @@ static inline bool ublk_rq_has_data(const struct request *rq) static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag) { - return (struct ublksrv_io_desc *) - &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return &ubq->io_cmd_buf[tag]; } -static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +static inline struct ublksrv_io_desc * +ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { return ublk_get_queue(ub, q_id)->io_cmd_buf; } From 25aaa81371e7db34ddb42c69ed4f6c5bc8de2afa Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Mar 2025 13:42:30 -0600 Subject: [PATCH 29/44] selftests: ublk: specify io_cmd_buf pointer type Matching the ublk driver, change the type of io_cmd_buf from char * to struct ublksrv_io_desc *. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20250328194230.2726862-3-csander@purestorage.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- tools/testing/selftests/ublk/kublk.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 05147b53c361..83756f97c26e 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -322,7 +322,7 @@ static int ublk_queue_init(struct ublk_queue *q) cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); - q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, + q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ, MAP_SHARED | MAP_POPULATE, dev->fds[0], off); if (q->io_cmd_buf == MAP_FAILED) { ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index f31a5c4d4143..760ff8ffb810 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -128,7 +128,7 @@ struct ublk_queue { unsigned int io_inflight; struct ublk_dev *dev; const struct ublk_tgt_ops *tgt_ops; - char *io_cmd_buf; + struct ublksrv_io_desc *io_cmd_buf; struct io_uring ring; struct ublk_io ios[UBLK_QUEUE_DEPTH]; #define UBLKSRV_QUEUE_STOPPING (1U << 0) @@ -302,7 +302,7 @@ static inline void ublk_mark_io_done(struct ublk_io *io, int res) static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) { - return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return &q->io_cmd_buf[tag]; } static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) From f28a71bc979392234cc110cd1e6787fb5b432116 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 31 Mar 2025 07:06:22 -0600 Subject: [PATCH 30/44] Documentation: ublk: remove dead footnote A previous commit removed the use of this footnote, delete 
it. Reported-by: Stephen Rothwell Fixes: 3fdf2ec7da1c ("Documentation: ublk: Drop Stefan Hajnoczi's message footnote") Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 74c57488dc9a..854f823b46c2 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -349,5 +349,3 @@ References .. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk .. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README - -.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ From 697b2876ac037545ba2761e2ffe9a5c2af6424e6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:54:00 +0100 Subject: [PATCH 31/44] io_uring: add req flag invariant build assertion We're caching some of the file-related request flags in a tricky way; put a build check in to make sure the flags don't get reshuffled. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9877577b83c25dd78224a8274f799187e7ec7639.1743407551.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e4484a03e033..a4065e3d13d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1643,6 +1643,8 @@ io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); + if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) From 487a0710f87e7fa1f260a1b2213b77f0496cea43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:32 +0100 Subject: [PATCH 32/44] io_uring: make zcrx depend on CONFIG_IO_URING Reflect in the kconfig that zcrx requires io_uring to be compiled in. Fixes: 6f377873cb239 ("io_uring/zcrx: add interface queue and refill queue") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8047135a344e79dbd04ee36a7a69cc260aabc2ca.1743356260.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/Kconfig b/io_uring/Kconfig index 9e2a4beba1ef..4b949c42c0bf 100644 --- a/io_uring/Kconfig +++ b/io_uring/Kconfig @@ -5,6 +5,7 @@ config IO_URING_ZCRX def_bool y + depends on IO_URING depends on PAGE_POOL depends on INET depends on NET_RX_BUSY_POLL From 296e16961817e8e5f574661febc608ac0c0c0108 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:47 +0100 Subject: [PATCH 33/44] io_uring: hide cached sqes from drivers There is now an io_uring-private part of the cmd async_data; move the saved sqes into it. Drivers are accessing it via struct io_uring_cmd::cmd.
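The split keeps a driver-visible structure embedded at the start of a core-private one, so the core can always recover its private state while drivers never see it. A simplified sketch (field names and sizes invented):

#include <stdio.h>
#include <stddef.h>

/* public part, visible to drivers */
struct cmd_data {
	void *op_data;
};

/* core-private wrapper; the public part is embedded first */
struct async_cmd {
	struct cmd_data data;	/* drivers only ever get a pointer to this */
	char saved_sqe[128];	/* private cache the core keeps to itself */
};

int main(void)
{
	struct async_cmd ac = { .data = { .op_data = NULL } };
	struct cmd_data *pub = &ac.data;

	/* container_of-style recovery: the core gets back to its state */
	struct async_cmd *priv = (struct async_cmd *)
		((char *)pub - offsetof(struct async_cmd, data));

	printf("%s\n", priv == &ac ? "container recovered" : "bug");
	return 0;
}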
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ecbe078dd57acefdbc4366d083327086c0879378.1743357121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 1 - io_uring/uring_cmd.c | 4 ++-- io_uring/uring_cmd.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e6723fa95160..0634a3de1782 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,7 +21,6 @@ struct io_uring_cmd { struct io_uring_cmd_data { void *op_data; - struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f2cfc371f3d0..89cee2af0ec1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -205,8 +205,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, * that it doesn't read in per-op data, play it safe and ensure that * any SQE data is stable beyond prep. This can later get relaxed. */ - memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = ac->data.sqes; + memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->sqes; return 0; } diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 14e525255854..b04686b6b5d2 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -6,6 +6,7 @@ struct io_async_cmd { struct io_uring_cmd_data data; struct iou_vec vec; + struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); From ed344511c584479ce2130d7e01a9a1e638850b0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:11 +0100 Subject: [PATCH 34/44] io_uring: cleanup {g,s}etsockopt sqe reading Add a local variable for the sqe pointer to avoid repetition.
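The cleanup pattern itself is trivial to demonstrate; the sketch below approximates the kernel's READ_ONCE() with a volatile read (GNU C __typeof__; the struct layout is invented for illustration):

#include <stdio.h>

/* userspace stand-in for the kernel's READ_ONCE() */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

struct sqe { int level; int optname; int optlen; };
struct cmd { const struct sqe *sqe; };

static void parse(const struct cmd *cmd)
{
	/* one local pointer instead of repeating cmd->sqe-> per field */
	const struct sqe *sqe = cmd->sqe;
	int level = READ_ONCE(sqe->level);
	int optname = READ_ONCE(sqe->optname);
	int optlen = READ_ONCE(sqe->optlen);

	printf("level=%d optname=%d optlen=%d\n", level, optname, optlen);
}

int main(void)
{
	struct sqe s = { 1, 20, 4 };
	struct cmd c = { .sqe = &s };

	parse(&c);
	return 0;
}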
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8dbac0f9acda2d3842534eeb7ce10d9276b021ae.1743357108.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 89cee2af0ec1..a9ea7d29cdd9 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -307,17 +307,18 @@ static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct io_uring_sqe *sqe = cmd->sqe; bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optlen, optname, level, err; void __user *optval; - level = READ_ONCE(cmd->sqe->level); + level = READ_ONCE(sqe->level); if (level != SOL_SOCKET) return -EOPNOTSUPP; - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); err = do_sock_getsockopt(sock, compat, level, optname, USER_SOCKPTR(optval), @@ -333,15 +334,16 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct io_uring_sqe *sqe = cmd->sqe; bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optname, optlen, level; void __user *optval; sockptr_t optval_s; - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); - level = READ_ONCE(cmd->sqe->level); + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); optval_s = USER_SOCKPTR(optval); return do_sock_setsockopt(sock, compat, level, optname, optval_s, From a1fbe0a12178a006b04a7fa528457f9901d6c6d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 19:40:21 +0100 Subject: [PATCH 35/44] io_uring/rsrc: check size when importing reg buffer We're relying on callers to verify the IO size, do it inside of io_import_fixed() instead. It's safer, easier to deal with, and more consistent as now it's done close to the iter init site. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f9c2c75ec4d356a0c61289073f68d98e8a9db190.1743446271.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3f195e24777e..59b4317b04a7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1016,6 +1016,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (unlikely(len > MAX_RW_COUNT)) + return -EFAULT; if (!(imu->dir & (1 << ddir))) return -EFAULT; From 81ed18015d65f111ddbc88599c48338a5e1927d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:17:58 +0100 Subject: [PATCH 36/44] io_uring/net: avoid import_ubuf for regvec send With registered buffers we set up iterators in helpers like io_import_fixed(), and there is no need for an import_ubuf() before that. It was fine while we used real pointers for offset calculation, but that's no longer the case since the introduction of ublk kernel buffers.
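The shape of the guard is a plain early return for the registered-buffer case, since the fixed-buffer path builds its own iterator later and the "address" may not be a dereferenceable pointer at all. A hypothetical sketch of that control flow:

#include <stdio.h>
#include <stddef.h>

#define FLAG_FIXED_BUF (1u << 0)

struct req { unsigned flags; const void *buf; size_t len; };

/*
 * Generic setup: skip the plain-pointer import when a registered
 * buffer is used; the fixed-buffer helper validates and imports it
 * on its own later.
 */
static int send_setup(struct req *r)
{
	if (r->flags & FLAG_FIXED_BUF)
		return 0; /* fixed import happens in its own helper */
	if (!r->buf)
		return -1; /* only real user pointers are imported here */
	return 0;
}

int main(void)
{
	struct req kbuf = { .flags = FLAG_FIXED_BUF, .buf = (void *)0x10 };
	struct req ubuf = { .flags = 0, .buf = "data", .len = 4 };

	printf("%d %d\n", send_setup(&kbuf), send_setup(&ubuf));
	return 0;
}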
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9b2de1a50844f848f62c8de609b494971033a6b9.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index eaa627eddb4a..24040bc3916a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -359,6 +359,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } + if (sr->flags & IORING_RECVSEND_FIXED_BUF) + return 0; if (!io_do_buffer_select(req)) { ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); if (unlikely(ret)) return ret; From fcfd94d6967a98e88b834c9fd81e73c5f04d83dc Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 1 Apr 2025 12:53:55 -0700 Subject: [PATCH 37/44] io_uring/zcrx: return early from io_zcrx_recv_skb if readlen is 0 When readlen is set for a recvzc request, tcp_read_sock() will call io_zcrx_recv_skb() one final time with len == desc->count == 0. This is caused by the !desc->count check happening too late. The offset + 1 != skb->len check happens earlier and causes the while loop to continue. Fix this in io_zcrx_recv_skb() instead of tcp_read_sock(). Return early if len is 0, i.e. the read is done. Fixes: 6699ec9a23f8 ("io_uring/zcrx: add a read limit to recvzc requests") Signed-off-by: David Wei Link: https://lore.kernel.org/r/20250401195355.1613813-1-dw@davidwei.uk Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9c95b5b6ec4e..80d4a6f71d29 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -818,6 +818,14 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, int ret = 0; len = min_t(size_t, len, desc->count); + /* + * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even + * if desc->count is already 0. This is caused by the if (offset + 1 != + * skb->len) check. Return early in this case to break out of + * __tcp_read_sock(). + */ + if (!len) return 0; if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) return -EAGAIN; From 53c959295bc3cccc1d9eec6711a5fcdd28602fc5 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Tue, 1 Apr 2025 14:49:08 -0600 Subject: [PATCH 38/44] selftests: ublk: kublk: use ioctl-encoded opcodes There are a couple of places in the kublk selftests ublk server which use the legacy ublk opcodes. These operations fail (with -EOPNOTSUPP) on a kernel compiled without CONFIG_BLKDEV_UBLK_LEGACY_OPCODES set. We could easily require it to be set as a prerequisite for these selftests, but since new applications should not be using the legacy opcodes, use the ioctl-encoded opcodes everywhere in kublk.
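Ioctl-encoded opcodes fold the transfer direction, a magic byte, a command number, and the payload size into a single value, which is what lets the kernel sanity-check commands instead of trusting a bare number. A Linux-only sketch with an invented payload struct and invented magic/number (not the real ublk definitions):

#include <stdio.h>
#include <linux/ioctl.h>

/* hypothetical command payload, for illustration only */
struct demo_ctrl_cmd { unsigned long long addr; unsigned short len; };

/* legacy style: a bare opcode number */
#define DEMO_CMD_GET_PARAMS	0x09
/* ioctl-encoded style: direction, magic, number and size all encoded */
#define DEMO_U_CMD_GET_PARAMS	_IOWR('u', 0x09, struct demo_ctrl_cmd)

int main(void)
{
	printf("legacy opcode:  %#x\n", DEMO_CMD_GET_PARAMS);
	printf("ioctl-encoded:  %#lx\n", (unsigned long)DEMO_U_CMD_GET_PARAMS);
	return 0;
}

A mismatched payload size or direction then produces a different encoded value, so stale or wrong commands are rejected rather than silently accepted.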
Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250401-ublk_selftests-v1-1-98129c9bc8bb@purestorage.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 83756f97c26e..d39f166c9dc3 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -99,7 +99,7 @@ static int __ublk_ctrl_cmd(struct ublk_dev *dev, static int ublk_ctrl_stop_dev(struct ublk_dev *dev) { struct ublk_ctrl_cmd_data data = { - .cmd_op = UBLK_CMD_STOP_DEV, + .cmd_op = UBLK_U_CMD_STOP_DEV, }; return __ublk_ctrl_cmd(dev, &data); @@ -169,7 +169,7 @@ static int ublk_ctrl_get_params(struct ublk_dev *dev, struct ublk_params *params) { struct ublk_ctrl_cmd_data data = { - .cmd_op = UBLK_CMD_GET_PARAMS, + .cmd_op = UBLK_U_CMD_GET_PARAMS, .flags = CTRL_CMD_HAS_BUF, .addr = (__u64)params, .len = sizeof(*params), From f8554f512b8a1ecaa354fd7b1dd63906759e7335 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Tue, 1 Apr 2025 14:49:09 -0600 Subject: [PATCH 39/44] selftests: ublk: kublk: fix an error log line When doing io_uring operations using liburing, errno is not used to indicate errors, so the %m format specifier does not provide any relevant information for failed io_uring commands. Fix a log line emitted on get_params failure to translate the error code returned in the cqe->res field instead. Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250401-ublk_selftests-v1-2-98129c9bc8bb@purestorage.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index d39f166c9dc3..91c282bc7674 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -215,7 +215,7 @@ static void ublk_ctrl_dump(struct ublk_dev *dev) ret = ublk_ctrl_get_params(dev, &p); if (ret < 0) { - ublk_err("failed to get params %m\n"); + ublk_err("failed to get params %d %s\n", ret, strerror(-ret)); return; } From 8622b20f23ed165f48b8ca61503a107d17f8d585 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:50 +0800 Subject: [PATCH 40/44] io_uring: add validate_fixed_range() for validating fixed buffers Add a validate_fixed_range() helper for validating fixed buffer ranges.
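The helper's job can be shown in isolation: reject (addr, len) windows that overflow or fall outside the registered region. A sketch using GCC/Clang's __builtin_add_overflow; the registered region bounds are assumed to have been validated at registration time, as the kernel does:

#include <stdio.h>
#include <stdint.h>

/* validate (buf_addr, len) against a registered region [ubuf, ubuf + ulen) */
static int validate_fixed_range(uint64_t buf_addr, uint64_t len,
				uint64_t ubuf, uint64_t ulen)
{
	uint64_t buf_end;

	if (__builtin_add_overflow(buf_addr, len, &buf_end))
		return -1;	/* addition wrapped around */
	if (buf_addr < ubuf || buf_end > ubuf + ulen)
		return -1;	/* not inside the mapped region */
	return 0;
}

int main(void)
{
	printf("%d\n", validate_fixed_range(0x1000, 0x100, 0x1000, 0x1000)); /* ok: 0 */
	printf("%d\n", validate_fixed_range(UINT64_MAX, 2, 0, 64));          /* overflow: -1 */
	return 0;
}

Centralizing this in one helper is what lets both the single-buffer and the vectored import paths share identical checks.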
Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250325135155.935398-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 59b4317b04a7..1e56cd3f55d1 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1002,15 +1002,11 @@ unlock: } EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); -static int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +static int validate_fixed_range(u64 buf_addr, size_t len, + const struct io_mapped_ubuf *imu) { u64 buf_end; - size_t offset; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ @@ -1018,6 +1014,21 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, return -EFAULT; if (unlikely(len > MAX_RW_COUNT)) return -EFAULT; + return 0; +} + +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + size_t offset; + int ret; + + if (WARN_ON_ONCE(!imu)) + return -EFAULT; + ret = validate_fixed_range(buf_addr, len, imu); + if (unlikely(ret)) + return ret; if (!(imu->dir & (1 << ddir))) return -EFAULT; @@ -1307,12 +1318,12 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; struct bio_vec *src_bvec; size_t offset; - u64 buf_end; + int ret; + + ret = validate_fixed_range(buf_addr, iov_len, imu); + if (unlikely(ret)) + return ret; - if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) - return -EFAULT; - if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) - return -EFAULT; if (unlikely(!iov_len)) return -EFAULT; if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) From 149974fdb8e186a1c72b87cee806373624a8a375 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:51 +0800 Subject: [PATCH 41/44] block: add for_each_mp_bvec() Add a for_each_mp_bvec() helper for io_uring to import fixed kernel buffers. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250325135155.935398-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/bvec.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ba8f52d48b94..204b22a99c4b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) +#define for_each_mp_bvec(bvl, bio_vec, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) + /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ { \ From 1045afae4b8892dab99d320b17bff3b9c1f407d8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:52 +0800 Subject: [PATCH 42/44] io_uring: support vectored kernel fixed buffer io_uring already supports fixed kernel buffers via io_buffer_register_bvec() and io_buffer_unregister_bvec(). Vectored fixed buffers are supported as well, so it is natural to also support vectored fixed kernel buffers; one use case is ublk.
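Filling bvecs from a vectored kernel buffer is essentially mapping each (offset, length) window onto the underlying segments. A small sketch of that walk (plain C with invented types, not the kernel's bvec machinery):

#include <stdio.h>
#include <stddef.h>

struct seg { const char *base; size_t len; };

/*
 * Sketch: map a byte range (off, len) of a segmented buffer onto the
 * underlying segments, emitting one slice per overlap -- roughly what
 * building bvecs from a registered kernel buffer amounts to.
 */
static void fill_slices(const struct seg *segs, int nr_segs,
			size_t off, size_t len)
{
	int i;

	for (i = 0; i < nr_segs && len; i++) {
		size_t seg_len = segs[i].len;

		if (off >= seg_len) {	/* range starts past this segment */
			off -= seg_len;
			continue;
		}
		size_t take = seg_len - off;
		if (take > len)
			take = len;
		printf("seg %d: skip %zu, take %zu\n", i, off, take);
		len -= take;
		off = 0;
	}
}

int main(void)
{
	struct seg segs[] = {
		{ "aaaa", 4 }, { "bbbbbb", 6 }, { "cc", 2 },
	};

	fill_slices(segs, 3, 2, 9); /* a window spanning all three segments */
	return 0;
}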
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250325135155.935398-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1e56cd3f55d1..5e64a8bb30a4 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1362,6 +1362,82 @@ static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, return max_segs; } +static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + const struct bio_vec *src_bvec = imu->bvec; + struct bio_vec *res_bvec = vec->bvec; + unsigned res_idx = 0; + size_t total_len = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; + size_t iov_len = iovec[iov_idx].iov_len; + struct bvec_iter bi = { + .bi_size = offset + iov_len, + }; + struct bio_vec bv; + + bvec_iter_advance(src_bvec, &bi, offset); + for_each_mp_bvec(bv, src_bvec, bi, bi) + res_bvec[res_idx++] = bv; + total_len += iov_len; + } + iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); + return 0; +} + +static int iov_kern_bvec_size(const struct iovec *iov, + const struct io_mapped_ubuf *imu, + unsigned int *nr_seg) +{ + size_t offset = (size_t)(uintptr_t)iov->iov_base; + const struct bio_vec *bvec = imu->bvec; + int start = 0, i = 0; + size_t off = 0; + int ret; + + ret = validate_fixed_range(offset, iov->iov_len, imu); + if (unlikely(ret)) + return ret; + + for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; + off += bvec[i].bv_len, i++) { + if (offset >= off && offset < off + bvec[i].bv_len) + start = i; + } + *nr_seg = i - start; + return 0; +} + +static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu, unsigned *nr_segs) +{ + unsigned max_segs = 0; + size_t total_len = 0; + unsigned i; + int ret; + + *nr_segs = 0; + for (i = 0; i < nr_iovs; i++) { + if (unlikely(!iov[i].iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov[i].iov_len, + &total_len))) + return -EOVERFLOW; + ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); + if (unlikely(ret)) + return ret; + *nr_segs += max_segs; + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + return 0; +} + int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned issue_flags) @@ -1376,14 +1452,20 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, if (!node) return -EFAULT; imu = node->buf; - if (imu->is_kbuf) - return -EOPNOTSUPP; if (!(imu->dir & (1 << ddir))) return -EFAULT; iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off; - nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (imu->is_kbuf) { + int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); + + if (unlikely(ret)) + return ret; + } else { + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + } if (sizeof(struct bio_vec) > sizeof(struct iovec)) { size_t bvec_bytes; @@ -1410,6 +1492,9 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, req->flags |= REQ_F_NEED_CLEANUP; } + if (imu->is_kbuf) + return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); } From 57ed58c1325690ff6a46da776e9b42b14a7c37b1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:53 +0800 Subject: [PATCH 
43/44] selftests: ublk: enable zero copy for stripe target Use io_uring vectored fixed kernel buffer for handling stripe IO. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250325135155.935398-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/stripe.c | 69 ++++++++++++++++++++------- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index d98680d64a2f..c7781efea0f3 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -17,6 +17,7 @@ TEST_PROGS += test_loop_05.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh +TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 98c564b12f3c..179731c3dd6f 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -111,43 +111,67 @@ static void calculate_stripe_array(const struct stripe_conf *conf, } } -static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) +static inline enum io_uring_op stripe_to_uring_op( + const struct ublksrv_io_desc *iod, int zc) { unsigned ublk_op = ublksrv_get_op(iod); if (ublk_op == UBLK_IO_OP_READ) - return IORING_OP_READV; + return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) - return IORING_OP_WRITEV; + return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { const struct stripe_conf *conf = get_chunk_shift(q); - enum io_uring_op op = stripe_to_uring_op(iod); + int zc = !!(ublk_queue_use_zc(q) != 0); + enum io_uring_op op = stripe_to_uring_op(iod, zc); struct io_uring_sqe *sqe[NR_STRIPE]; struct stripe_array *s = alloc_stripe_array(conf, iod); struct ublk_io *io = ublk_get_io(q, tag); - int i; + int i, extra = zc ? 
2 : 0; io->private_data = s; calculate_stripe_array(conf, iod, s); - ublk_queue_alloc_sqes(q, sqe, s->nr); - for (i = 0; i < s->nr; i++) { - struct stripe *t = &s->s[i]; + ublk_queue_alloc_sqes(q, sqe, s->nr + extra); + + if (zc) { + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + } + + for (i = zc; i < s->nr + extra - zc; i++) { + struct stripe *t = &s->s[i - zc]; io_uring_prep_rw(op, sqe[i], t->seq + 1, (void *)t->vec, t->nr_vec, t->start << 9); - io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + if (zc) { + sqe[i]->buf_index = tag; + io_uring_sqe_set_flags(sqe[i], + IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK); + } else { + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + } /* bit63 marks us as tgt io */ - sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1); } - return s->nr; + if (zc) { + struct io_uring_sqe *unreg = sqe[s->nr + 1]; + + io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag); + unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1); + } + + /* register buffer is skip_success */ + return s->nr + zc; } static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) @@ -208,19 +232,27 @@ static void ublk_stripe_io_done(struct ublk_queue *q, int tag, struct ublk_io *io = ublk_get_io(q, tag); int res = cqe->res; - if (res < 0) { + if (res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { if (!io->result) io->result = res; - ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); + if (res < 0) + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); } + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + /* fail short READ/WRITE simply */ if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { unsigned seq = user_data_to_tgt_data(cqe->user_data); struct stripe_array *s = io->private_data; - if (res < s->s[seq].vec->iov_len) + if (res < s->s[seq].nr_sects << 9) { io->result = -EIO; + ublk_err("%s: short rw op %u res %d exp %u tag %u\n", + __func__, op, res, s->s[seq].vec->iov_len, tag); + } } if (ublk_completed_tgt_io(q, tag)) { @@ -253,7 +285,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) struct stripe_conf *conf; unsigned chunk_shift; loff_t bytes = 0; - int ret, i; + int ret, i, mul = 1; if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); @@ -295,8 +327,11 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; - dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; - dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) + mul = 2; + dev->tgt.sq_depth = mul * dev->dev_info.queue_depth * conf->nr_files; + dev->tgt.cq_depth = mul * dev->dev_info.queue_depth * conf->nr_files; printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); From 390513642ee6763c7ada07f0a1470474986e6c1c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 3 Apr 2025 12:29:30 +0100 Subject: [PATCH 44/44] io_uring: always do atomic put from iowq io_uring always switches requests to atomic refcounting for iowq execution before 
there is any parallelism by setting REQ_F_REFCOUNT, and the flag is not cleared until the request completes. That should be fine as long as the compiler doesn't make up a non-existing value for the flags; however, KCSAN still complains when the request owner changes other flag bits: BUG: KCSAN: data-race in io_req_task_cancel / io_wq_free_work ... read to 0xffff888117207448 of 8 bytes by task 3871 on cpu 0: req_ref_put_and_test io_uring/refs.h:22 [inline] Skip REQ_F_REFCOUNT checks for iowq, we know it's set. Reported-by: syzbot+903a2ad71fb3f1e47cf5@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d880bc27fb8c3209b54641be4ff6ac02b0e5789a.1743679736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/refs.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a4065e3d13d0..c6209fe44cb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1796,7 +1796,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { + if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); diff --git a/io_uring/refs.h b/io_uring/refs.h index 63982ead9f7d..0d928d87c4ed 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -17,6 +17,13 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } +static inline bool req_ref_put_and_test_atomic(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(data_race(req->flags) & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT)))