diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 1e0e7358e14a..854f823b46c2 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -309,18 +309,35 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. -Future development -================== - Zero copy --------- -Zero copy is a generic requirement for nbd, fuse or similar drivers. A -problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace -can't be remapped any more in kernel with existing mm interfaces. This can -occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that -big requests (IO size >= 256 KB) may benefit a lot from zero copy. +ublk zero copy relies on io_uring's fixed kernel buffer, which provides +two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`. +ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call +`io_buffer_register_bvec()` for ublk server to register client request +buffer into io_uring buffer table, then ublk server can submit io_uring +IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF` +calls `io_buffer_unregister_bvec()` to unregister the buffer, which is +guaranteed to be live between calling `io_buffer_register_bvec()` and +`io_buffer_unregister_bvec()`. Any io_uring operation which supports this +kind of kernel buffer will grab one reference of the buffer until the +operation is completed. + +ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and +be trusted, because it is ublk server's responsibility to make sure IO buffer +filled with data for handling read command, and ublk server has to return +correct result to ublk driver when handling READ command, and the result +has to match with how many bytes filled to the IO buffer. Otherwise, +uninitialized kernel IO buffer will be exposed to client application. + +ublk server needs to align the parameter of `struct ublk_param_dma_align` +with backend for zero copy to work correctly. + +For reaching best IO performance, ublk server should align its segment +parameter of `struct ublk_param_segment` with backend for avoiding +unnecessary IO split, which usually hurts io_uring performance. References ========== @@ -332,5 +349,3 @@ References .. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk .. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README - -.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c060da409ed8..2fd05c1bd30b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -74,13 +74,30 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) struct ublk_rq_data { struct kref ref; }; struct ublk_uring_cmd_pdu { + /* + * Store requests in same batch temporarily for queuing them to + * daemon context. + * + * It should have been stored to request payload, but we do want + * to avoid extra pre-allocation, and uring_cmd payload is always + * free for us + */ + union { + struct request *req; + struct request *req_list; + }; + + /* + * The following two are valid in this cmd whole lifetime, and + * setup in ublk uring_cmd handler + */ struct ublk_queue *ubq; u16 tag; }; @@ -141,10 +158,8 @@ struct ublk_queue { unsigned long flags; struct task_struct *ubq_daemon; - char *io_cmd_buf; + struct ublksrv_io_desc *io_cmd_buf; - unsigned long io_addr; /* mapped vm address */ - unsigned int max_io_sz; bool force_abort; bool timeout; bool canceling; @@ -582,6 +597,18 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + const struct ublk_param_segment *p = &ub->params.seg; + + if (!is_power_of_2(p->seg_boundary_mask + 1)) + return -EINVAL; + + if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + } + return 0; } @@ -598,6 +625,11 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } +static inline bool ublk_need_map_io(const struct ublk_queue *ubq) +{ + return !ublk_support_user_copy(ubq); +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { /* @@ -674,11 +706,11 @@ static inline bool ublk_rq_has_data(const struct request *rq) static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag) { - return (struct ublksrv_io_desc *) - &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return &ubq->io_cmd_buf[tag]; } -static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +static inline struct ublksrv_io_desc * +ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { return ublk_get_queue(ub, q_id)->io_cmd_buf; } @@ -925,7 +957,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; /* @@ -949,7 +981,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; if (ublk_need_unmap_req(req)) { @@ -1037,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( struct io_uring_cmd *ioucmd) { - return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; + return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) @@ -1155,14 +1187,11 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_dispatch_req(struct ublk_queue *ubq, + struct request *req, + unsigned int issue_flags) { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - struct ublk_queue *ubq = pdu->ubq; - int tag = pdu->tag; - struct request *req = blk_mq_tag_to_rq( - ubq->dev->tag_set.tags[ubq->q_id], tag); + int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; unsigned int mapped_bytes; @@ -1237,11 +1266,49 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } +static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + + ublk_dispatch_req(ubq, pdu->req, issue_flags); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct ublk_io *io = &ubq->ios[rq->tag]; + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); +} + +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct request *rq = pdu->req_list; + struct ublk_queue *ubq = pdu->ubq; + struct request *next; + + do { + next = rq->rq_next; + rq->rq_next = NULL; + ublk_dispatch_req(ubq, rq, issue_flags); + rq = next; + } while (rq); +} + +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + struct request *rq = rq_list_peek(l); + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req_list = rq; + rq_list_init(l); + io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } static enum blk_eh_timer_return ublk_timeout(struct request *rq) @@ -1282,21 +1349,12 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_RESET_TIMER; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; - if (unlikely(ubq->fail_io)) { + if (unlikely(ubq->fail_io)) return BLK_STS_TARGET; - } - - /* fill iod to slot in io cmd buffer */ - res = ublk_setup_iod(ubq, rq); - if (unlikely(res != BLK_STS_OK)) - return BLK_STS_IOERR; /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to @@ -1310,17 +1368,68 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; + if (unlikely(ubq->canceling)) + return BLK_STS_IOERR; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + blk_mq_start_request(rq); + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; + + res = ublk_prep_req(ubq, rq); + if (res != BLK_STS_OK) + return res; + + /* + * ->canceling has to be handled after ->force_abort and ->fail_io + * is dealt with, otherwise this request may not be failed in case + * of recovery, and cause hang when deleting disk + */ if (unlikely(ubq->canceling)) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - blk_mq_start_request(bd->rq); ublk_queue_cmd(ubq, rq); - return BLK_STS_OK; } +static void ublk_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + + if (ublk_prep_req(ubq, req) == BLK_STS_OK) + rq_list_add_tail(&submit_list, req); + else + rq_list_add_tail(&requeue_list, req); + } + + if (ubq && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1333,6 +1442,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_queue_rqs, .init_hctx = ublk_init_hctx, .timeout = ublk_timeout, }; @@ -1446,18 +1556,28 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) } } -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) +/* Must be called when queue is frozen */ +static bool ublk_mark_queue_canceling(struct ublk_queue *ubq) { - struct gendisk *disk; + bool canceled; spin_lock(&ubq->cancel_lock); - if (ubq->canceling) { - spin_unlock(&ubq->cancel_lock); - return false; - } - ubq->canceling = true; + canceled = ubq->canceling; + if (!canceled) + ubq->canceling = true; spin_unlock(&ubq->cancel_lock); + return canceled; +} + +static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) +{ + bool was_canceled = ubq->canceling; + struct gendisk *disk; + + if (was_canceled) + return false; + spin_lock(&ub->lock); disk = ub->ub_disk; if (disk) @@ -1468,14 +1588,23 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) if (!disk) return false; - /* Now we are serialized with ublk_queue_rq() */ + /* + * Now we are serialized with ublk_queue_rq() + * + * Make sure that ubq->canceling is set when queue is frozen, + * because ublk_queue_rq() has to rely on this flag for avoiding to + * touch completed uring_cmd + */ blk_mq_quiesce_queue(disk->queue); - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); + was_canceled = ublk_mark_queue_canceling(ubq); + if (!was_canceled) { + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } blk_mq_unquiesce_queue(disk->queue); put_device(disk_to_dev(disk)); - return true; + return !was_canceled; } static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, @@ -1845,7 +1974,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * FETCH_RQ has to provide IO buffer if NEED GET * DATA is not enabled @@ -1867,7 +1996,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. @@ -2343,6 +2472,12 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) lim.dma_alignment = ub->params.dma.alignment; + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; + lim.max_segment_size = ub->params.seg.max_segment_size; + lim.max_segments = ub->params.seg.max_segments; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ba8f52d48b94..204b22a99c4b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) +#define for_each_mp_bvec(bvl, bio_vec, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) + /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ { \ diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e6723fa95160..0634a3de1782 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,7 +21,6 @@ struct io_uring_cmd { struct io_uring_cmd_data { void *op_data; - struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 7255b36b5cf6..583b86681c93 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -410,6 +410,29 @@ struct ublk_param_dma_align { __u8 pad[4]; }; +#define UBLK_MIN_SEGMENT_SIZE 4096 +/* + * If any one of the three segment parameter is set as 0, the behavior is + * undefined. + */ +struct ublk_param_segment { + /* + * seg_boundary_mask + 1 needs to be power_of_2(), and the sum has + * to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u64 seg_boundary_mask; + + /* + * max_segment_size could be override by virt_boundary_mask, so be + * careful when setting both. + * + * max_segment_size has to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u32 max_segment_size; + __u16 max_segments; + __u8 pad[2]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -423,6 +446,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) +#define UBLK_PARAM_TYPE_SEGMENT (1 << 5) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -430,6 +454,7 @@ struct ublk_params { struct ublk_param_devt devt; struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; + struct ublk_param_segment seg; }; #endif diff --git a/io_uring/Kconfig b/io_uring/Kconfig index 9e2a4beba1ef..4b949c42c0bf 100644 --- a/io_uring/Kconfig +++ b/io_uring/Kconfig @@ -5,6 +5,7 @@ config IO_URING_ZCRX def_bool y + depends on IO_URING depends on PAGE_POOL depends on INET depends on NET_RX_BUSY_POLL diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3ba49c628337..c6209fe44cb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1141,10 +1141,9 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned flags) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { + struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1239,17 +1238,16 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, req->ctx, flags); + io_req_local_work_add(req, flags); else io_req_normal_work_add(req); } -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags) +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) { - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) return; - io_req_local_work_add(req, ctx, flags); + __io_req_task_work_add(req, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1645,6 +1643,8 @@ io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); + if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) @@ -1796,7 +1796,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { + if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 87f883130286..e4050b2d0821 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -89,8 +89,7 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 0bbcbbcdebfd..50a958e9c921 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -38,8 +38,8 @@ static void io_double_unlock_ctx(struct io_ring_ctx *octx) mutex_unlock(&octx->uring_lock); } -static int io_double_lock_ctx(struct io_ring_ctx *octx, - unsigned int issue_flags) +static int io_lock_external_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only @@ -93,13 +93,14 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } + req->opcode = IORING_OP_NOP; req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; - io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); return 0; } @@ -154,7 +155,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; } if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) @@ -199,7 +200,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag struct file *src_file = msg->src_file; int ret; - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); diff --git a/io_uring/net.c b/io_uring/net.c index 8944eb679024..24040bc3916a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -97,6 +97,11 @@ struct io_recvzc { struct io_zcrx_ifq *ifq; }; +static int io_sg_from_iter_iovec(struct sk_buff *skb, + struct iov_iter *from, size_t length); +static int io_sg_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -176,16 +181,6 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) return hdr; } -/* assign new iovec to kmsg, if we need to */ -static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, - struct iovec *iov) -{ - if (iov) { - req->flags |= REQ_F_NEED_CLEANUP; - io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs); - } -} - static inline void io_mshot_prep_retry(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -217,7 +212,11 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg &iomsg->msg.msg_iter, io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; - io_net_vec_assign(req, iomsg, iov); + + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs); + } return 0; } @@ -325,25 +324,6 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, return 0; } -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr msg; - int ret; - - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); - if (unlikely(ret)) - return ret; - - if (!(req->flags & REQ_F_BUFFER_SELECT)) - ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, - ITER_SOURCE); - /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control_user; - return ret; -} - void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { struct io_async_msghdr *io = req->async_data; @@ -379,6 +359,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } + if (sr->flags & IORING_RECVSEND_FIXED_BUF) + return 0; if (!io_do_buffer_select(req)) { ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); @@ -389,34 +371,27 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - - return io_sendmsg_copy_hdr(req, kmsg); -} - -static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct user_msghdr msg; int ret; - if (!(sr->flags & IORING_RECVSEND_FIXED_BUF)) - return io_sendmsg_setup(req, sqe); - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); if (unlikely(ret)) return ret; + /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = kmsg->msg.msg_control_user; - kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; - return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen); + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, + msg.msg_iovlen); + } + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) @@ -427,12 +402,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->done_io = 0; sr->retry = false; - - if (req->opcode != IORING_OP_SEND) { - if (sqe->addr2 || sqe->file_index) - return -EINVAL; - } - sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -458,6 +427,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) return io_send_setup(req, sqe); + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; return io_sendmsg_setup(req, sqe); } @@ -1302,11 +1273,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; + struct io_async_msghdr *iomsg; struct io_kiocb *notif; + int ret; zc->done_io = 0; zc->retry = false; - req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1320,7 +1292,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif->cqe.user_data = req->cqe.user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; - req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; zc->flags = READ_ONCE(sqe->ioprio); if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { @@ -1335,11 +1307,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } } - if (req->opcode != IORING_OP_SEND_ZC) { - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; - } - zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; req->buf_index = READ_ONCE(sqe->buf_index); @@ -1349,13 +1316,28 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (io_is_compat(req->ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; - if (unlikely(!io_msg_alloc_async(req))) + iomsg = io_msg_alloc_async(req); + if (unlikely(!iomsg)) return -ENOMEM; + if (req->opcode == IORING_OP_SEND_ZC) { - req->flags |= REQ_F_IMPORT_BUFFER; - return io_send_setup(req, sqe); + if (zc->flags & IORING_RECVSEND_FIXED_BUF) + req->flags |= REQ_F_IMPORT_BUFFER; + ret = io_send_setup(req, sqe); + } else { + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + ret = io_sendmsg_setup(req, sqe); } - return io_sendmsg_zc_setup(req, sqe); + if (unlikely(ret)) + return ret; + + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { + iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; + return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count); + } + iomsg->msg.sg_from_iter = io_sg_from_iter; + return 0; } static int io_sg_from_iter_iovec(struct sk_buff *skb, @@ -1412,27 +1394,13 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - int ret; - if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - sr->notif->buf_index = req->buf_index; - ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, - (u64)(uintptr_t)sr->buf, sr->len, - ITER_SOURCE, issue_flags); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; - } else { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - ret = io_notif_account_mem(sr->notif, sr->len); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; - } + WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - return ret; + sr->notif->buf_index = req->buf_index; + return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1513,8 +1481,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; - if (req->flags & REQ_F_IMPORT_BUFFER) { unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; int ret; @@ -1523,7 +1489,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) &kmsg->vec, uvec_segs, issue_flags); if (unlikely(ret)) return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; req->flags &= ~REQ_F_IMPORT_BUFFER; } diff --git a/io_uring/refs.h b/io_uring/refs.h index 63982ead9f7d..0d928d87c4ed 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -17,6 +17,13 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } +static inline bool req_ref_put_and_test_atomic(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(data_race(req->flags) & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT))) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3f195e24777e..5e64a8bb30a4 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1002,20 +1002,33 @@ unlock: } EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); -static int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +static int validate_fixed_range(u64 buf_addr, size_t len, + const struct io_mapped_ubuf *imu) { u64 buf_end; - size_t offset; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (unlikely(len > MAX_RW_COUNT)) + return -EFAULT; + return 0; +} + +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + size_t offset; + int ret; + + if (WARN_ON_ONCE(!imu)) + return -EFAULT; + ret = validate_fixed_range(buf_addr, len, imu); + if (unlikely(ret)) + return ret; if (!(imu->dir & (1 << ddir))) return -EFAULT; @@ -1305,12 +1318,12 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; struct bio_vec *src_bvec; size_t offset; - u64 buf_end; + int ret; + + ret = validate_fixed_range(buf_addr, iov_len, imu); + if (unlikely(ret)) + return ret; - if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) - return -EFAULT; - if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) - return -EFAULT; if (unlikely(!iov_len)) return -EFAULT; if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) @@ -1349,6 +1362,82 @@ static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, return max_segs; } +static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + const struct bio_vec *src_bvec = imu->bvec; + struct bio_vec *res_bvec = vec->bvec; + unsigned res_idx = 0; + size_t total_len = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; + size_t iov_len = iovec[iov_idx].iov_len; + struct bvec_iter bi = { + .bi_size = offset + iov_len, + }; + struct bio_vec bv; + + bvec_iter_advance(src_bvec, &bi, offset); + for_each_mp_bvec(bv, src_bvec, bi, bi) + res_bvec[res_idx++] = bv; + total_len += iov_len; + } + iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); + return 0; +} + +static int iov_kern_bvec_size(const struct iovec *iov, + const struct io_mapped_ubuf *imu, + unsigned int *nr_seg) +{ + size_t offset = (size_t)(uintptr_t)iov->iov_base; + const struct bio_vec *bvec = imu->bvec; + int start = 0, i = 0; + size_t off = 0; + int ret; + + ret = validate_fixed_range(offset, iov->iov_len, imu); + if (unlikely(ret)) + return ret; + + for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; + off += bvec[i].bv_len, i++) { + if (offset >= off && offset < off + bvec[i].bv_len) + start = i; + } + *nr_seg = i - start; + return 0; +} + +static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu, unsigned *nr_segs) +{ + unsigned max_segs = 0; + size_t total_len = 0; + unsigned i; + int ret; + + *nr_segs = 0; + for (i = 0; i < nr_iovs; i++) { + if (unlikely(!iov[i].iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov[i].iov_len, + &total_len))) + return -EOVERFLOW; + ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); + if (unlikely(ret)) + return ret; + *nr_segs += max_segs; + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + return 0; +} + int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned issue_flags) @@ -1363,14 +1452,20 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, if (!node) return -EFAULT; imu = node->buf; - if (imu->is_kbuf) - return -EOPNOTSUPP; if (!(imu->dir & (1 << ddir))) return -EFAULT; iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off; - nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (imu->is_kbuf) { + int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); + + if (unlikely(ret)) + return ret; + } else { + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + } if (sizeof(struct bio_vec) > sizeof(struct iovec)) { size_t bvec_bytes; @@ -1397,6 +1492,9 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, req->flags |= REQ_F_NEED_CLEANUP; } + if (imu->is_kbuf) + return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f2cfc371f3d0..a9ea7d29cdd9 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -205,8 +205,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, * that it doesn't read in per-op data, play it safe and ensure that * any SQE data is stable beyond prep. This can later get relaxed. */ - memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = ac->data.sqes; + memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->sqes; return 0; } @@ -307,17 +307,18 @@ static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct io_uring_sqe *sqe = cmd->sqe; bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optlen, optname, level, err; void __user *optval; - level = READ_ONCE(cmd->sqe->level); + level = READ_ONCE(sqe->level); if (level != SOL_SOCKET) return -EOPNOTSUPP; - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); err = do_sock_getsockopt(sock, compat, level, optname, USER_SOCKPTR(optval), @@ -333,15 +334,16 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct io_uring_sqe *sqe = cmd->sqe; bool compat = !!(issue_flags & IO_URING_F_COMPAT); int optname, optlen, level; void __user *optval; sockptr_t optval_s; - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); - level = READ_ONCE(cmd->sqe->level); + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); optval_s = USER_SOCKPTR(optval); return do_sock_setsockopt(sock, compat, level, optname, optval_s, diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 14e525255854..b04686b6b5d2 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -6,6 +6,7 @@ struct io_async_cmd { struct io_uring_cmd_data data; struct iou_vec vec; + struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9c95b5b6ec4e..80d4a6f71d29 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -818,6 +818,14 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, int ret = 0; len = min_t(size_t, len, desc->count); + /* + * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even + * if desc->count is already 0. This is caused by the if (offset + 1 != + * skb->len) check. Return early in this case to break out of + * __tcp_read_sock(). + */ + if (!len) + return 0; if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) return -EAGAIN; diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 7817afe29005..c7781efea0f3 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -4,6 +4,8 @@ CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh +TEST_PROGS += test_generic_02.sh +TEST_PROGS += test_generic_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh @@ -11,8 +13,11 @@ TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_loop_05.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh +TEST_PROGS += test_stripe_03.sh +TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 05147b53c361..91c282bc7674 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -99,7 +99,7 @@ static int __ublk_ctrl_cmd(struct ublk_dev *dev, static int ublk_ctrl_stop_dev(struct ublk_dev *dev) { struct ublk_ctrl_cmd_data data = { - .cmd_op = UBLK_CMD_STOP_DEV, + .cmd_op = UBLK_U_CMD_STOP_DEV, }; return __ublk_ctrl_cmd(dev, &data); @@ -169,7 +169,7 @@ static int ublk_ctrl_get_params(struct ublk_dev *dev, struct ublk_params *params) { struct ublk_ctrl_cmd_data data = { - .cmd_op = UBLK_CMD_GET_PARAMS, + .cmd_op = UBLK_U_CMD_GET_PARAMS, .flags = CTRL_CMD_HAS_BUF, .addr = (__u64)params, .len = sizeof(*params), @@ -215,7 +215,7 @@ static void ublk_ctrl_dump(struct ublk_dev *dev) ret = ublk_ctrl_get_params(dev, &p); if (ret < 0) { - ublk_err("failed to get params %m\n"); + ublk_err("failed to get params %d %s\n", ret, strerror(-ret)); return; } @@ -322,7 +322,7 @@ static int ublk_queue_init(struct ublk_queue *q) cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); - q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, + q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ, MAP_SHARED | MAP_POPULATE, dev->fds[0], off); if (q->io_cmd_buf == MAP_FAILED) { ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index f31a5c4d4143..760ff8ffb810 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -128,7 +128,7 @@ struct ublk_queue { unsigned int io_inflight; struct ublk_dev *dev; const struct ublk_tgt_ops *tgt_ops; - char *io_cmd_buf; + struct ublksrv_io_desc *io_cmd_buf; struct io_uring ring; struct ublk_io ios[UBLK_QUEUE_DEPTH]; #define UBLKSRV_QUEUE_STOPPING (1U << 0) @@ -302,7 +302,7 @@ static inline void ublk_mark_io_done(struct ublk_io *io, int res) static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) { - return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return &q->io_cmd_buf[tag]; } static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 899875ff50fe..91fec3690d4b 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -17,7 +17,8 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->tgt.dev_size = dev_size; dev->tgt.params = (struct ublk_params) { - .types = UBLK_PARAM_TYPE_BASIC, + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN | + UBLK_PARAM_TYPE_SEGMENT, .basic = { .logical_bs_shift = 9, .physical_bs_shift = 12, @@ -26,6 +27,14 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_sectors = info->max_io_buf_bytes >> 9, .dev_sectors = dev_size >> 9, }, + .dma = { + .alignment = 4095, + }, + .seg = { + .seg_boundary_mask = 4095, + .max_segment_size = 32 << 10, + .max_segments = 32, + }, }; if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 98c564b12f3c..179731c3dd6f 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -111,43 +111,67 @@ static void calculate_stripe_array(const struct stripe_conf *conf, } } -static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) +static inline enum io_uring_op stripe_to_uring_op( + const struct ublksrv_io_desc *iod, int zc) { unsigned ublk_op = ublksrv_get_op(iod); if (ublk_op == UBLK_IO_OP_READ) - return IORING_OP_READV; + return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) - return IORING_OP_WRITEV; + return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { const struct stripe_conf *conf = get_chunk_shift(q); - enum io_uring_op op = stripe_to_uring_op(iod); + int zc = !!(ublk_queue_use_zc(q) != 0); + enum io_uring_op op = stripe_to_uring_op(iod, zc); struct io_uring_sqe *sqe[NR_STRIPE]; struct stripe_array *s = alloc_stripe_array(conf, iod); struct ublk_io *io = ublk_get_io(q, tag); - int i; + int i, extra = zc ? 2 : 0; io->private_data = s; calculate_stripe_array(conf, iod, s); - ublk_queue_alloc_sqes(q, sqe, s->nr); - for (i = 0; i < s->nr; i++) { - struct stripe *t = &s->s[i]; + ublk_queue_alloc_sqes(q, sqe, s->nr + extra); + + if (zc) { + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + } + + for (i = zc; i < s->nr + extra - zc; i++) { + struct stripe *t = &s->s[i - zc]; io_uring_prep_rw(op, sqe[i], t->seq + 1, (void *)t->vec, t->nr_vec, t->start << 9); - io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + if (zc) { + sqe[i]->buf_index = tag; + io_uring_sqe_set_flags(sqe[i], + IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK); + } else { + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + } /* bit63 marks us as tgt io */ - sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1); } - return s->nr; + if (zc) { + struct io_uring_sqe *unreg = sqe[s->nr + 1]; + + io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag); + unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1); + } + + /* register buffer is skip_success */ + return s->nr + zc; } static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) @@ -208,19 +232,27 @@ static void ublk_stripe_io_done(struct ublk_queue *q, int tag, struct ublk_io *io = ublk_get_io(q, tag); int res = cqe->res; - if (res < 0) { + if (res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { if (!io->result) io->result = res; - ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); + if (res < 0) + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); } + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + /* fail short READ/WRITE simply */ if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { unsigned seq = user_data_to_tgt_data(cqe->user_data); struct stripe_array *s = io->private_data; - if (res < s->s[seq].vec->iov_len) + if (res < s->s[seq].nr_sects << 9) { io->result = -EIO; + ublk_err("%s: short rw op %u res %d exp %u tag %u\n", + __func__, op, res, s->s[seq].vec->iov_len, tag); + } } if (ublk_completed_tgt_io(q, tag)) { @@ -253,7 +285,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) struct stripe_conf *conf; unsigned chunk_shift; loff_t bytes = 0; - int ret, i; + int ret, i, mul = 1; if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); @@ -295,8 +327,11 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; - dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; - dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) + mul = 2; + dev->tgt.sq_depth = mul * dev->dev_info.queue_depth * conf->nr_files; + dev->tgt.cq_depth = mul * dev->dev_info.queue_depth * conf->nr_files; printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 75f54ac6b1c4..a88b35943227 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -23,6 +23,12 @@ _get_disk_dev_t() { echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) )) } +_run_fio_verify_io() { + fio --name=verify --rw=randwrite --direct=1 --ioengine=libaio \ + --bs=8k --iodepth=32 --verify=crc32c --do_verify=1 \ + --verify_state_save=0 "$@" > /dev/null +} + _create_backfile() { local my_size=$1 local my_file diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh new file mode 100755 index 000000000000..3e80121e3bf5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_02" +ERR_CODE=0 + +if ! _have_program bpftrace; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "null" "sequential io order for MQ" + +dev_id=$(_add_ublk_dev -t null -q 2) +_check_add_dev $TID $? + +dev_t=$(_get_disk_dev_t "$dev_id") +bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & +btrace_pid=$! +sleep 2 + +if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# run fio over this ublk disk +fio --name=write_seq \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? +kill "$btrace_pid" +wait +if grep -q "io_out_of_order" "$UBLK_TMP"; then + cat "$UBLK_TMP" + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh new file mode 100755 index 000000000000..b551aa76cb0d --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_03" +ERR_CODE=0 + +_prep_test "null" "check dma & segment limits for zero copy" + +dev_id=$(_add_ublk_dev -t null -z) +_check_add_dev $TID $? + +sysfs_path=/sys/block/ublkb"${dev_id}" +dma_align=$(cat "$sysfs_path"/queue/dma_alignment) +max_segments=$(cat "$sysfs_path"/queue/max_segments) +max_segment_size=$(cat "$sysfs_path"/queue/max_segment_size) +if [ "$dma_align" != "4095" ]; then + ERR_CODE=255 +fi +if [ "$max_segments" != "32" ]; then + ERR_CODE=255 +fi +if [ "$max_segment_size" != "32768" ]; then + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index c882d2a08e13..1ef8b6044777 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -6,6 +6,10 @@ TID="loop_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "loop" "write and verify test" backfile_0=$(_create_backfile 256M) @@ -14,15 +18,7 @@ dev_id=$(_add_ublk_dev -t loop "$backfile_0") _check_add_dev $TID $? "${backfile_0}" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=256M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M ERR_CODE=$? _cleanup_test "loop" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index 269c96787d7d..e9ca744de8b1 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -6,6 +6,10 @@ TID="loop_03" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "loop" "write and verify over zero copy" backfile_0=$(_create_backfile 256M) @@ -13,15 +17,7 @@ dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") _check_add_dev $TID $? "$backfile_0" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=64 \ - --rw=write \ - --size=256M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M ERR_CODE=$? _cleanup_test "loop" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh new file mode 100755 index 000000000000..2e6e2e6978fc --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_05" +ERR_CODE=0 + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "loop" "write and verify test" + +backfile_0=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -q 2 -t loop "$backfile_0") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7177f6c57bc5..a8be24532b24 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -27,20 +27,20 @@ ublk_io_and_remove() _prep_test "stress" "run IO and remove device" -ublk_io_and_remove 8G -t null +ublk_io_and_remove 8G -t null -q 4 ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi BACK_FILE=$(_create_backfile 256M) -ublk_io_and_remove 256M -t loop "${BACK_FILE}" +ublk_io_and_remove 256M -t loop -q 4 "${BACK_FILE}" ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi -ublk_io_and_remove 256M -t loop -z "${BACK_FILE}" +ublk_io_and_remove 256M -t loop -q 4 -z "${BACK_FILE}" ERR_CODE=$? _cleanup_test "stress" _remove_backfile "${BACK_FILE}" diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 2a8e60579a06..2159e4cc8140 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -27,20 +27,20 @@ ublk_io_and_kill_daemon() _prep_test "stress" "run IO and kill ublk server" -ublk_io_and_kill_daemon 8G -t null +ublk_io_and_kill_daemon 8G -t null -q 4 ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi BACK_FILE=$(_create_backfile 256M) -ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}" +ublk_io_and_kill_daemon 256M -t loop -q 4 "${BACK_FILE}" ERR_CODE=$? if [ ${ERR_CODE} -ne 0 ]; then _show_result $TID $ERR_CODE fi -ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}" +ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${BACK_FILE}" ERR_CODE=$? _cleanup_test "stress" _remove_backfile "${BACK_FILE}" diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index c01f3dc325ab..7e387ef656ea 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -6,6 +6,10 @@ TID="stripe_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "stripe" "write and verify test" backfile_0=$(_create_backfile 256M) @@ -15,15 +19,7 @@ dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") _check_add_dev $TID $? "${backfile_0}" # run fio over the ublk disk -fio --name=write_and_verify \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=32 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --verify=crc32c \ - --do_verify=1 \ - --bs=4k > /dev/null 2>&1 +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M ERR_CODE=$? _cleanup_test "stripe" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh new file mode 100755 index 000000000000..c1b34af36145 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_03" +ERR_CODE=0 + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stripe" "write and verify test" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -q 2 -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE