From e3e68311ead15d8be61e8e1a8d2f0d1773a7ba9c Mon Sep 17 00:00:00 2001
From: Nitesh Shetty
Date: Tue, 1 Apr 2025 10:13:47 +0530
Subject: [PATCH 1/9] block: remove unused nseg parameter

We are no longer using nr_segs, after blk_mq_attempt_bio_merge was
moved out of blk_mq_get_new_requests().

Signed-off-by: Nitesh Shetty
Link: https://lore.kernel.org/r/20250401044348.15588-1-nj.shetty@samsung.com
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ae8494d88897..0cfd1a149f64 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2965,8 +2965,7 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
 
 static struct request *blk_mq_get_new_requests(struct request_queue *q,
 					       struct blk_plug *plug,
-					       struct bio *bio,
-					       unsigned int nsegs)
+					       struct bio *bio)
 {
 	struct blk_mq_alloc_data data = {
 		.q		= q,
@@ -3125,7 +3124,7 @@ new_request:
 	if (rq) {
 		blk_mq_use_cached_rq(rq, plug, bio);
 	} else {
-		rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+		rq = blk_mq_get_new_requests(q, plug, bio);
 		if (unlikely(!rq)) {
 			if (bio->bi_opf & REQ_NOWAIT)
 				bio_wouldblock_error(bio);

From ea7789c1541084a3dae65ffd36778348dd98f61b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 13 Mar 2025 17:22:18 +0900
Subject: [PATCH 2/9] nvmet: pci-epf: Keep completion queues mapped

Instead of mapping and unmapping the completion queue memory to the
host PCI address space whenever nvmet_pci_epf_cq_work() is called, map
a completion queue to the host PCI address space when the completion
queue is created with nvmet_pci_epf_create_cq() and unmap it when the
completion queue is deleted with nvmet_pci_epf_delete_cq().

This removes the completion queue mapping/unmapping from
nvmet_pci_epf_cq_work() and significantly increases performance. For a
single job 4K random read QD=1 workload, IOPS increases from 23 KIOPS
to 25 KIOPS. A significant throughput increase can also be seen with
high queue depth and large I/O workloads.

Since the functions nvmet_pci_epf_map_queue() and
nvmet_pci_epf_unmap_queue() are called only from
nvmet_pci_epf_create_cq() and nvmet_pci_epf_delete_cq() respectively,
remove these functions and open-code them in their call sites.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/target/pci-epf.c | 63 ++++++++++++++---------------------
 1 file changed, 25 insertions(+), 38 deletions(-)

diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index b54b3fdbe389..51c27b32248d 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -1264,6 +1264,7 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
 	struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
 	struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
 	u16 status;
+	int ret;
 
 	if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
 		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
@@ -1298,6 +1299,24 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
 	if (status != NVME_SC_SUCCESS)
 		goto err;
 
+	/*
+	 * Map the CQ PCI address space and since PCI endpoint controllers may
+	 * return a partial mapping, check that the mapping is large enough.
+	 */
+	ret = nvmet_pci_epf_mem_map(ctrl->nvme_epf, cq->pci_addr, cq->pci_size,
+				    &cq->pci_map);
+	if (ret) {
+		dev_err(ctrl->dev, "Failed to map CQ %u (err=%d)\n",
+			cq->qid, ret);
+		goto err_internal;
+	}
+
+	if (cq->pci_map.pci_size < cq->pci_size) {
+		dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n",
+			cq->qid);
+		goto err_unmap_queue;
+	}
+
 	set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
 
 	dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
@@ -1305,6 +1324,10 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
 
 	return NVME_SC_SUCCESS;
 
+err_unmap_queue:
+	nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
+err_internal:
+	status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
 err:
 	if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
 		nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
@@ -1322,6 +1345,7 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
 	cancel_delayed_work_sync(&cq->work);
 	nvmet_pci_epf_drain_queue(cq);
 	nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
+	nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
 
 	return NVME_SC_SUCCESS;
 }
@@ -1553,36 +1577,6 @@ static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl)
 	ctrl->cq = NULL;
 }
 
-static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl,
-				   struct nvmet_pci_epf_queue *queue)
-{
-	struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
-	int ret;
-
-	ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr,
-				    queue->pci_size, &queue->pci_map);
-	if (ret) {
-		dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n",
-			queue->qid, ret);
-		return ret;
-	}
-
-	if (queue->pci_map.pci_size < queue->pci_size) {
-		dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n",
-			queue->qid);
-		nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl,
-					     struct nvmet_pci_epf_queue *queue)
-{
-	nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map);
-}
-
 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
 {
 	struct nvmet_pci_epf_iod *iod =
@@ -1746,11 +1740,7 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work)
 	struct nvme_completion *cqe;
 	struct nvmet_pci_epf_iod *iod;
 	unsigned long flags;
-	int ret, n = 0;
-
-	ret = nvmet_pci_epf_map_queue(ctrl, cq);
-	if (ret)
-		goto again;
+	int ret = 0, n = 0;
 
 	while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) {
 
@@ -1797,8 +1787,6 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work)
 		n++;
 	}
 
-	nvmet_pci_epf_unmap_queue(ctrl, cq);
-
 	/*
 	 * We do not support precise IRQ coalescing time (100ns units as per
 	 * NVMe specifications). So if we have posted completion entries without
@@ -1807,7 +1795,6 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work)
 	if (n)
 		nvmet_pci_epf_raise_irq(ctrl, cq, true);
 
-again:
 	if (ret < 0)
 		queue_delayed_work(system_highpri_wq, &cq->work,
 				   NVMET_PCI_EPF_CQ_RETRY_INTERVAL);

From eada75467fca0b016b9b22212637c07216135c20 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Fri, 28 Mar 2025 09:46:45 -0600
Subject: [PATCH 3/9] nvme/ioctl: don't warn on vectorized uring_cmd with fixed buffer

The vectorized io_uring NVMe passthru opcodes don't yet support fixed
buffers. But since userspace can trigger this condition based on the
io_uring SQE parameters, it shouldn't cause a kernel warning.
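As an illustration only (a hedged userspace sketch, not part of this patch;
the helper name, the nvme_ns_fd parameter, and the surrounding ring setup
and buffer registration are assumed), the SQE combination that reaches this
path looks roughly like:

    #include <string.h>
    #include <liburing.h>
    #include <linux/nvme_ioctl.h>

    /*
     * Sketch: queue a vectorized NVMe passthru command that also asks for a
     * fixed buffer. Assumes a ring created with IORING_SETUP_SQE128 and a
     * buffer table registered with io_uring_register_buffers(); nvme_ns_fd
     * is the NVMe generic char device (e.g. /dev/ng0n1).
     */
    static void queue_vec_fixed_passthru(struct io_uring *ring, int nvme_ns_fd)
    {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
            struct nvme_uring_cmd *cmd;

            memset(sqe, 0, 2 * sizeof(*sqe));       /* full 128-byte SQE */
            cmd = (struct nvme_uring_cmd *)sqe->cmd;

            sqe->opcode = IORING_OP_URING_CMD;
            sqe->fd = nvme_ns_fd;
            sqe->cmd_op = NVME_URING_CMD_IO_VEC;            /* vectorized passthru */
            sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;  /* fixed buffer */

            cmd->opcode = 0x02;     /* NVMe read */
            /* nsid, addr (pointing at the iovec array), data_len, etc. are
             * filled in as for any vectorized passthru command */
    }

With this change the kernel simply completes such an SQE with -EINVAL in the
CQE result instead of tripping WARN_ON_ONCE(), so userspace can detect the
unsupported combination and resubmit without the fixed-buffer flag.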
Signed-off-by: Caleb Sander Mateos
Reviewed-by: Jens Axboe
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Christoph Hellwig
Fixes: 23fd22e55b76 ("nvme: wire up fixed buffer support for nvme passthrough")
Signed-off-by: Keith Busch
---
 drivers/nvme/host/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index ecf136489044..f81748096841 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -142,7 +142,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		struct iov_iter iter;
 
 		/* fixedbufs is only for non-vectored io */
-		if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) {
+		if (flags & NVME_IOCTL_VEC) {
 			ret = -EINVAL;
 			goto out;
 		}

From cd683de63e1d7ce3c2bb9c285f7885e96c98af15 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Fri, 28 Mar 2025 09:46:46 -0600
Subject: [PATCH 4/9] nvme/ioctl: move blk_mq_free_request() out of nvme_map_user_request()

The callers of nvme_map_user_request() (nvme_submit_user_cmd() and
nvme_uring_cmd_io()) allocate the request, so have them free it if
nvme_map_user_request() fails.

Signed-off-by: Caleb Sander Mateos
Reviewed-by: Jens Axboe
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/ioctl.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index f81748096841..809910da8c6e 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -129,10 +129,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 	if (!nvme_ctrl_sgl_supported(ctrl))
 		dev_warn_once(ctrl->device, "using unchecked data buffer\n");
 	if (has_metadata) {
-		if (!supports_metadata) {
-			ret = -EINVAL;
-			goto out;
-		}
+		if (!supports_metadata)
+			return -EINVAL;
+
 		if (!nvme_ctrl_meta_sgl_supported(ctrl))
 			dev_warn_once(ctrl->device,
 				      "using unchecked metadata buffer\n");
@@ -142,15 +141,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		struct iov_iter iter;
 
 		/* fixedbufs is only for non-vectored io */
-		if (flags & NVME_IOCTL_VEC) {
-			ret = -EINVAL;
-			goto out;
-		}
+		if (flags & NVME_IOCTL_VEC)
+			return -EINVAL;
+
 		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
 				rq_data_dir(req), &iter, ioucmd,
 				iou_issue_flags);
 		if (ret < 0)
-			goto out;
+			return ret;
 		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
 	} else {
 		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
@@ -159,7 +157,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 	}
 
 	if (ret)
-		goto out;
+		return ret;
 
 	bio = req->bio;
 	if (bdev)
@@ -176,8 +174,6 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 
 out_unmap:
 	if (bio)
 		blk_rq_unmap_user(bio);
-out:
-	blk_mq_free_request(req);
 	return ret;
 }
 
@@ -202,7 +198,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
 				meta_len, NULL, flags, 0);
 		if (ret)
-			return ret;
+			goto out_free_req;
 	}
 
 	bio = req->bio;
@@ -218,7 +214,10 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 
 	if (effects)
 		nvme_passthru_end(ctrl, ns, effects, cmd, ret);
 
+	return ret;
+out_free_req:
+	blk_mq_free_request(req);
 	return ret;
 }
 
@@ -521,7 +520,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			d.data_len, nvme_to_user_ptr(d.metadata),
 			d.metadata_len, ioucmd, vec, issue_flags);
 		if (ret)
-			return ret;
+			goto out_free_req;
 	}
 
 	/* to free bio on completion, as req->bio will be null at that time */
@@ -531,6 +530,10 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	req->end_io = nvme_uring_cmd_end_io;
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
+
+out_free_req:
+	blk_mq_free_request(req);
+	return ret;
 }
 
 static bool is_ctrl_ioctl(unsigned int cmd)

From 38808af53c6e72935d895182d69d63a5149da2ab Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Fri, 28 Mar 2025 09:46:47 -0600
Subject: [PATCH 5/9] nvme/ioctl: move fixed buffer lookup to nvme_uring_cmd_io()

nvme_map_user_request() is called from both nvme_submit_user_cmd() and
nvme_uring_cmd_io(). But the ioucmd branch is only applicable to
nvme_uring_cmd_io(). Move it to nvme_uring_cmd_io() and just pass the
resulting iov_iter to nvme_map_user_request().

For NVMe passthru operations with fixed buffers, the fixed buffer lookup
happens in io_uring_cmd_import_fixed(). But nvme_uring_cmd_io() can
return -EAGAIN first from nvme_alloc_user_request() if all tags in the
tag set are in use. This ordering difference is observable when using
UBLK_U_IO_{,UN}REGISTER_IO_BUF SQEs to modify the fixed buffer table. If
the NVMe passthru operation is followed by UBLK_U_IO_UNREGISTER_IO_BUF
to unregister the fixed buffer and the NVMe passthru goes async, the
fixed buffer lookup will fail because it happens after the unregister.

Userspace should not depend on the order in which io_uring issues SQEs
submitted in parallel, but it may try submitting the SQEs together and
fall back on a slow path if the fixed buffer lookup fails. To make the
fast path more likely, do the import before nvme_alloc_user_request().

Signed-off-by: Caleb Sander Mateos
Reviewed-by: Jens Axboe
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/ioctl.c | 45 +++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 809910da8c6e..ca86d3bf7ea4 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -114,8 +114,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		struct io_uring_cmd *ioucmd, unsigned int flags,
-		unsigned int iou_issue_flags)
+		struct iov_iter *iter, unsigned int flags)
 {
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
@@ -137,24 +136,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 				"using unchecked metadata buffer\n");
 	}
 
-	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
-		struct iov_iter iter;
-
-		/* fixedbufs is only for non-vectored io */
-		if (flags & NVME_IOCTL_VEC)
-			return -EINVAL;
-
-		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
-				rq_data_dir(req), &iter, ioucmd,
-				iou_issue_flags);
-		if (ret < 0)
-			return ret;
-		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
-	} else {
+	if (iter)
+		ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL);
+	else
 		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
 			bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 0,
 			rq_data_dir(req));
-	}
 
 	if (ret)
 		return ret;
@@ -196,7 +183,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	req->timeout = timeout;
 	if (ubuffer && bufflen) {
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
-				meta_len, NULL, flags, 0);
+				meta_len, NULL, flags);
 		if (ret)
 			goto out_free_req;
 	}
@@ -468,6 +455,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_uring_data d;
 	struct nvme_command c;
+	struct iov_iter iter;
+	struct iov_iter *map_iter = NULL;
 	struct request *req;
 	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
 	blk_mq_req_flags_t blk_flags = 0;
@@ -503,6 +492,20 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	d.metadata_len = READ_ONCE(cmd->metadata_len);
 	d.timeout_ms = READ_ONCE(cmd->timeout_ms);
 
+	if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
+		/* fixedbufs is only for non-vectored io */
+		if (vec)
+			return -EINVAL;
+
+		ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
+			nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd,
+			issue_flags);
+		if (ret < 0)
+			return ret;
+
+		map_iter = &iter;
+	}
+
 	if (issue_flags & IO_URING_F_NONBLOCK) {
 		rq_flags |= REQ_NOWAIT;
 		blk_flags = BLK_MQ_REQ_NOWAIT;
@@ -516,9 +519,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0;
 
 	if (d.data_len) {
-		ret = nvme_map_user_request(req, d.addr,
-			d.data_len, nvme_to_user_ptr(d.metadata),
-			d.metadata_len, ioucmd, vec, issue_flags);
+		ret = nvme_map_user_request(req, d.addr, d.data_len,
+			nvme_to_user_ptr(d.metadata), d.metadata_len,
+			map_iter, vec);
 		if (ret)
 			goto out_free_req;
 	}

From 47f8cc9e32c37343e18992c16ed2c7a9ac9ccd63 Mon Sep 17 00:00:00 2001
From: John Meneghini
Date: Sat, 22 Mar 2025 19:28:48 -0400
Subject: [PATCH 6/9] nvme: update the multipath warning in nvme_init_ns_head

The new NVME_MULTIPATH_PARAM config option requires updates to the
warning message in nvme_init_ns_head().

Signed-off-by: John Meneghini
Tested-by: John Meneghini
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 777db89fdaa7..5ffc8f23a174 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3822,7 +3822,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 			"Found shared namespace %d, but multipathing not supported.\n",
 			info->nsid);
 		dev_warn_once(ctrl->device,
-			"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
+			"Shared namespace support requires core_nvme.multipath=Y.\n");
 	}
 }
 

From 32c928142c4f2db5f9c11aa1fcc4de49d3f27fcd Mon Sep 17 00:00:00 2001
From: John Meneghini
Date: Sat, 22 Mar 2025 19:28:46 -0400
Subject: [PATCH 7/9] nvme-multipath: change the NVME_MULTIPATH config option

Fix up the NVME_MULTIPATH config description so that it accurately
describes what it does.

Signed-off-by: John Meneghini
Tested-by: John Meneghini
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/Kconfig | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 10e453b2436e..d47dfa80fb95 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -18,10 +18,15 @@ config NVME_MULTIPATH
 	bool "NVMe multipath support"
 	depends on NVME_CORE
 	help
-	  This option enables support for multipath access to NVMe
-	  subsystems. If this option is enabled only a single
-	  /dev/nvmeXnY device will show up for each NVMe namespace,
-	  even if it is accessible through multiple controllers.
+	  This option controls support for multipath access to NVMe
+	  subsystems. If this option is enabled support for NVMe multipath
+	  access is included in the kernel. If this option is disabled support
+	  for NVMe multipath access is excluded from the kernel. When this
+	  option is disabled each controller/namespace receives its
+	  own /dev/nvmeXnY device entry and NVMe multipath access is
+	  not supported.
+
+	  If unsure, say Y.
 
 config NVME_VERBOSE_ERRORS
 	bool "NVMe verbose error reporting"

From 288ff0d10beb069355036355d5f7612579dc869c Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi
Date: Mon, 31 Mar 2025 18:28:59 +0200
Subject: [PATCH 8/9] nvme-pci: skip nvme_write_sq_db on empty rqlist

nvme_submit_cmds() should check the rqlist before calling
nvme_write_sq_db(); if the list is empty, it must return immediately.

Fixes: beadf0088501 ("nvme-pci: reverse request order in nvme_queue_rqs")
Signed-off-by: Maurizio Lombardi
Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2883d17ee1eb..b178d52eac1b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -986,6 +986,9 @@ static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
 {
 	struct request *req;
 
+	if (rq_list_empty(rqlist))
+		return;
+
 	spin_lock(&nvmeq->sq_lock);
 	while ((req = rq_list_pop(rqlist))) {
 		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

From 01b91bf14f6d4893e03e357006e7af3a20c03fee Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 3 Apr 2025 18:54:02 +0800
Subject: [PATCH 9/9] block: don't grab elevator lock during queue initialization

->elevator_lock depends on the queue freeze lock (see block/blk-sysfs.c),
and the queue freeze lock depends on fs_reclaim. So don't grab the
elevator lock during queue initialization, which needs to call
kmalloc(GFP_KERNEL); this cuts the dependency between ->elevator_lock
and fs_reclaim and silences the lockdep warning.

This is safe because the elevator setup cannot run yet during queue
initialization.

There is no such issue in __blk_mq_update_nr_hw_queues() because
memalloc_noio_save() is called before acquiring the elevator lock.
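For illustration only, a simplified sketch (not the actual kernel code; the
helper name is hypothetical) of the noio scoping pattern that keeps the
update path safe:

    /* #include <linux/sched/mm.h> for memalloc_noio_save()/restore() */
    static void update_hw_ctxs_locked(struct blk_mq_tag_set *set,
                                      struct request_queue *q)
    {
            unsigned int memflags;

            /*
             * Entering a noio scope before taking ->elevator_lock means any
             * GFP_KERNEL allocation made while the lock is held is treated
             * as GFP_NOIO, so lockdep does not record an
             * ->elevator_lock -> fs_reclaim dependency on this path.
             */
            memflags = memalloc_noio_save();
            mutex_lock(&q->elevator_lock);
            __blk_mq_realloc_hw_ctxs(set, q);   /* may kmalloc(GFP_KERNEL) */
            mutex_unlock(&q->elevator_lock);
            memalloc_noio_restore(memflags);
    }

During queue initialization there is no elevator to protect yet, so the
patch below simply skips taking the lock on that path instead.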
Fixes the following lockdep warning:

https://lore.kernel.org/linux-block/67e6b425.050a0220.2f068f.007b.GAE@google.com/

Reported-by: syzbot+4c7e0f9b94ad65811efb@syzkaller.appspotmail.com
Cc: Nilay Shroff
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20250403105402.1334206-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0cfd1a149f64..c2697db59109 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4464,14 +4464,12 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 	return NULL;
 }
 
-static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
-				   struct request_queue *q)
+static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+				     struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i, j;
 
-	/* protect against switching io scheduler */
-	mutex_lock(&q->elevator_lock);
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int old_node;
 		int node = blk_mq_get_hctx_node(set, i);
@@ -4504,7 +4502,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	xa_for_each_start(&q->hctx_table, j, hctx, j)
 		blk_mq_exit_hctx(q, set, hctx, j);
-	mutex_unlock(&q->elevator_lock);
+}
+
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+				   struct request_queue *q, bool lock)
+{
+	if (lock) {
+		/* protect against switching io scheduler */
+		mutex_lock(&q->elevator_lock);
+		__blk_mq_realloc_hw_ctxs(set, q);
+		mutex_unlock(&q->elevator_lock);
+	} else {
+		__blk_mq_realloc_hw_ctxs(set, q);
+	}
 
 	/* unregister cpuhp callbacks for exited hctxs */
 	blk_mq_remove_hw_queues_cpuhp(q);
@@ -4536,7 +4546,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	xa_init(&q->hctx_table);
 
-	blk_mq_realloc_hw_ctxs(set, q);
+	blk_mq_realloc_hw_ctxs(set, q, false);
 	if (!q->nr_hw_queues)
 		goto err_hctxs;
 
@@ -5032,7 +5042,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 fallback:
 	blk_mq_update_queue_map(set);
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
-		blk_mq_realloc_hw_ctxs(set, q);
+		blk_mq_realloc_hw_ctxs(set, q, true);
 
 		if (q->nr_hw_queues != set->nr_hw_queues) {
 			int i = prev_nr_hw_queues;