From: Keith Busch <kbusch@xxxxxxxxxx> Aligns data and metadata to the similar dma mapping scheme and removes one more user of the scatter-gather dma mapping. Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- drivers/nvme/host/pci.c | 170 ++++++++++++++++++++++------------------ 1 file changed, 94 insertions(+), 76 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2b89e74d1fa73..3c9a53ba959c7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -172,9 +172,7 @@ struct nvme_dev { u32 last_ps; bool hmb; struct sg_table *hmb_sgt; - mempool_t *dmavec_mempool; - mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -264,6 +262,12 @@ enum nvme_iod_flags { /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ IOD_P2P_BUS_ADDR = 1U << 3, + + /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_META_P2P_BUS_ADDR = 1U << 4, + + /* Metadata using non-coalesced MPTR */ + IOD_SINGLE_META_SEGMENT = 1U << 5, }; struct nvme_dma_vec { @@ -287,7 +291,8 @@ struct nvme_iod { unsigned int nr_dma_vecs; dma_addr_t meta_dma; - struct sg_table meta_sgt; + unsigned int meta_total_len; + struct dma_iova_state meta_dma_state; struct nvme_sgl_desc *meta_descriptor; }; @@ -644,6 +649,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } +static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd) +{ + return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG; +} + static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) { return cmd->common.flags & @@ -713,6 +723,43 @@ static void __nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, le32_to_cpu(sg_list[i].length), dir); } +static void nvme_free_meta_sgls(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_sgl_desc *sge = iod->meta_descriptor; + + __nvme_free_sgls(req, sge, &sge[1]); +} + +static void nvme_unmap_metadata(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum dma_data_direction dir = rq_dma_dir(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct device *dma_dev = nvmeq->dev->dev; + + if (iod->flags & IOD_SINGLE_META_SEGMENT) { + dma_unmap_page(dma_dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, + rq_dma_dir(req)); + return; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, + iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) + nvme_free_meta_sgls(req); + else + dma_unmap_page(dma_dev, iod->meta_dma, + iod->meta_total_len, dir); + } + + if (iod->meta_descriptor) + dma_pool_free(nvmeq->descriptor_pools.small, + iod->meta_descriptor, iod->meta_dma); +} + static void nvme_free_sgls(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1022,70 +1069,72 @@ static blk_status_t nvme_map_data(struct request *req) return nvme_pci_setup_data_prp(req, &iter); } -static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, - struct scatterlist *sg) -{ - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); - sge->type = NVME_SGL_FMT_DATA_DESC << 4; -} - static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; + unsigned int entries = req->nr_integrity_segments; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = nvmeq->dev; struct nvme_sgl_desc *sg_list; - struct scatterlist *sgl, *sg; - unsigned int entries; + struct blk_dma_iter iter; dma_addr_t sgl_dma; - int rc, i; + int i = 0; - iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); - if (!iod->meta_sgt.sgl) - return BLK_STS_RESOURCE; + if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev, + &iod->meta_dma_state, &iter)) + return iter.status; - sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); - iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, - iod->meta_sgt.sgl); - if (!iod->meta_sgt.orig_nents) - goto out_free_sg; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_META_P2P_BUS_ADDR; + else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + entries = 1; - rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) - goto out_free_sg; + /* + * The NVMe MPTR descriptor has an implicit length that the host and + * device must agree on to avoid data/memory corruption. We trust the + * kernel allocated correctly based on the format's parameters, so use + * the more efficient MPTR to avoid extra dma pool allocations for the + * SGL indirection. + * + * But for user commands, we don't necessarily know what they do, so + * the driver can't validate the metadata buffer size. The SGL + * descriptor provides an explicit length, so we're relying on that + * mechanism to catch any misunderstandings between the application and + * device. + */ + if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) { + iod->cmd.common.metadata = cpu_to_le64(iter.addr); + iod->meta_total_len = iter.len; + iod->meta_dma = iter.addr; + iod->meta_descriptor = NULL; + return BLK_STS_OK; + } sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, &sgl_dma); if (!sg_list) - goto out_unmap_sg; + return BLK_STS_RESOURCE; - entries = iod->meta_sgt.nents; iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - iod->cmd.common.flags = NVME_CMD_SGL_METASEG; iod->cmd.common.metadata = cpu_to_le64(sgl_dma); - - sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data_sg(sg_list, sgl); + iod->meta_total_len = iter.len; + nvme_pci_sgl_set_data(sg_list, &iter); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); - nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); - for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); - - return BLK_STS_OK; + do { + nvme_pci_sgl_set_data(&sg_list[++i], &iter); + iod->meta_total_len += iter.len; + } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter)); -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); - return BLK_STS_RESOURCE; + nvme_pci_sgl_set_seg(sg_list, sgl_dma, i); + if (unlikely(iter.status)) + nvme_unmap_metadata(req); + return iter.status; } static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) @@ -1098,6 +1147,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); + iod->flags |= IOD_SINGLE_META_SEGMENT; return BLK_STS_OK; } @@ -1119,7 +1169,7 @@ static blk_status_t nvme_prep_rq(struct request *req) iod->flags = 0; iod->nr_descriptors = 0; iod->total_len = 0; - iod->meta_sgt.nents = 0; + iod->meta_total_len = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) @@ -1230,25 +1280,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } -static __always_inline void nvme_unmap_metadata(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - - if (!iod->meta_sgt.nents) { - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req).bv_len, - rq_dma_dir(req)); - return; - } - - dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, - iod->meta_dma); - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); -} - static __always_inline void nvme_pci_unmap_rq(struct request *req) { if (blk_integrity_rq(req)) @@ -3054,7 +3085,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; dev->dmavec_mempool = mempool_create_node(1, @@ -3063,17 +3093,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->dmavec_mempool) return -ENOMEM; - - dev->iod_meta_mempool = mempool_create_node(1, - mempool_kmalloc, mempool_kfree, - (void *)meta_size, GFP_KERNEL, - dev_to_node(dev->dev)); - if (!dev->iod_meta_mempool) - goto free; return 0; -free: - mempool_destroy(dev->dmavec_mempool); - return -ENOMEM; } static void nvme_free_tagset(struct nvme_dev *dev) @@ -3523,7 +3543,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_free_queues(dev, 0); out_release_iod_mempool: mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3587,7 +3606,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); -- 2.47.3