diff options
author | Jens Axboe <axboe@kernel.dk> | 2017-11-29 09:21:50 -0700 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2017-11-29 09:21:50 -0700 |
commit | ed565371e368f014db237aacf42b27b40b1bd247 (patch) | |
tree | 3d950c90e216b27175ccfb193002c798af4f5fc2 | |
parent | 2967acbb257a6a9bf912f4778b727e00972eac9b (diff) | |
parent | 7e5dd57ef3081ff6c03908d786ed5087f6fbb7ae (diff) | |
download | linux-ed565371e368f014db237aacf42b27b40b1bd247.tar.gz |
Merge branch 'nvme-4.15' of git://git.infradead.org/nvme into for-linus
Pull NVMe fixes from Christoph:
"A few more nvme updates for 4.15. A single small PCIe fix, and a number
of patches for RDMA that are a little larger than what I'd like to see
for -rc2, but they fix important issues seen in the wild."
-rw-r--r-- | drivers/nvme/host/pci.c | 1 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 234 |
2 files changed, 119 insertions, 116 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 617374762b7c51..f5800c3c9082a6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1759,6 +1759,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), dev->host_mem_descs, dev->host_mem_descs_dma); dev->host_mem_descs = NULL; + dev->nr_host_mem_descs = 0; } static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2c597105a6bf31..37af56596be6ce 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <rdma/mr_pool.h> #include <linux/err.h> #include <linux/string.h> #include <linux/atomic.h> @@ -59,6 +60,9 @@ struct nvme_rdma_request { struct nvme_request req; struct ib_mr *mr; struct nvme_rdma_qe sqe; + union nvme_result result; + __le16 status; + refcount_t ref; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; int nents; @@ -73,11 +77,11 @@ struct nvme_rdma_request { enum nvme_rdma_queue_flags { NVME_RDMA_Q_ALLOCATED = 0, NVME_RDMA_Q_LIVE = 1, + NVME_RDMA_Q_TR_READY = 2, }; struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - atomic_t sig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -258,32 +262,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) return ret; } -static int nvme_rdma_reinit_request(void *data, struct request *rq) -{ - struct nvme_rdma_ctrl *ctrl = data; - struct nvme_rdma_device *dev = ctrl->device; - struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - int ret = 0; - - if (WARN_ON_ONCE(!req->mr)) - return 0; - - ib_dereg_mr(req->mr); - - req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, - ctrl->max_fr_pages); - if (IS_ERR(req->mr)) { - ret = PTR_ERR(req->mr); - req->mr = NULL; - goto out; - } - - req->mr->need_inval = false; - -out: - return ret; -} - static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { @@ -293,9 +271,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; - if (req->mr) - ib_dereg_mr(req->mr); - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); } @@ -317,21 +292,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, if (ret) return ret; - req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, - ctrl->max_fr_pages); - if (IS_ERR(req->mr)) { - ret = PTR_ERR(req->mr); - goto out_free_qe; - } - req->queue = queue; return 0; - -out_free_qe: - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - return -ENOMEM; } static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -428,10 +391,23 @@ out_err: static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; + struct nvme_rdma_device *dev; + struct ib_device *ibdev; - rdma_destroy_qp(queue->cm_id); + if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) + return; + + dev = queue->device; + ibdev = dev->dev; + + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); + + /* + * The cm_id object might have been destroyed during RDMA connection + * establishment error flow to avoid getting other cma events, thus + * the destruction of the QP shouldn't use rdma_cm API. + */ + ib_destroy_qp(queue->qp); ib_free_cq(queue->ib_cq); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, @@ -440,6 +416,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) +{ + return min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ibdev->attrs.max_fast_reg_page_list_len); +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; @@ -482,8 +464,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) goto out_destroy_qp; } + ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, + queue->queue_size, + IB_MR_TYPE_MEM_REG, + nvme_rdma_get_max_fr_pages(ibdev)); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize MR pool sized %d for QID %d\n", + queue->queue_size, idx); + goto out_destroy_ring; + } + + set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); + return 0; +out_destroy_ring: + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: @@ -510,7 +508,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; - atomic_set(&queue->sig_count, 0); queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -546,6 +543,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, out_destroy_cm_id: rdma_destroy_id(queue->cm_id); + nvme_rdma_destroy_queue_ib(queue); return ret; } @@ -756,8 +754,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->device = ctrl->queues[0].device; - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ctrl->device->dev->attrs.max_fast_reg_page_list_len); + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); if (new) { ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); @@ -771,10 +768,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, error = PTR_ERR(ctrl->ctrl.admin_q); goto out_free_tagset; } - } else { - error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); - if (error) - goto out_free_queue; } error = nvme_rdma_start_queue(ctrl, 0); @@ -854,10 +847,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } } else { - ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); - if (ret) - goto out_free_io_queues; - blk_mq_update_nr_hw_queues(&ctrl->tag_set, ctrl->ctrl.queue_count - 1); } @@ -1018,8 +1007,18 @@ static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) { - if (unlikely(wc->status != IB_WC_SUCCESS)) + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + struct request *rq = blk_mq_rq_from_pdu(req); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); + return; + } + + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); + } static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, @@ -1030,7 +1029,7 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, .opcode = IB_WR_LOCAL_INV, .next = NULL, .num_sge = 0, - .send_flags = 0, + .send_flags = IB_SEND_SIGNALED, .ex.invalidate_rkey = req->mr->rkey, }; @@ -1044,22 +1043,15 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_rdma_ctrl *ctrl = queue->ctrl; struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - int res; if (!blk_rq_bytes(rq)) return; - if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) { - res = nvme_rdma_inv_rkey(queue, req); - if (unlikely(res < 0)) { - dev_err(ctrl->ctrl.device, - "Queueing INV WR for rkey %#x failed (%d)\n", - req->mr->rkey, res); - nvme_rdma_error_recovery(queue->ctrl); - } + if (req->mr) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; } ib_dma_unmap_sg(ibdev, req->sg_table.sgl, @@ -1118,12 +1110,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; int nr; + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + /* * Align the MR to a 4K page size to match the ctrl page size and * the block virtual boundary. */ nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); if (unlikely(nr < count)) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; if (nr < 0) return nr; return -EINVAL; @@ -1142,8 +1140,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; - req->mr->need_inval = true; - sg->addr = cpu_to_le64(req->mr->iova); put_unaligned_le24(req->mr->length, sg->length); put_unaligned_le32(req->mr->rkey, sg->key); @@ -1163,7 +1159,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->num_sge = 1; req->inline_data = false; - req->mr->need_inval = false; + refcount_set(&req->ref, 2); /* send and recv completions */ c->common.flags |= NVME_CMD_SGL_METABUF; @@ -1200,25 +1196,24 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { - if (unlikely(wc->status != IB_WC_SUCCESS)) - nvme_rdma_wr_error(cq, wc, "SEND"); -} + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_request *req = + container_of(qe, struct nvme_rdma_request, sqe); + struct request *rq = blk_mq_rq_from_pdu(req); -/* - * We want to signal completion at least every queue depth/2. This returns the - * largest power of two that is not above half of (queue size + 1) to optimize - * (avoid divisions). - */ -static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) -{ - int limit = 1 << ilog2((queue->queue_size + 1) / 2); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "SEND"); + return; + } - return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, - struct ib_send_wr *first, bool flush) + struct ib_send_wr *first) { struct ib_send_wr wr, *bad_wr; int ret; @@ -1227,31 +1222,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, sge->length = sizeof(struct nvme_command), sge->lkey = queue->device->pd->local_dma_lkey; - qe->cqe.done = nvme_rdma_send_done; - wr.next = NULL; wr.wr_cqe = &qe->cqe; wr.sg_list = sge; wr.num_sge = num_sge; wr.opcode = IB_WR_SEND; - wr.send_flags = 0; - - /* - * Unsignalled send completions are another giant desaster in the - * IB Verbs spec: If we don't regularly post signalled sends - * the send queue will fill up and only a QP reset will rescue us. - * Would have been way to obvious to handle this in hardware or - * at least the RDMA stack.. - * - * Always signal the flushes. The magic request used for the flush - * sequencer is not allocated in our driver's tagset and it's - * triggered to be freed by blk_cleanup_queue(). So we need to - * always mark it as signaled to ensure that the "wr_cqe", which is - * embedded in request's payload, is not freed when __ib_process_cq() - * calls wr_cqe->done(). - */ - if (nvme_rdma_queue_sig_limit(queue) || flush) - wr.send_flags |= IB_SEND_SIGNALED; + wr.send_flags = IB_SEND_SIGNALED; if (first) first->next = ≀ @@ -1301,6 +1277,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) return queue->ctrl->tag_set.tags[queue_idx - 1]; } +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "ASYNC"); +} + static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); @@ -1319,10 +1301,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); + sqe->cqe.done = nvme_rdma_async_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); - ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); WARN_ON_ONCE(ret); } @@ -1343,14 +1327,34 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, } req = blk_mq_rq_to_pdu(rq); - if (rq->tag == tag) - ret = 1; + req->status = cqe->status; + req->result = cqe->result; + + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) { + dev_err(queue->ctrl->ctrl.device, + "Bogus remote invalidation for rkey %#x\n", + req->mr->rkey); + nvme_rdma_error_recovery(queue->ctrl); + } + } else if (req->mr) { + ret = nvme_rdma_inv_rkey(queue, req); + if (unlikely(ret < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, ret); + nvme_rdma_error_recovery(queue->ctrl); + } + /* the local invalidation completion will end the request */ + return 0; + } - if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && - wc->ex.invalidate_rkey == req->mr->rkey) - req->mr->need_inval = false; + if (refcount_dec_and_test(&req->ref)) { + if (rq->tag == tag) + ret = 1; + nvme_end_request(rq, req->status, req->result); + } - nvme_end_request(rq, cqe->status, cqe->result); return ret; } @@ -1607,7 +1611,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = sqe->data; - bool flush = false; struct ib_device *dev; blk_status_t ret; int err; @@ -1636,13 +1639,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, goto err; } + sqe->cqe.done = nvme_rdma_send_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - if (req_op(rq) == REQ_OP_FLUSH) - flush = true; err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, - req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); + req->mr ? &req->reg_wr.wr : NULL); if (unlikely(err)) { nvme_rdma_unmap_data(queue, rq); goto err; @@ -1790,7 +1793,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, - .reinit_request = nvme_rdma_reinit_request, }; static inline bool |