author    Jason Gunthorpe <jgg@nvidia.com>    2020-11-26 10:42:11 -0400
committer Leon Romanovsky <leon@kernel.org>  2024-01-08 16:14:04 +0200
commit    ee21a6fb9d9ce89e70547d6dd6e5b0b96e13389c (patch)
tree      6ab5735c7b004a1ad19aa1cb01cf25f3efcf1f12
parent    2dde18cd1d8fac735875f2e4987f11817cc0bc2c (diff)
RDMA/core: Introduce peer memory interface (gpu-v6.5)
The peer_memory_client scheme allows a driver to register with the ib_umem
system that it has the ability to understand user virtual address ranges
that are not compatible with get_user_pages(). For instance VMAs created
with io_remap_pfn_range(), or other driver special VMA.

For ranges the interface understands it can provide a DMA mapped sg_table
for use by the ib_umem, allowing user virtual ranges that cannot be
supported by get_user_pages() to be used as umems for RDMA.

This is designed to preserve the kABI, no functions or structures are
changed, only new symbols are added:

 ib_register_peer_memory_client
 ib_unregister_peer_memory_client
 ib_umem_activate_invalidation_notifier
 ib_umem_get_peer

And a bitfield in struct ib_umem uses more bits.

This interface is compatible with the two out of tree GPU drivers:

 https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
 https://github.com/Mellanox/nv_peer_memory/blob/master/nv_peer_mem.c

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-rw-r--r-- drivers/infiniband/core/Makefile      |   2
-rw-r--r-- drivers/infiniband/core/ib_peer_mem.h |  65
-rw-r--r-- drivers/infiniband/core/peer_mem.c    | 526
-rw-r--r-- drivers/infiniband/core/umem.c        |  47
-rw-r--r-- drivers/infiniband/hw/mlx5/cq.c       |  12
-rw-r--r-- drivers/infiniband/hw/mlx5/devx.c     |   3
-rw-r--r-- drivers/infiniband/hw/mlx5/doorbell.c |   5
-rw-r--r-- drivers/infiniband/hw/mlx5/mlx5_ib.h  |   2
-rw-r--r-- drivers/infiniband/hw/mlx5/mr.c       |  61
-rw-r--r-- drivers/infiniband/hw/mlx5/qp.c       |  12
-rw-r--r-- drivers/infiniband/hw/mlx5/srq.c      |   2
-rw-r--r-- include/rdma/ib_umem.h                |  29
-rw-r--r-- include/rdma/peer_mem.h               | 176
13 files changed, 912 insertions(+), 30 deletions(-)
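
For orientation, here is a minimal, hedged sketch of how an out-of-tree client might use the two new registration symbols. The example_* names are illustrative only and are not part of this patch; the real callback contracts are documented in include/rdma/peer_mem.h further down.

/* Hedged sketch: registering a hypothetical peer memory client. */
#include <linux/module.h>
#include <rdma/peer_mem.h>

/* Filled in by ib_core at registration; the client calls it on invalidation. */
static invalidate_peer_memory example_invalidate_cb;
static void *example_reg_handle;

/* Hypothetical, fully populated ops table (acquire/get_pages/dma_map/...). */
extern const struct peer_memory_client example_peer_client;

static int __init example_peer_init(void)
{
	/* Passing a non-NULL callback pointer opts this client in to invalidation. */
	example_reg_handle = ib_register_peer_memory_client(&example_peer_client,
							    &example_invalidate_cb);
	return example_reg_handle ? 0 : -EINVAL;
}

static void __exit example_peer_exit(void)
{
	/* Returns only after every umem built on this client has been destroyed. */
	ib_unregister_peer_memory_client(example_reg_handle);
}

module_init(example_peer_init);
module_exit(example_peer_exit);
MODULE_LICENSE("GPL");
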
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 8ab4eea5a0a5e..3775466a36127 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_srq.o \
uverbs_std_types_wq.o \
uverbs_std_types_qp.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o peer_mem.o
ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
new file mode 100644
index 0000000000000..988012340ce62
--- /dev/null
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
+ * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved.
+ */
+#ifndef RDMA_IB_PEER_MEM_H
+#define RDMA_IB_PEER_MEM_H
+
+#include <rdma/peer_mem.h>
+#include <linux/kobject.h>
+#include <linux/xarray.h>
+#include <rdma/ib_umem.h>
+
+struct ib_peer_memory_statistics {
+ atomic64_t num_alloc_mrs;
+ atomic64_t num_dealloc_mrs;
+ atomic64_t num_reg_pages;
+ atomic64_t num_dereg_pages;
+ atomic64_t num_reg_bytes;
+ atomic64_t num_dereg_bytes;
+ unsigned long num_free_callbacks;
+};
+
+struct ib_peer_memory_client {
+ refcount_t usecnt;
+ struct completion usecnt_zero;
+ const struct peer_memory_client *peer_mem;
+ struct list_head core_peer_list;
+ struct ib_peer_memory_statistics stats;
+ struct xarray umem_xa;
+ u32 xa_cyclic_next;
+ bool invalidation_required;
+};
+
+enum ib_umem_mapped_state {
+ UMEM_PEER_UNMAPPED,
+ UMEM_PEER_MAPPED,
+ UMEM_PEER_INVALIDATED,
+};
+
+struct ib_umem_peer {
+ struct ib_umem umem;
+ struct kref kref;
+ /* peer memory that manages this umem */
+ struct ib_peer_memory_client *ib_peer_client;
+ void *peer_client_context;
+ umem_invalidate_func_t invalidation_func;
+ void *invalidation_private;
+ struct mutex mapping_lock;
+ enum ib_umem_mapped_state mapped_state;
+ u32 xa_id;
+ struct scatterlist *first_sg;
+ dma_addr_t first_dma_address;
+ unsigned int first_dma_length;
+ unsigned int first_length;
+ struct scatterlist *last_sg;
+ unsigned int last_dma_length;
+ unsigned int last_length;
+};
+
+struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+ unsigned long peer_mem_flags);
+void ib_peer_umem_release(struct ib_umem *umem);
+
+#endif
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
new file mode 100644
index 0000000000000..51a9c9f699f67
--- /dev/null
+++ b/drivers/infiniband/core/peer_mem.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
+ * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <linux/sched/mm.h>
+#include "ib_peer_mem.h"
+
+static DEFINE_MUTEX(peer_memory_mutex);
+static LIST_HEAD(peer_memory_list);
+#define PEER_NO_INVALIDATION_ID U32_MAX
+
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context);
+
+static int ib_memory_peer_check_mandatory(const struct peer_memory_client
+ *peer_client)
+{
+#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x}
+ int i;
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ PEER_MEM_MANDATORY_FUNC(acquire),
+ PEER_MEM_MANDATORY_FUNC(get_pages),
+ PEER_MEM_MANDATORY_FUNC(put_pages),
+ PEER_MEM_MANDATORY_FUNC(dma_map),
+ PEER_MEM_MANDATORY_FUNC(dma_unmap),
+ };
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **)((void *)peer_client +
+ mandatory_table[i].offset)) {
+ pr_err("Peer memory %s is missing mandatory function %s\n",
+ peer_client->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+void *
+ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
+ invalidate_peer_memory *invalidate_callback)
+{
+ struct ib_peer_memory_client *ib_peer_client;
+
+ if (ib_memory_peer_check_mandatory(peer_client))
+ return NULL;
+
+ ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
+ if (!ib_peer_client)
+ return NULL;
+ refcount_set(&ib_peer_client->usecnt, 1);
+ init_completion(&ib_peer_client->usecnt_zero);
+ ib_peer_client->peer_mem = peer_client;
+ xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC);
+
+ /*
+ * If the peer wants the invalidation_callback then all memory users
+ * linked to that peer must support invalidation.
+ */
+ if (invalidate_callback) {
+ *invalidate_callback = ib_invalidate_peer_memory;
+ ib_peer_client->invalidation_required = true;
+ }
+
+ mutex_lock(&peer_memory_mutex);
+ list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+}
+EXPORT_SYMBOL(ib_register_peer_memory_client);
+
+void ib_unregister_peer_memory_client(void *reg_handle)
+{
+ struct ib_peer_memory_client *ib_peer_client = reg_handle;
+
+ mutex_lock(&peer_memory_mutex);
+ list_del(&ib_peer_client->core_peer_list);
+ mutex_unlock(&peer_memory_mutex);
+
+ /*
+ * Wait for all umems to be destroyed before returning. Once
+ * ib_unregister_peer_memory_client() returns no umems will call any
+ * peer_mem ops.
+ */
+ if (refcount_dec_and_test(&ib_peer_client->usecnt))
+ complete(&ib_peer_client->usecnt_zero);
+ wait_for_completion(&ib_peer_client->usecnt_zero);
+}
+EXPORT_SYMBOL(ib_unregister_peer_memory_client);
+
+static struct ib_peer_memory_client *
+ib_get_peer_client(unsigned long addr, size_t size,
+ unsigned long peer_mem_flags, void **peer_client_context)
+{
+ struct ib_peer_memory_client *ib_peer_client;
+ int ret = 0;
+
+ mutex_lock(&peer_memory_mutex);
+ list_for_each_entry(ib_peer_client, &peer_memory_list,
+ core_peer_list) {
+ if (ib_peer_client->invalidation_required &&
+ (!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP)))
+ continue;
+ ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL,
+ peer_client_context);
+ if (ret > 0) {
+ refcount_inc(&ib_peer_client->usecnt);
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+ }
+ }
+ mutex_unlock(&peer_memory_mutex);
+ return NULL;
+}
+
+static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
+ void *peer_client_context)
+{
+ if (ib_peer_client->peer_mem->release)
+ ib_peer_client->peer_mem->release(peer_client_context);
+ if (refcount_dec_and_test(&ib_peer_client->usecnt))
+ complete(&ib_peer_client->usecnt_zero);
+}
+
+static void ib_peer_umem_kref_release(struct kref *kref)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(kref, struct ib_umem_peer, kref);
+
+ mutex_destroy(&umem_p->mapping_lock);
+ kfree(umem_p);
+}
+
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+ enum ib_umem_mapped_state cur_state,
+ enum ib_umem_mapped_state to_state)
+{
+ struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
+ const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
+ struct ib_umem *umem = &umem_p->umem;
+
+ if (cur_state == UMEM_PEER_MAPPED &&
+ (to_state == UMEM_PEER_UNMAPPED ||
+ to_state == UMEM_PEER_INVALIDATED)) {
+ /*
+ * In the invalidated state we will never touch the sg again,
+ * but the client might, so fix it anyhow.
+ */
+ if (umem_p->last_sg) {
+ umem_p->last_sg->length = umem_p->last_length;
+ sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
+ }
+
+ if (umem_p->first_sg) {
+ umem_p->first_sg->dma_address =
+ umem_p->first_dma_address;
+ umem_p->first_sg->length = umem_p->first_length;
+ sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+ }
+
+ if (to_state == UMEM_PEER_UNMAPPED) {
+ peer_mem->dma_unmap(&umem_p->umem.sgt_append.sgt,
+ umem_p->peer_client_context,
+ umem_p->umem.ibdev->dma_device);
+ peer_mem->put_pages(&umem_p->umem.sgt_append.sgt,
+ umem_p->peer_client_context);
+ }
+
+ memset(&umem->sgt_append, 0, sizeof(umem->sgt_append));
+ atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+ }
+
+ if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+ (cur_state == UMEM_PEER_INVALIDATED &&
+ to_state == UMEM_PEER_UNMAPPED)) {
+ atomic64_add(umem->sgt_append.sgt.nents,
+ &ib_peer_client->stats.num_dereg_pages);
+ atomic64_add(umem->length,
+ &ib_peer_client->stats.num_dereg_bytes);
+ }
+ umem_p->mapped_state = to_state;
+}
+
+/*
+ * True if the client should do unmap itself after the invalidate callback
+ * returns. Clients operating in this mode need to use this locking pattern:
+ *
+ * client_invalidate:
+ * mutex_lock(&client_lock)
+ * invalidate_callback():
+ * mutex_lock(mapping_lock)
+ * mutex_unlock(mapping_lock)
+ * client_dma_unmap()
+ * client_put_pages()
+ * mutex_unlock(&client_lock)
+ *
+ * ib_umem_stop_invalidation_notifier():
+ * mutex_lock(mapping_lock)
+ * mutex_unlock(mapping_lock)
+ * peer_mem->dma_unmap():
+ * mutex_lock(&client_lock)
+ * client_dma_unmap()
+ * mutex_unlock(&client_lock)
+ * peer_mem->put_pages():
+ * mutex_lock(&client_lock)
+ * client_put_pages()
+ * mutex_unlock(&client_lock)
+ *
+ * ib_peer_umem_release():
+ * peer_mem->release():
+ * mutex_lock(&client_lock)
+ * mutex_unlock(&client_lock)
+ *
+ * Noting that dma_unmap/put_pages can be called even though invalidate has
+ * already done the unmap, and release() can be called concurrently with
+ * invalidate. The client must protect itself against these races.
+ */
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+ const struct peer_memory_client *peer_mem =
+ umem_p->ib_peer_client->peer_mem;
+ const struct peer_memory_client_ex *peer_mem_ex;
+
+ if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+ return false;
+ peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+ client);
+ if (peer_mem_ex->ex_size <
+ offsetofend(struct peer_memory_client_ex, flags))
+ return false;
+ return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
+}
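
To make the locking pattern in the comment above concrete, a hedged client-side sketch might look like the following. struct example_mapping and the example_dma_unmap()/example_put_pages() helpers are hypothetical, and example_reg_handle/example_invalidate_cb refer to the hypothetical registration sketch near the top of this page.

/* Hedged sketch: client-side invalidate for a PEER_MEM_INVALIDATE_UNMAPS client. */
struct example_mapping {
	struct mutex client_lock;	/* the "client_lock" in the pattern above */
	u64 core_context;		/* saved from the get_pages() call */
	bool mapped;
};

static void example_client_invalidate(struct example_mapping *map)
{
	mutex_lock(&map->client_lock);
	if (map->mapped) {
		/*
		 * ib_core's callback serializes on the umem's mapping_lock,
		 * so RDMA DMA is quiesced once it returns.
		 */
		example_invalidate_cb(example_reg_handle, map->core_context);
		/* With PEER_MEM_INVALIDATE_UNMAPS the client unmaps itself. */
		example_dma_unmap(map);
		example_put_pages(map);
		map->mapped = false;
	}
	mutex_unlock(&map->client_lock);
}
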
+
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
+{
+ struct ib_peer_memory_client *ib_peer_client = reg_handle;
+ struct ib_umem_peer *umem_p;
+
+ /*
+ * The client is not required to fence against invalidation during
+ * put_pages() as that would deadlock when we call put_pages() here.
+ * Thus the core_context cannot be a umem pointer as we have no control
+ * over the lifetime. Since we won't change the kABI for this to add a
+ * proper kref, an xarray is used.
+ */
+ xa_lock(&ib_peer_client->umem_xa);
+ ib_peer_client->stats.num_free_callbacks += 1;
+ umem_p = xa_load(&ib_peer_client->umem_xa, core_context);
+ if (!umem_p)
+ goto out_unlock;
+ kref_get(&umem_p->kref);
+ xa_unlock(&ib_peer_client->umem_xa);
+
+ mutex_lock(&umem_p->mapping_lock);
+ /*
+ * For flows that require invalidation the invalidation_func should not
+ * be NULL while the device can be doing DMA. The mapping_lock ensures
+ * that the device is ready to receive an invalidation before one is
+ * triggered here.
+ */
+ if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+ umem_p->invalidation_func)
+ umem_p->invalidation_func(&umem_p->umem,
+ umem_p->invalidation_private);
+ if (ib_peer_unmap_on_invalidate(umem_p))
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_INVALIDATED);
+ else
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
+ mutex_unlock(&umem_p->mapping_lock);
+ kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+ return 0;
+
+out_unlock:
+ xa_unlock(&ib_peer_client->umem_xa);
+ return 0;
+}
+
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+ umem_invalidate_func_t func,
+ void *priv)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(umem, struct ib_umem_peer, umem);
+
+ if (WARN_ON(!umem->is_peer))
+ return;
+ if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+ return;
+
+ umem_p->invalidation_func = func;
+ umem_p->invalidation_private = priv;
+ /* Pairs with the lock in ib_peer_umem_get() */
+ mutex_unlock(&umem_p->mapping_lock);
+
+ /* At this point func can be called asynchronously */
+}
+EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
+
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(umem, struct ib_umem_peer, umem);
+ bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+ enum ib_umem_mapped_state cur_state;
+
+ if (umem_p->invalidation_func) {
+ mutex_lock(&umem_p->mapping_lock);
+ umem_p->invalidation_func = NULL;
+ } else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+ mutex_lock(&umem_p->mapping_lock);
+ } else {
+ /*
+ * Haven't called ib_umem_activate_invalidation_notifier() yet,
+ * still have the lock
+ */
+ }
+
+ if (!unmap_on_invalidate) {
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
+ } else {
+ /* Block ib_invalidate_peer_memory() */
+ cur_state = umem_p->mapped_state;
+ umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+ }
+ mutex_unlock(&umem_p->mapping_lock);
+
+ if (unmap_on_invalidate)
+ ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+
+static void fix_peer_sgls(struct ib_umem_peer *umem_p,
+ unsigned long peer_page_size)
+{
+ struct ib_umem *umem = &umem_p->umem;
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
+ if (i == 0) {
+ unsigned long offset;
+
+ umem_p->first_sg = sg;
+ umem_p->first_dma_address = sg->dma_address;
+ umem_p->first_dma_length = sg_dma_len(sg);
+ umem_p->first_length = sg->length;
+
+ offset = ALIGN_DOWN(umem->address, PAGE_SIZE) -
+ ALIGN_DOWN(umem->address, peer_page_size);
+ sg->dma_address += offset;
+ sg_dma_len(sg) -= offset;
+ sg->length -= offset;
+ }
+
+ if (i == umem->sgt_append.sgt.nents - 1) {
+ unsigned long trim;
+
+ umem_p->last_sg = sg;
+ umem_p->last_dma_length = sg_dma_len(sg);
+ umem_p->last_length = sg->length;
+
+ trim = ALIGN(umem->address + umem->length,
+ peer_page_size) -
+ ALIGN(umem->address + umem->length, PAGE_SIZE);
+ sg_dma_len(sg) -= trim;
+ sg->length -= trim;
+ }
+ }
+}
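
As a worked example of the trimming above (illustrative numbers only, assuming a 4KiB PAGE_SIZE and a 64KiB peer_page_size): for a umem starting at 0x10001200, the first SGL's dma_address is advanced by ALIGN_DOWN(0x10001200, 4K) - ALIGN_DOWN(0x10001200, 64K) = 0x10001000 - 0x10000000 = 0x1000, and if the umem ends at 0x10003000 the last SGL is shortened by ALIGN(0x10003000, 64K) - ALIGN(0x10003000, 4K) = 0x10010000 - 0x10003000 = 0xd000, so the mapped range matches the 4KiB-page extent the rest of the umem code expects.
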
+
+struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+ unsigned long peer_mem_flags)
+{
+ struct ib_peer_memory_client *ib_peer_client;
+ unsigned long peer_page_size;
+ void *peer_client_context;
+ struct ib_umem_peer *umem_p;
+ int ret;
+
+ ib_peer_client =
+ ib_get_peer_client(old_umem->address, old_umem->length,
+ peer_mem_flags, &peer_client_context);
+ if (!ib_peer_client)
+ return ERR_PTR(old_ret);
+
+ umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL);
+ if (!umem_p) {
+ ret = -ENOMEM;
+ goto err_client;
+ }
+
+ kref_init(&umem_p->kref);
+ umem_p->umem = *old_umem;
+ memset(&umem_p->umem.sgt_append, 0, sizeof(umem_p->umem.sgt_append));
+ umem_p->umem.is_peer = 1;
+ umem_p->ib_peer_client = ib_peer_client;
+ umem_p->peer_client_context = peer_client_context;
+ mutex_init(&umem_p->mapping_lock);
+ umem_p->xa_id = PEER_NO_INVALIDATION_ID;
+
+ mutex_lock(&umem_p->mapping_lock);
+ if (ib_peer_client->invalidation_required) {
+ ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id,
+ umem_p,
+ XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1),
+ &ib_peer_client->xa_cyclic_next,
+ GFP_KERNEL);
+ if (ret < 0)
+ goto err_umem;
+ }
+
+ /*
+ * We always request write permissions to the pages, to force breaking
+ * of any CoW during the registration of the MR. For read-only MRs we
+ * use the "force" flag to indicate that CoW breaking is required but
+ * the registration should not fail if referencing read-only areas.
+ */
+ ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address,
+ umem_p->umem.length, 1,
+ !umem_p->umem.writable, NULL,
+ peer_client_context,
+ umem_p->xa_id);
+ if (ret)
+ goto err_xa;
+
+ ret = ib_peer_client->peer_mem->dma_map(
+ &umem_p->umem.sgt_append.sgt, peer_client_context,
+ umem_p->umem.ibdev->dma_device, 0,
+ &umem_p->umem.sgt_append.sgt.nents);
+ if (ret)
+ goto err_pages;
+
+ peer_page_size =
+ ib_peer_client->peer_mem->get_page_size(peer_client_context);
+ if (peer_page_size != PAGE_SIZE)
+ fix_peer_sgls(umem_p, peer_page_size);
+
+ umem_p->mapped_state = UMEM_PEER_MAPPED;
+ atomic64_add(umem_p->umem.sgt_append.sgt.nents,
+ &ib_peer_client->stats.num_reg_pages);
+ atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
+ atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
+
+ /*
+ * If invalidation is allowed then the caller must call
+ * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
+ * unlock this mutex. This call should be done after the last read to
+ * sg_head, once the caller is ready for the invalidation function to be
+ * called.
+ */
+ if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+ mutex_unlock(&umem_p->mapping_lock);
+
+ /*
+ * On success the old umem is replaced with the new, larger, allocation
+ */
+ kfree(old_umem);
+ return &umem_p->umem;
+
+err_pages:
+ ib_peer_client->peer_mem->put_pages(&umem_p->umem.sgt_append.sgt,
+ umem_p->peer_client_context);
+err_xa:
+ if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+ xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+err_umem:
+ mutex_unlock(&umem_p->mapping_lock);
+ kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+err_client:
+ ib_put_peer_client(ib_peer_client, peer_client_context);
+ return ERR_PTR(ret);
+}
+
+void ib_peer_umem_release(struct ib_umem *umem)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(umem, struct ib_umem_peer, umem);
+
+ /*
+ * If ib_umem_activate_invalidation_notifier() is called then
+ * ib_umem_stop_invalidation_notifier() must be called before release.
+ */
+ WARN_ON(umem_p->invalidation_func);
+
+ /* For no invalidation cases, make sure it is unmapped */
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
+
+ if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+ xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+ ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context);
+ umem_p->ib_peer_client = NULL;
+
+ /* Must match ib_umem_release() */
+ atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+ mmdrop(umem->owning_mm);
+
+ kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+}
+
+/* Use it like this:
+struct peer_memory_client_ex peer_memory_test = {
+ .client = {
+ .version = "1.0",
+ .version[IB_PEER_MEMORY_VER_MAX-1] = 1,
+ },
+ .ex_size = sizeof(struct peer_memory_client_ex),
+ .flags = PEER_MEM_INVALIDATE_UNMAPS,
+};
+*/
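
(The non-zero last byte of version[] in the snippet above is what ib_peer_unmap_on_invalidate() keys on before it looks at ex_size and flags, so a plain peer_memory_client with a normal version string is still handled correctly.)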
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index f9ab671c8eda5..a26b89349a7b6 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,6 +45,8 @@
#include "uverbs.h"
+#include "ib_peer_mem.h"
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
@@ -138,15 +140,17 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
EXPORT_SYMBOL(ib_umem_find_best_pgsz);
/**
- * ib_umem_get - Pin and DMA map userspace memory.
+ * __ib_umem_get - Pin and DMA map userspace memory.
*
* @device: IB device to connect UMEM
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
+ * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used
*/
-struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
- size_t size, int access)
+static struct ib_umem *__ib_umem_get(struct ib_device *device,
+ unsigned long addr, size_t size, int access,
+ unsigned long peer_mem_flags)
{
struct ib_umem *umem;
struct page **page_list;
@@ -249,6 +253,26 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
umem_release:
__ib_umem_release(device, umem, 0);
+
+ /*
+ * If the address belongs to peer memory client, then the first
+ * call to get_user_pages will fail. In this case, try to get
+ * these pages from the peers.
+ */
+ //FIXME: this placement is horrible
+ if (ret < 0 && peer_mem_flags & IB_PEER_MEM_ALLOW) {
+ struct ib_umem *new_umem;
+
+ new_umem = ib_peer_umem_get(umem, ret, peer_mem_flags);
+ if (IS_ERR(new_umem)) {
+ ret = PTR_ERR(new_umem);
+ goto vma;
+ }
+ umem = new_umem;
+ ret = 0;
+ goto out;
+ }
+vma:
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
free_page((unsigned long) page_list);
@@ -259,8 +283,23 @@ umem_kfree:
}
return ret ? ERR_PTR(ret) : umem;
}
+
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
+ size_t size, int access)
+{
+ return __ib_umem_get(device, addr, size, access, 0);
+}
EXPORT_SYMBOL(ib_umem_get);
+struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
+ size_t size, int access,
+ unsigned long peer_mem_flags)
+{
+ return __ib_umem_get(device, addr, size, access,
+ IB_PEER_MEM_ALLOW | peer_mem_flags);
+}
+EXPORT_SYMBOL(ib_umem_get_peer);
+
/**
* ib_umem_release - release memory pinned with ib_umem_get
* @umem: umem struct to release
@@ -274,6 +313,8 @@ void ib_umem_release(struct ib_umem *umem)
if (umem->is_odp)
return ib_umem_odp_release(to_ib_umem_odp(umem));
+ if (umem->is_peer)
+ return ib_peer_umem_release(umem);
__ib_umem_release(umem->ibdev, umem, 1);
atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index efc9e4a6df04a..cd0de5260a59f 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -745,9 +745,9 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size;
- cq->buf.umem =
- ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
- entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
+ cq->buf.umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr,
+ entries * ucmd.cqe_size,
+ IB_ACCESS_LOCAL_WRITE, 0);
if (IS_ERR(cq->buf.umem)) {
err = PTR_ERR(cq->buf.umem);
return err;
@@ -1168,9 +1168,9 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
return -EINVAL;
- umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
- (size_t)ucmd.cqe_size * entries,
- IB_ACCESS_LOCAL_WRITE);
+ umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr,
+ (size_t)ucmd.cqe_size * entries,
+ IB_ACCESS_LOCAL_WRITE, 0);
if (IS_ERR(umem)) {
err = PTR_ERR(umem);
return err;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index db5fb196c728b..2f07385e9c059 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2208,7 +2208,8 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
return PTR_ERR(umem_dmabuf);
obj->umem = &umem_dmabuf->umem;
} else {
- obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access_flags);
+ obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size,
+ access_flags, 0);
if (IS_ERR(obj->umem))
return PTR_ERR(obj->umem);
}
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index e32111117a5ea..b5200b49c622a 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -66,8 +66,9 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
- page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
- PAGE_SIZE, 0);
+ page->umem =
+ ib_umem_get_peer(context->ibucontext.device, virt & PAGE_MASK,
+ PAGE_SIZE, 0, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 9c33d960af3c5..9d0c56b59ed29 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1643,7 +1643,7 @@ static inline bool rt_supported(int ts_cap)
static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev,
struct ib_umem *umem, int access_flags)
{
- if (!MLX5_CAP_GEN(dev->mdev, ats) || !umem->is_dmabuf)
+ if (!MLX5_CAP_GEN(dev->mdev, ats) || (!umem->is_dmabuf && !umem->is_peer))
return false;
return access_flags & IB_ACCESS_RELAXED_ORDERING;
}
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2017ede100a62..1ce48e485c5ba 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -44,6 +44,8 @@
#include "mlx5_ib.h"
#include "umr.h"
+static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv);
+
enum {
MAX_PENDING_REG_MR = 8,
};
@@ -1383,17 +1385,20 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
int err;
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
- if (xlt_with_umr) {
+ if (xlt_with_umr && !umem->is_peer) {
mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
} else {
unsigned int page_size = mlx5_umem_find_best_pgsz(
umem, mkc, log_page_size, 0, iova);
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+ mr = reg_create(pd, umem, iova, access_flags, page_size,
+ !xlt_with_umr);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
+ if (umem->is_peer)
+ ib_umem_stop_invalidation_notifier(umem);
ib_umem_release(umem);
return ERR_CAST(mr);
}
@@ -1414,6 +1419,11 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
return ERR_PTR(err);
}
}
+
+ if (umem->is_peer)
+ ib_umem_activate_invalidation_notifier(
+ umem, mlx5_invalidate_umem, mr);
+
return &mr->ibmr;
}
@@ -1491,7 +1501,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (access_flags & IB_ACCESS_ON_DEMAND)
return create_user_odp_mr(pd, start, length, iova, access_flags,
udata);
- umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
+ umem = ib_umem_get_peer(&dev->ib_dev, start, length, access_flags,
+ IB_PEER_MEM_INVAL_SUPP);
if (IS_ERR(umem))
return ERR_CAST(umem);
return create_real_mr(pd, umem, iova, access_flags);
@@ -1651,6 +1662,10 @@ static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
return err;
}
+ if (new_umem->is_peer)
+ ib_umem_activate_invalidation_notifier(
+ new_umem, mlx5_invalidate_umem, mr);
+
atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
ib_umem_release(old_umem);
atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
@@ -1694,8 +1709,13 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
return ERR_PTR(err);
return NULL;
}
- /* DM or ODP MR's don't have a normal umem so we can't re-use it */
- if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+ /*
+ * DM or ODP MR's don't have a normal umem so we can't re-use
+ * it. Peer umems cannot have their MR's changed once created
+ * due to races with invalidation.
+ */
+ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) ||
+ mr->umem->is_peer)
goto recreate;
/*
@@ -1714,10 +1734,11 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
/*
- * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
- * but the logic around releasing the umem is different
+ * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does but
+ * the logic around releasing the umem is different, peer memory
+ * invalidation semantics are incompatible.
*/
- if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || mr->umem->is_peer)
goto recreate;
if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
@@ -1725,8 +1746,9 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
struct ib_umem *new_umem;
unsigned long page_size;
- new_umem = ib_umem_get(&dev->ib_dev, start, length,
- new_access_flags);
+ new_umem = ib_umem_get_peer(&dev->ib_dev, start, length,
+ new_access_flags,
+ IB_PEER_MEM_INVAL_SUPP);
if (IS_ERR(new_umem))
return ERR_CAST(new_umem);
@@ -1894,6 +1916,13 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
cache_ent_find_and_store(dev, mr))
mr->mmkey.cache_ent = NULL;
+ if (mr->umem && mr->umem->is_peer) {
+ rc = mlx5r_umr_revoke_mr(mr);
+ if (rc)
+ return rc;
+ ib_umem_stop_invalidation_notifier(mr->umem);
+ }
+
if (!mr->mmkey.cache_ent) {
rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
if (rc)
@@ -2612,3 +2641,15 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
return n;
}
+
+static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv)
+{
+ struct mlx5_ib_mr *mr = priv;
+
+ /*
+ * DMA is turned off for the mkey, but the mkey remains otherwise
+ * untouched until the normal flow of dereg_mr happens. Any access to
+ * this mkey will generate CQEs.
+ */
+ mlx5r_umr_revoke_mr(mr);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 78b96bfb4e6ac..9261df5328a40 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -890,7 +890,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (!ucmd->buf_addr)
return -EINVAL;
- rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0);
+ rwq->umem = ib_umem_get_peer(&dev->ib_dev, ucmd->buf_addr,
+ rwq->buf_size, 0, 0);
if (IS_ERR(rwq->umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
err = PTR_ERR(rwq->umem);
@@ -1000,8 +1001,9 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (ucmd->buf_addr && ubuffer->buf_size) {
ubuffer->buf_addr = ucmd->buf_addr;
- ubuffer->umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr,
- ubuffer->buf_size, 0);
+ ubuffer->umem =
+ ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr,
+ ubuffer->buf_size, 0, 0);
if (IS_ERR(ubuffer->umem)) {
err = PTR_ERR(ubuffer->umem);
goto err_bfreg;
@@ -1355,8 +1357,8 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
if (ts_format < 0)
return ts_format;
- sq->ubuffer.umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr,
- ubuffer->buf_size, 0);
+ sq->ubuffer.umem = ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr,
+ ubuffer->buf_size, 0, 0);
if (IS_ERR(sq->ubuffer.umem))
return PTR_ERR(sq->ubuffer.umem);
page_size = mlx5_umem_find_best_quantized_pgoff(
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index a056ea835da54..32c6643d0f7af 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -75,7 +75,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
- srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
+ srq->umem = ib_umem_get_peer(pd->device, ucmd.buf_addr, buf_size, 0, 0);
if (IS_ERR(srq->umem)) {
mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
err = PTR_ERR(srq->umem);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 95896472a82bf..8d4caf90e1aac 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -25,6 +25,8 @@ struct ib_umem {
u32 writable : 1;
u32 is_odp : 1;
u32 is_dmabuf : 1;
+ /* Placing at the end of the bitfield list is ABI preserving on LE */
+ u32 is_peer : 1;
struct sg_append_table sgt_append;
};
@@ -45,6 +47,12 @@ static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
return container_of(umem, struct ib_umem_dmabuf, umem);
}
+typedef void (*umem_invalidate_func_t)(struct ib_umem *umem, void *priv);
+enum ib_peer_mem_flags {
+ IB_PEER_MEM_ALLOW = 1 << 0,
+ IB_PEER_MEM_INVAL_SUPP = 1 << 1,
+};
+
/* Returns the offset of the umem start relative to the first page. */
static inline int ib_umem_offset(struct ib_umem *umem)
{
@@ -146,6 +154,13 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
+struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
+ size_t size, int access,
+ unsigned long peer_mem_flags);
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+ umem_invalidate_func_t func,
+ void *cookie);
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem);
#else /* CONFIG_INFINIBAND_USER_MEM */
@@ -195,6 +210,20 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
}
static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { }
static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { }
+static inline struct ib_umem *ib_umem_get_peer(struct ib_device *device,
+ unsigned long addr, size_t size,
+ int access,
+ unsigned long peer_mem_flags)
+{
+ return ERR_PTR(-EINVAL);
+}
+static inline void ib_umem_activate_invalidation_notifier(
+ struct ib_umem *umem, umem_invalidate_func_t func, void *cookie)
+{
+}
+static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+}
#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */
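
To tie these declarations together, here is a hedged consumer-side sketch of the expected call order. The my_* names (my_create_mkey(), my_revoke_mkey(), struct my_mr) are hypothetical; the shape simply mirrors the mlx5 hunks above.

/* Hedged sketch: MR registration/teardown in a hypothetical driver. */
#include <linux/slab.h>
#include <rdma/ib_umem.h>

struct my_mr {
	struct ib_umem *umem;
	/* ... device-specific mkey state ... */
};

static void my_invalidate(struct ib_umem *umem, void *priv)
{
	struct my_mr *mr = priv;

	/* Stop HW access; the rest of the teardown happens in my_dereg_mr(). */
	my_revoke_mkey(mr);
}

static struct my_mr *my_reg_mr(struct ib_device *ibdev, unsigned long start,
			       size_t length, int access_flags)
{
	struct ib_umem *umem;
	struct my_mr *mr;

	/* Falls back to registered peer clients if get_user_pages() fails. */
	umem = ib_umem_get_peer(ibdev, start, length, access_flags,
				IB_PEER_MEM_INVAL_SUPP);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

	mr = my_create_mkey(umem);		/* hypothetical HW setup */
	if (IS_ERR(mr)) {
		if (umem->is_peer)
			ib_umem_stop_invalidation_notifier(umem);
		ib_umem_release(umem);
		return mr;
	}

	/* After this point the invalidation callback may run at any time. */
	if (umem->is_peer)
		ib_umem_activate_invalidation_notifier(umem, my_invalidate, mr);
	return mr;
}

static void my_dereg_mr(struct my_mr *mr)
{
	my_revoke_mkey(mr);			/* quiesce HW DMA first */
	if (mr->umem->is_peer)
		ib_umem_stop_invalidation_notifier(mr->umem);
	ib_umem_release(mr->umem);
	kfree(mr);
}
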
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
new file mode 100644
index 0000000000000..aa29b3ffb1c4f
--- /dev/null
+++ b/include/rdma/peer_mem.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
+ * Copyright (C) 2020-2021 NVIDIA CORPORATION & AFFILIATES. All Rights Reserved.
+ */
+#ifndef RDMA_PEER_MEM_H
+#define RDMA_PEER_MEM_H
+
+#include <linux/scatterlist.h>
+
+#define IB_PEER_MEMORY_NAME_MAX 64
+#define IB_PEER_MEMORY_VER_MAX 16
+
+/*
+ * Prior versions used a void * for core_context; at some point this was
+ * switched to u64. Be careful if compiling this as 32 bit. To help, the
+ * value of core_context is limited to u32, so it should work OK despite
+ * the type change.
+ */
+#define PEER_MEM_U64_CORE_CONTEXT
+
+struct device;
+
+/**
+ * struct peer_memory_client - registration information for user virtual
+ * memory handlers
+ *
+ * The peer_memory_client scheme allows a driver to register with the ib_umem
+ * system that it has the ability to understand user virtual address ranges
+ * that are not compatible with get_user_pages(). For instance VMAs created
+ * with io_remap_pfn_range(), or other driver special VMA.
+ *
+ * For ranges the interface understands it can provide a DMA mapped sg_table
+ * for use by the ib_umem, allowing user virtual ranges that cannot be
+ * supported by get_user_pages() to be used as umems.
+ */
+struct peer_memory_client {
+ char name[IB_PEER_MEMORY_NAME_MAX];
+ char version[IB_PEER_MEMORY_VER_MAX];
+
+ /**
+ * acquire - Begin working with a user space virtual address range
+ *
+ * @addr - Virtual address to be checked whether belongs to peer.
+ * @size - Length of the virtual memory area starting at addr.
+ * @peer_mem_private_data - Obsolete, always NULL
+ * @peer_mem_name - Obsolete, always NULL
+ * @client_context - Returns an opaque value for this acquire, used in
+ * the other APIs
+ *
+ * Returns 1 if the peer_memory_client supports the entire virtual
+ * address range, 0 or -ERRNO otherwise. If 1 is returned then
+ * release() will be called to release the acquire().
+ */
+ int (*acquire)(unsigned long addr, size_t size,
+ void *peer_mem_private_data, char *peer_mem_name,
+ void **client_context);
+ /**
+ * get_pages - Fill in the first part of a sg_table for a virtual
+ * address range
+ *
+ * @addr - Virtual address to be checked whether belongs to peer.
+ * @size - Length of the virtual memory area starting at addr.
+ * @write - Always 1
+ * @force - 1 if write is required
+ * @sg_head - Obsolete, always NULL
+ * @client_context - Value returned by acquire()
+ * @core_context - Value to be passed to invalidate_peer_memory for
+ * this get
+ *
+ * addr/size are passed as the raw virtual address range requested by
+ * the user, it is not aligned to any page size. get_pages() is always
+ * followed by dma_map().
+ *
+ * Upon return the caller can call the invalidate_callback().
+ *
+ * Returns 0 on success, -ERRNO on failure. After success put_pages()
+ * will be called to return the pages.
+ */
+ int (*get_pages)(unsigned long addr, size_t size, int write, int force,
+ struct sg_table *sg_head, void *client_context,
+ u64 core_context);
+ /**
+ * dma_map - Create a DMA mapped sg_table
+ *
+ * @sg_head - The sg_table to allocate
+ * @client_context - Value returned by acquire()
+ * @dma_device - The device that will be doing DMA from these addresses
+ * @dmasync - Obsolete, always 0
+ * @nmap - Returns the number of dma mapped entries in the sg_head
+ *
+ * Must be called after get_pages(). This must fill in the sg_head with
+ * DMA mapped SGLs for dma_device. Each SGL start and end must meet a
+ * minimum alignment of at least PAGE_SIZE, though individual sgls can
+ * be multiples of PAGE_SIZE, in any mixture. Since the user virtual
+ * address/size are not page aligned, the implementation must increase
+ * it to the logical alignment when building the SGLs.
+ *
+ * Returns 0 on success, -ERRNO on failure. After success dma_unmap()
+ * will be called to unmap the pages. On failure sg_head must be left
+ * untouched or point to a valid sg_table.
+ */
+ int (*dma_map)(struct sg_table *sg_head, void *client_context,
+ struct device *dma_device, int dmasync, int *nmap);
+ /**
+ * dma_unmap - Unmap a DMA mapped sg_table
+ *
+ * @sg_head - The sg_table to unmap
+ * @client_context - Value returned by acquire()
+ * @dma_device - The device that will be doing DMA from these addresses
+ *
+ * sg_head will not be touched after this function returns.
+ *
+ * Must return 0.
+ */
+ int (*dma_unmap)(struct sg_table *sg_head, void *client_context,
+ struct device *dma_device);
+ /**
+ * put_pages - Unpin a SGL
+ *
+ * @sg_head - The sg_table to unpin
+ * @client_context - Value returned by acquire()
+ *
+ * sg_head must be freed on return.
+ */
+ void (*put_pages)(struct sg_table *sg_head, void *client_context);
+ /* Client should always return PAGE_SIZE */
+ unsigned long (*get_page_size)(void *client_context);
+ /**
+ * release - Undo acquire
+ *
+ * @client_context - Value returned by acquire()
+ *
+ * If acquire() returns 1 then release() must be called. All
+ * get_pages() and dma_map()'s must be undone before calling this
+ * function.
+ */
+ void (*release)(void *client_context);
+};
+
+enum {
+ PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
+};
+
+struct peer_memory_client_ex {
+ struct peer_memory_client client;
+ size_t ex_size;
+ u32 flags;
+};
+
+/*
+ * If invalidate_callback() is non-NULL then the client will only support
+ * umems which can be invalidated. The caller may call the
+ * invalidate_callback() after acquire(); on return the range will no longer
+ * have DMA active, and release() will have been called.
+ *
+ * Note: The implementation locking must ensure that get_pages(), and
+ * dma_map() do not have locking dependencies with invalidate_callback(). The
+ * ib_core will wait until any concurrent get_pages() or dma_map() completes
+ * before returning.
+ *
+ * Similarly, ib_core can call dma_unmap(), put_pages() and release() from within
+ * the callback, or will wait for another thread doing those operations to
+ * complete.
+ *
+ * For these reasons the user of invalidate_callback() must be careful with
+ * locking.
+ */
+typedef int (*invalidate_peer_memory)(void *reg_handle, u64 core_context);
+
+void *
+ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
+ invalidate_peer_memory *invalidate_callback);
+void ib_unregister_peer_memory_client(void *reg_handle);
+
+#endif
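
Finally, a hedged end-to-end sketch of a client implementing these callbacks for a hypothetical device that exposes pinned, physically contiguous BAR memory. The example_owns_range()/example_pin_range()/example_unpin_range() helpers and the physical-base lookup are stand-ins for device-specific code; everything else uses only the interfaces defined above plus the standard DMA API.

/* Hedged sketch: a minimal peer_memory_client for contiguous device memory. */
#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <rdma/peer_mem.h>

struct example_ctx {
	unsigned long addr;
	size_t size;
	phys_addr_t phys;	/* page-aligned physical base, device specific */
};

static int example_acquire(unsigned long addr, size_t size,
			   void *peer_mem_private_data, char *peer_mem_name,
			   void **client_context)
{
	struct example_ctx *ctx;

	if (!example_owns_range(addr, size))	/* hypothetical ownership test */
		return 0;			/* not ours, let other clients try */
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	ctx->addr = addr;
	ctx->size = size;
	*client_context = ctx;
	return 1;				/* this client handles the range */
}

static int example_get_pages(unsigned long addr, size_t size, int write,
			     int force, struct sg_table *sg_head,
			     void *client_context, u64 core_context)
{
	struct example_ctx *ctx = client_context;

	/* Pin the device memory and record its physical base (hypothetical). */
	return example_pin_range(ctx->addr, ctx->size, &ctx->phys);
}

static int example_dma_map(struct sg_table *sg_head, void *client_context,
			   struct device *dma_device, int dmasync, int *nmap)
{
	struct example_ctx *ctx = client_context;
	size_t len = ALIGN(ctx->size + offset_in_page(ctx->addr), PAGE_SIZE);
	dma_addr_t dma;

	/* A single PAGE_SIZE-aligned SGL covering the whole pinned range. */
	if (sg_alloc_table(sg_head, 1, GFP_KERNEL))
		return -ENOMEM;
	dma = dma_map_resource(dma_device, ctx->phys, len, DMA_BIDIRECTIONAL, 0);
	if (dma_mapping_error(dma_device, dma)) {
		sg_free_table(sg_head);
		return -ENOMEM;
	}
	sg_dma_address(sg_head->sgl) = dma;
	sg_dma_len(sg_head->sgl) = len;
	sg_head->sgl->length = len;
	*nmap = 1;
	return 0;
}

static int example_dma_unmap(struct sg_table *sg_head, void *client_context,
			     struct device *dma_device)
{
	dma_unmap_resource(dma_device, sg_dma_address(sg_head->sgl),
			   sg_dma_len(sg_head->sgl), DMA_BIDIRECTIONAL, 0);
	return 0;
}

static void example_put_pages(struct sg_table *sg_head, void *client_context)
{
	struct example_ctx *ctx = client_context;

	example_unpin_range(ctx->addr, ctx->size);	/* hypothetical unpin */
	sg_free_table(sg_head);
}

static unsigned long example_get_page_size(void *client_context)
{
	return PAGE_SIZE;
}

static void example_release(void *client_context)
{
	kfree(client_context);
}

static const struct peer_memory_client example_peer_client = {
	.name		= "example_peer",
	.version	= "1.0",
	.acquire	= example_acquire,
	.get_pages	= example_get_pages,
	.put_pages	= example_put_pages,
	.get_page_size	= example_get_page_size,
	.dma_map	= example_dma_map,
	.dma_unmap	= example_dma_unmap,
	.release	= example_release,
};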