author     Jason Gunthorpe <jgg@nvidia.com>     2020-11-26 10:42:11 -0400
committer  Leon Romanovsky <leonro@nvidia.com>  2023-05-24 10:12:42 +0300
commit     eba78bb0f8efd2e5096c1321110ed29ce92797fc
tree       2105e1681cdc23972d2891d377080b28c41d9145
parent     9f4ad9e425a1d3b6a34617b8ea226d56a119a717
RDMA/core: Introduce peer memory interface (gpu-v5.12)
The peer_memory_client scheme allows a driver to register with the ib_umem
system that it is able to interpret user virtual address ranges that are not
compatible with get_user_pages(), for instance VMAs created with
io_remap_pfn_range() or other driver-special VMAs.
For ranges the client understands, it can provide a DMA-mapped sg_table for
use by the ib_umem, allowing user virtual ranges that cannot be supported by
get_user_pages() to be used as umems for RDMA.
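To make the division of labour concrete, here is a rough, hypothetical
skeleton of such a client. Only the callback signatures come from the new
<rdma/peer_mem.h> added below; the client itself and every my_dev_*() helper
are assumptions standing in for real device code, not part of this patch:

static int my_acquire(unsigned long addr, size_t size,
		      void *peer_mem_private_data, char *peer_mem_name,
		      void **client_context)
{
	void *ctx = my_dev_find_range(addr, size);	/* hypothetical lookup */

	if (!ctx)
		return 0;	/* not ours, let get_user_pages() handle it */
	*client_context = ctx;
	return 1;
}

static int my_get_pages(unsigned long addr, size_t size, int write, int force,
			struct sg_table *sg_head, void *client_context,
			u64 core_context)
{
	/* Pin the device pages; remember core_context for invalidation */
	return my_dev_pin_range(client_context, addr, size, core_context);
}

static int my_dma_map(struct sg_table *sg_head, void *client_context,
		      struct device *dma_device, int dmasync, int *nmap)
{
	/* Fill sg_head with PAGE_SIZE aligned, DMA mapped SGLs */
	return my_dev_build_sgt(client_context, sg_head, dma_device, nmap);
}

static int my_dma_unmap(struct sg_table *sg_head, void *client_context,
			struct device *dma_device)
{
	my_dev_destroy_sgt(client_context, sg_head, dma_device);
	return 0;
}

static void my_put_pages(struct sg_table *sg_head, void *client_context)
{
	/* Unpin the range and free the sg_table built by the client */
	my_dev_unpin_range(client_context, sg_head);
}

static unsigned long my_get_page_size(void *client_context)
{
	return PAGE_SIZE;
}

static void my_release(void *client_context)
{
	my_dev_put_range(client_context);
}

static const struct peer_memory_client my_peer_client = {
	.name = "my_peer",
	.version = "1.0",
	.acquire = my_acquire,
	.get_pages = my_get_pages,
	.dma_map = my_dma_map,
	.dma_unmap = my_dma_unmap,
	.put_pages = my_put_pages,
	.get_page_size = my_get_page_size,
	.release = my_release,
};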
This is designed to preserve the kABI: no existing functions or structures
are changed, and only the following new symbols are added:
  ib_register_peer_memory_client
  ib_unregister_peer_memory_client
  ib_umem_activate_invalidation_notifier
  ib_umem_get_peer
In addition, an existing bitfield in struct ib_umem uses one more bit (is_peer).
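For illustration only, a hypothetical module built around the client sketched
above would register and unregister itself as follows; the two API calls are
the new symbols listed here, everything else is an assumption:

static invalidate_peer_memory my_invalidate_cb;	/* filled in by ib_core */
static void *my_reg_handle;

static int __init my_peer_init(void)
{
	/*
	 * Passing &my_invalidate_cb asks ib_core to hand back the function to
	 * call (with the reg handle and the core_context given to
	 * get_pages()) when the device memory is going away, and marks this
	 * client as requiring invalidation support from its umem users.
	 */
	my_reg_handle = ib_register_peer_memory_client(&my_peer_client,
						       &my_invalidate_cb);
	if (!my_reg_handle)
		return -EINVAL;
	return 0;
}

static void __exit my_peer_exit(void)
{
	/* Blocks until every umem built on this client has been destroyed */
	ib_unregister_peer_memory_client(my_reg_handle);
}

module_init(my_peer_init);
module_exit(my_peer_exit);

When the device frees a pinned range, such a client would call
my_invalidate_cb(my_reg_handle, core_context) so that ib_core can fence DMA
before the memory disappears.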
This interface is compatible with the two out-of-tree GPU drivers:
https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
https://github.com/Mellanox/nv_peer_memory/blob/master/nv_peer_mem.c
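On the consuming side, the flow mirrors what the mlx5 changes below do. This
is only a rough sketch: the mlx5-specific mkey handling is replaced by a
hypothetical my_fence_dma() helper, and the wrapper functions are invented
names:

static void my_invalidate_umem(struct ib_umem *umem, void *priv)
{
	/* Stop HW DMA on this region; the umem itself is released later */
	my_fence_dma(priv);
}

struct ib_umem *my_reg_umem(struct ib_device *device, unsigned long addr,
			    size_t size, int access, void *mr)
{
	struct ib_umem *umem;

	umem = ib_umem_get_peer(device, addr, size, access,
				IB_PEER_MEM_INVAL_SUPP);
	if (IS_ERR(umem))
		return umem;

	/* Once the HW is programmed and able to handle an invalidation */
	if (umem->is_peer)
		ib_umem_activate_invalidation_notifier(umem,
						       my_invalidate_umem, mr);
	return umem;
}

void my_dereg_umem(struct ib_umem *umem)
{
	/* DMA already fenced; stop invalidations before releasing the umem */
	if (umem->is_peer)
		ib_umem_stop_invalidation_notifier(umem);
	ib_umem_release(umem);
}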
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
-rw-r--r--  drivers/infiniband/core/Makefile       |   2
-rw-r--r--  drivers/infiniband/core/ib_peer_mem.h  |  66
-rw-r--r--  drivers/infiniband/core/peer_mem.c     | 692
-rw-r--r--  drivers/infiniband/core/umem.c         |  47
-rw-r--r--  drivers/infiniband/hw/mlx5/cq.c        |  12
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c      |   2
-rw-r--r--  drivers/infiniband/hw/mlx5/doorbell.c  |   5
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c        |  59
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c        |  12
-rw-r--r--  drivers/infiniband/hw/mlx5/srq.c       |   2
-rw-r--r--  include/rdma/ib_umem.h                 |  29
-rw-r--r--  include/rdma/peer_mem.h                | 176
12 files changed, 1075 insertions(+), 29 deletions(-)
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 8ab4eea5a0a5e..3775466a36127 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -40,5 +40,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_srq.o \ uverbs_std_types_wq.o \ uverbs_std_types_qp.o -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o peer_mem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h new file mode 100644 index 0000000000000..dd467d35ad4d6 --- /dev/null +++ b/drivers/infiniband/core/ib_peer_mem.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved. + */ +#ifndef RDMA_IB_PEER_MEM_H +#define RDMA_IB_PEER_MEM_H + +#include <rdma/peer_mem.h> +#include <linux/kobject.h> +#include <linux/xarray.h> +#include <rdma/ib_umem.h> + +struct ib_peer_memory_statistics { + atomic64_t num_alloc_mrs; + atomic64_t num_dealloc_mrs; + atomic64_t num_reg_pages; + atomic64_t num_dereg_pages; + atomic64_t num_reg_bytes; + atomic64_t num_dereg_bytes; + unsigned long num_free_callbacks; +}; + +struct ib_peer_memory_client { + struct kobject kobj; + refcount_t usecnt; + struct completion usecnt_zero; + const struct peer_memory_client *peer_mem; + struct list_head core_peer_list; + struct ib_peer_memory_statistics stats; + struct xarray umem_xa; + u32 xa_cyclic_next; + bool invalidation_required; +}; + +enum ib_umem_mapped_state { + UMEM_PEER_UNMAPPED, + UMEM_PEER_MAPPED, + UMEM_PEER_INVALIDATED, +}; + +struct ib_umem_peer { + struct ib_umem umem; + struct kref kref; + /* peer memory that manages this umem */ + struct ib_peer_memory_client *ib_peer_client; + void *peer_client_context; + umem_invalidate_func_t invalidation_func; + void *invalidation_private; + struct mutex mapping_lock; + enum ib_umem_mapped_state mapped_state; + u32 xa_id; + struct scatterlist *first_sg; + dma_addr_t first_dma_address; + unsigned int first_dma_length; + unsigned int first_length; + struct scatterlist *last_sg; + unsigned int last_dma_length; + unsigned int last_length; +}; + +struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret, + unsigned long peer_mem_flags); +void ib_peer_umem_release(struct ib_umem *umem); + +#endif diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c new file mode 100644 index 0000000000000..042d73cd41ea4 --- /dev/null +++ b/drivers/infiniband/core/peer_mem.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved. 
+ */ + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <linux/sched/mm.h> +#include "ib_peer_mem.h" + +static DEFINE_MUTEX(peer_memory_mutex); +static LIST_HEAD(peer_memory_list); +static struct kobject *peers_kobj; +#define PEER_NO_INVALIDATION_ID U32_MAX + +static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context); + +struct peer_mem_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf); + ssize_t (*store)(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, const char *buf, + size_t count); +}; +#define PEER_ATTR_RO(_name) \ + struct peer_mem_attribute peer_attr_ ## _name = __ATTR_RO(_name) + +static ssize_t version_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", + ib_peer_client->peer_mem->version); +} +static PEER_ATTR_RO(version); + +static ssize_t num_alloc_mrs_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_alloc_mrs)); +} +static PEER_ATTR_RO(num_alloc_mrs); + +static ssize_t +num_dealloc_mrs_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) + +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dealloc_mrs)); +} +static PEER_ATTR_RO(num_dealloc_mrs); + +static ssize_t num_reg_pages_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_reg_pages)); +} +static PEER_ATTR_RO(num_reg_pages); + +static ssize_t +num_dereg_pages_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dereg_pages)); +} +static PEER_ATTR_RO(num_dereg_pages); + +static ssize_t num_reg_bytes_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_reg_bytes)); +} +static PEER_ATTR_RO(num_reg_bytes); + +static ssize_t +num_dereg_bytes_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dereg_bytes)); +} +static PEER_ATTR_RO(num_dereg_bytes); + +static ssize_t +num_free_callbacks_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%lu\n", + ib_peer_client->stats.num_free_callbacks); +} +static PEER_ATTR_RO(num_free_callbacks); + +static struct attribute *peer_mem_attrs[] = { + &peer_attr_version.attr, + &peer_attr_num_alloc_mrs.attr, + &peer_attr_num_dealloc_mrs.attr, + &peer_attr_num_reg_pages.attr, + &peer_attr_num_dereg_pages.attr, + &peer_attr_num_reg_bytes.attr, + &peer_attr_num_dereg_bytes.attr, + &peer_attr_num_free_callbacks.attr, + NULL, +}; + +static const struct attribute_group peer_mem_attr_group = { + .attrs = peer_mem_attrs, +}; + +static ssize_t peer_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct peer_mem_attribute *peer_attr = + 
container_of(attr, struct peer_mem_attribute, attr); + + if (!peer_attr->show) + return -EIO; + return peer_attr->show(container_of(kobj, struct ib_peer_memory_client, + kobj), + peer_attr, buf); +} + +static const struct sysfs_ops peer_mem_sysfs_ops = { + .show = peer_attr_show, +}; + +static void ib_peer_memory_client_release(struct kobject *kobj) +{ + struct ib_peer_memory_client *ib_peer_client = + container_of(kobj, struct ib_peer_memory_client, kobj); + + kfree(ib_peer_client); +} + +static struct kobj_type peer_mem_type = { + .sysfs_ops = &peer_mem_sysfs_ops, + .release = ib_peer_memory_client_release, +}; + +static int ib_memory_peer_check_mandatory(const struct peer_memory_client + *peer_client) +{ +#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x} + int i; + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + PEER_MEM_MANDATORY_FUNC(acquire), + PEER_MEM_MANDATORY_FUNC(get_pages), + PEER_MEM_MANDATORY_FUNC(put_pages), + PEER_MEM_MANDATORY_FUNC(dma_map), + PEER_MEM_MANDATORY_FUNC(dma_unmap), + }; + + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { + if (!*(void **)((void *)peer_client + + mandatory_table[i].offset)) { + pr_err("Peer memory %s is missing mandatory function %s\n", + peer_client->name, mandatory_table[i].name); + return -EINVAL; + } + } + + return 0; +} + +void * +ib_register_peer_memory_client(const struct peer_memory_client *peer_client, + invalidate_peer_memory *invalidate_callback) +{ + struct ib_peer_memory_client *ib_peer_client; + int ret; + + if (ib_memory_peer_check_mandatory(peer_client)) + return NULL; + + ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL); + if (!ib_peer_client) + return NULL; + kobject_init(&ib_peer_client->kobj, &peer_mem_type); + refcount_set(&ib_peer_client->usecnt, 1); + init_completion(&ib_peer_client->usecnt_zero); + ib_peer_client->peer_mem = peer_client; + xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC); + + /* + * If the peer wants the invalidation_callback then all memory users + * linked to that peer must support invalidation. + */ + if (invalidate_callback) { + *invalidate_callback = ib_invalidate_peer_memory; + ib_peer_client->invalidation_required = true; + } + + mutex_lock(&peer_memory_mutex); + if (!peers_kobj) { + /* Created under /sys/kernel/mm */ + peers_kobj = kobject_create_and_add("memory_peers", mm_kobj); + if (!peers_kobj) + goto err_unlock; + } + + ret = kobject_add(&ib_peer_client->kobj, peers_kobj, peer_client->name); + if (ret) + goto err_parent; + + ret = sysfs_create_group(&ib_peer_client->kobj, + &peer_mem_attr_group); + if (ret) + goto err_parent; + list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list); + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; + +err_parent: + if (list_empty(&peer_memory_list)) { + kobject_put(peers_kobj); + peers_kobj = NULL; + } +err_unlock: + mutex_unlock(&peer_memory_mutex); + kobject_put(&ib_peer_client->kobj); + return NULL; +} +EXPORT_SYMBOL(ib_register_peer_memory_client); + +void ib_unregister_peer_memory_client(void *reg_handle) +{ + struct ib_peer_memory_client *ib_peer_client = reg_handle; + + mutex_lock(&peer_memory_mutex); + list_del(&ib_peer_client->core_peer_list); + if (list_empty(&peer_memory_list)) { + kobject_put(peers_kobj); + peers_kobj = NULL; + } + mutex_unlock(&peer_memory_mutex); + + /* + * Wait for all umems to be destroyed before returning. Once + * ib_unregister_peer_memory_client() returns no umems will call any + * peer_mem ops. 
+ */ + if (refcount_dec_and_test(&ib_peer_client->usecnt)) + complete(&ib_peer_client->usecnt_zero); + wait_for_completion(&ib_peer_client->usecnt_zero); + + kobject_put(&ib_peer_client->kobj); +} +EXPORT_SYMBOL(ib_unregister_peer_memory_client); + +static struct ib_peer_memory_client * +ib_get_peer_client(unsigned long addr, size_t size, + unsigned long peer_mem_flags, void **peer_client_context) +{ + struct ib_peer_memory_client *ib_peer_client; + int ret = 0; + + mutex_lock(&peer_memory_mutex); + list_for_each_entry(ib_peer_client, &peer_memory_list, + core_peer_list) { + if (ib_peer_client->invalidation_required && + (!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP))) + continue; + ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL, + peer_client_context); + if (ret > 0) { + refcount_inc(&ib_peer_client->usecnt); + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; + } + } + mutex_unlock(&peer_memory_mutex); + return NULL; +} + +static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, + void *peer_client_context) +{ + if (ib_peer_client->peer_mem->release) + ib_peer_client->peer_mem->release(peer_client_context); + if (refcount_dec_and_test(&ib_peer_client->usecnt)) + complete(&ib_peer_client->usecnt_zero); +} + +static void ib_peer_umem_kref_release(struct kref *kref) +{ + struct ib_umem_peer *umem_p = + container_of(kref, struct ib_umem_peer, kref); + + mutex_destroy(&umem_p->mapping_lock); + kfree(umem_p); +} + +static void ib_unmap_peer_client(struct ib_umem_peer *umem_p, + enum ib_umem_mapped_state cur_state, + enum ib_umem_mapped_state to_state) +{ + struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client; + const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem; + struct ib_umem *umem = &umem_p->umem; + + if (cur_state == UMEM_PEER_MAPPED && + (to_state == UMEM_PEER_UNMAPPED || + to_state == UMEM_PEER_INVALIDATED)) { + /* + * In the invalidated state we will never touch the sg again, + * but the client might, so fix it anyhow. + */ + if (umem_p->last_sg) { + umem_p->last_sg->length = umem_p->last_length; + sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length; + } + + if (umem_p->first_sg) { + umem_p->first_sg->dma_address = + umem_p->first_dma_address; + umem_p->first_sg->length = umem_p->first_length; + sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length; + } + + if (to_state == UMEM_PEER_UNMAPPED) { + peer_mem->dma_unmap(&umem_p->umem.sg_head, + umem_p->peer_client_context, + umem_p->umem.ibdev->dma_device); + peer_mem->put_pages(&umem_p->umem.sg_head, + umem_p->peer_client_context); + } + + memset(&umem->sg_head, 0, sizeof(umem->sg_head)); + atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs); + } + + if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) || + (cur_state == UMEM_PEER_INVALIDATED && + to_state == UMEM_PEER_UNMAPPED)) { + atomic64_add(umem->sg_nents, + &ib_peer_client->stats.num_dereg_pages); + atomic64_add(umem->length, + &ib_peer_client->stats.num_dereg_bytes); + } + umem_p->mapped_state = to_state; +} + +/* + * True if the client should do unmap itself after the invalidate callback + * returns. 
Clients operating in this mode need to use this locking pattern: + * + * client_invalidate: + * mutex_lock(&client_lock) + * invalidate_callback(): + * mutex_lock(mapping_lock) + * mutex_unlock(mapping_lock) + * client_dma_unmap() + * client_put_pages() + * mutex_unlock(&client_lock) + * + * ib_umem_stop_invalidation_notifier(): + * mutex_lock(mapping_lock) + * mutex_unlock(mapping_lock) + * peer_mem->dma_unmap(): + * mutex_lock(&client_lock) + * client_dma_unmap() + * mutex_unlock(&client_lock) + * peer_mem->put_pages(): + * mutex_lock(&client_lock) + * client_put_pages() + * mutex_unlock(&client_lock) + * + * ib_peer_umem_release(): + * peer_mem->release(): + * mutex_lock(&client_lock) + * mutex_unlock(&client_lock) + * + * Noting that dma_unmap/put_pages can be called even though invalidate has + * already done the unmap, and release() can be called concurrently with + * invalidate. The client must protect itself against these races. + */ +static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p) +{ + const struct peer_memory_client *peer_mem = + umem_p->ib_peer_client->peer_mem; + const struct peer_memory_client_ex *peer_mem_ex; + + if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0) + return false; + peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex, + client); + if (peer_mem_ex->ex_size < + offsetofend(struct peer_memory_client_ex, flags)) + return false; + return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS; +} + +static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context) +{ + struct ib_peer_memory_client *ib_peer_client = reg_handle; + struct ib_umem_peer *umem_p; + + /* + * The client is not required to fence against invalidation during + * put_pages() as that would deadlock when we call put_pages() here. + * Thus the core_context cannot be a umem pointer as we have no control + * over the lifetime. Since we won't change the kABI for this to add a + * proper kref, an xarray is used. + */ + xa_lock(&ib_peer_client->umem_xa); + ib_peer_client->stats.num_free_callbacks += 1; + umem_p = xa_load(&ib_peer_client->umem_xa, core_context); + if (!umem_p) + goto out_unlock; + kref_get(&umem_p->kref); + xa_unlock(&ib_peer_client->umem_xa); + + mutex_lock(&umem_p->mapping_lock); + /* + * For flows that require invalidation the invalidation_func should not + * be NULL while the device can be doing DMA. The mapping_lock ensures + * that the device is ready to receive an invalidation before one is + * triggered here. 
+ */ + if (umem_p->mapped_state == UMEM_PEER_MAPPED && + umem_p->invalidation_func) + umem_p->invalidation_func(&umem_p->umem, + umem_p->invalidation_private); + if (ib_peer_unmap_on_invalidate(umem_p)) + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_INVALIDATED); + else + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_UNMAPPED); + mutex_unlock(&umem_p->mapping_lock); + kref_put(&umem_p->kref, ib_peer_umem_kref_release); + return 0; + +out_unlock: + xa_unlock(&ib_peer_client->umem_xa); + return 0; +} + +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *priv) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + + if (WARN_ON(!umem->is_peer)) + return; + if (umem_p->xa_id == PEER_NO_INVALIDATION_ID) + return; + + umem_p->invalidation_func = func; + umem_p->invalidation_private = priv; + /* Pairs with the lock in ib_peer_umem_get() */ + mutex_unlock(&umem_p->mapping_lock); + + /* At this point func can be called asynchronously */ +} +EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); + +/* + * Caller has blocked DMA and will no longer be able to handle invalidate + * callbacks. Callers using invalidation must call this function before calling + * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional + * before doing this. + */ +void ib_umem_stop_invalidation_notifier(struct ib_umem *umem) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p); + enum ib_umem_mapped_state cur_state; + + if (umem_p->invalidation_func) { + mutex_lock(&umem_p->mapping_lock); + umem_p->invalidation_func = NULL; + } else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) { + mutex_lock(&umem_p->mapping_lock); + } else { + /* + * Haven't called ib_umem_activate_invalidation_notifier() yet, + * still have the lock + */ + } + + if (!unmap_on_invalidate) { + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_UNMAPPED); + } else { + /* Block ib_invalidate_peer_memory() */ + cur_state = umem_p->mapped_state; + umem_p->mapped_state = UMEM_PEER_UNMAPPED; + } + mutex_unlock(&umem_p->mapping_lock); + + if (unmap_on_invalidate) + ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED); + +} +EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier); + +static void fix_peer_sgls(struct ib_umem_peer *umem_p, + unsigned long peer_page_size) +{ + struct ib_umem *umem = &umem_p->umem; + struct scatterlist *sg; + int i; + + for_each_sgtable_sg(&umem->sg_head, sg, i) { + if (i == 0) { + unsigned long offset; + + umem_p->first_sg = sg; + umem_p->first_dma_address = sg->dma_address; + umem_p->first_dma_length = sg_dma_len(sg); + umem_p->first_length = sg->length; + + offset = ALIGN_DOWN(umem->address, PAGE_SIZE) - + ALIGN_DOWN(umem->address, peer_page_size); + sg->dma_address += offset; + sg_dma_len(sg) -= offset; + sg->length -= offset; + } + + if (i == umem->sg_nents - 1) { + unsigned long trim; + + umem_p->last_sg = sg; + umem_p->last_dma_length = sg_dma_len(sg); + umem_p->last_length = sg->length; + + trim = ALIGN(umem->address + umem->length, + peer_page_size) - + ALIGN(umem->address + umem->length, PAGE_SIZE); + sg_dma_len(sg) -= trim; + sg->length -= trim; + } + } +} + +struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret, + unsigned long peer_mem_flags) +{ + struct ib_peer_memory_client *ib_peer_client; + unsigned long peer_page_size; + void *peer_client_context; + 
struct ib_umem_peer *umem_p; + int ret; + + ib_peer_client = + ib_get_peer_client(old_umem->address, old_umem->length, + peer_mem_flags, &peer_client_context); + if (!ib_peer_client) + return ERR_PTR(old_ret); + + umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL); + if (!umem_p) { + ret = -ENOMEM; + goto err_client; + } + + kref_init(&umem_p->kref); + umem_p->umem = *old_umem; + memset(&umem_p->umem.sg_head, 0, sizeof(umem_p->umem.sg_head)); + umem_p->umem.is_peer = 1; + umem_p->ib_peer_client = ib_peer_client; + umem_p->peer_client_context = peer_client_context; + mutex_init(&umem_p->mapping_lock); + umem_p->xa_id = PEER_NO_INVALIDATION_ID; + + mutex_lock(&umem_p->mapping_lock); + if (ib_peer_client->invalidation_required) { + ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id, + umem_p, + XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1), + &ib_peer_client->xa_cyclic_next, + GFP_KERNEL); + if (ret < 0) + goto err_umem; + } + + /* + * We always request write permissions to the pages, to force breaking + * of any CoW during the registration of the MR. For read-only MRs we + * use the "force" flag to indicate that CoW breaking is required but + * the registration should not fail if referencing read-only areas. + */ + ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address, + umem_p->umem.length, 1, + !umem_p->umem.writable, NULL, + peer_client_context, + umem_p->xa_id); + if (ret) + goto err_xa; + + ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sg_head, + peer_client_context, + umem_p->umem.ibdev->dma_device, + 0, &umem_p->umem.nmap); + if (ret) + goto err_pages; + + peer_page_size = + ib_peer_client->peer_mem->get_page_size(peer_client_context); + if (peer_page_size != PAGE_SIZE) + fix_peer_sgls(umem_p, peer_page_size); + + umem_p->mapped_state = UMEM_PEER_MAPPED; + atomic64_add(umem_p->umem.sg_nents, + &ib_peer_client->stats.num_reg_pages); + atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes); + atomic64_inc(&ib_peer_client->stats.num_alloc_mrs); + + /* + * If invalidation is allowed then the caller must call + * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to + * unlock this mutex. This call should be done after the last read to + * sg_head, once the caller is ready for the invalidation function to be + * called. + */ + if (umem_p->xa_id == PEER_NO_INVALIDATION_ID) + mutex_unlock(&umem_p->mapping_lock); + + /* + * On success the old umem is replaced with the new, larger, allocation + */ + kfree(old_umem); + return &umem_p->umem; + +err_pages: + ib_peer_client->peer_mem->put_pages(&umem_p->umem.sg_head, + umem_p->peer_client_context); +err_xa: + if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) + xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id); +err_umem: + mutex_unlock(&umem_p->mapping_lock); + kref_put(&umem_p->kref, ib_peer_umem_kref_release); +err_client: + ib_put_peer_client(ib_peer_client, peer_client_context); + return ERR_PTR(ret); +} + +void ib_peer_umem_release(struct ib_umem *umem) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + + /* + * If ib_umem_activate_invalidation_notifier() is called then + * ib_umem_stop_invalidation_notifier() must be called before release. 
+ */ + WARN_ON(umem_p->invalidation_func); + + /* For no invalidation cases, make sure it is unmapped */ + ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED); + + if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) + xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id); + ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context); + umem_p->ib_peer_client = NULL; + + /* Must match ib_umem_release() */ + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); + mmdrop(umem->owning_mm); + + kref_put(&umem_p->kref, ib_peer_umem_kref_release); +} + +/* Use it like this: +struct peer_memory_client_ex peer_memory_test = { + .client = { + .version = "1.0", + .version[IB_PEER_MEMORY_VER_MAX-1] = 1, + }, + .ex_size = sizeof(struct peer_memory_client_ex), + .flags = PEER_MEM_INVALIDATE_UNMAPS, +}; +*/ diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 2dde99a9ba078..f8c30655c58fd 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -45,6 +45,8 @@ #include "uverbs.h" +#include "ib_peer_mem.h" + static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { struct sg_page_iter sg_iter; @@ -141,15 +143,17 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** - * ib_umem_get - Pin and DMA map userspace memory. + * __ib_umem_get - Pin and DMA map userspace memory. * * @device: IB device to connect UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned + * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used */ -struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, - size_t size, int access) +static struct ib_umem *__ib_umem_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + unsigned long peer_mem_flags) { struct ib_umem *umem; struct page **page_list; @@ -259,6 +263,26 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, umem_release: __ib_umem_release(device, umem, 0); + + /* + * If the address belongs to peer memory client, then the first + * call to get_user_pages will fail. In this case, try to get + * these pages from the peers. + */ + //FIXME: this placement is horrible + if (ret < 0 && peer_mem_flags & IB_PEER_MEM_ALLOW) { + struct ib_umem *new_umem; + + new_umem = ib_peer_umem_get(umem, ret, peer_mem_flags); + if (IS_ERR(new_umem)) { + ret = PTR_ERR(new_umem); + goto vma; + } + umem = new_umem; + ret = 0; + goto out; + } +vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: free_page((unsigned long) page_list); @@ -269,8 +293,23 @@ umem_kfree: } return ret ? 
ERR_PTR(ret) : umem; } + +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, + size_t size, int access) +{ + return __ib_umem_get(device, addr, size, access, 0); +} EXPORT_SYMBOL(ib_umem_get); +struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, + size_t size, int access, + unsigned long peer_mem_flags) +{ + return __ib_umem_get(device, addr, size, access, + IB_PEER_MEM_ALLOW | peer_mem_flags); +} +EXPORT_SYMBOL(ib_umem_get_peer); + /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release @@ -284,6 +323,8 @@ void ib_umem_release(struct ib_umem *umem) if (umem->is_odp) return ib_umem_odp_release(to_ib_umem_odp(umem)); + if (umem->is_peer) + return ib_peer_umem_release(umem); __ib_umem_release(umem->ibdev, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index eb92cefffd777..40f091a523b63 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -734,9 +734,9 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *cqe_size = ucmd.cqe_size; - cq->buf.umem = - ib_umem_get(&dev->ib_dev, ucmd.buf_addr, - entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE); + cq->buf.umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 0); if (IS_ERR(cq->buf.umem)) { err = PTR_ERR(cq->buf.umem); return err; @@ -1157,9 +1157,9 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) return -EINVAL; - umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr, - (size_t)ucmd.cqe_size * entries, - IB_ACCESS_LOCAL_WRITE); + umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + (size_t)ucmd.cqe_size * entries, + IB_ACCESS_LOCAL_WRITE, 0); if (IS_ERR(umem)) { err = PTR_ERR(umem); return err; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 07b8350929cd6..902d2a3dfdafa 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2179,7 +2179,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, if (err) return err; - obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access); + obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size, access, 0); if (IS_ERR(obj->umem)) return PTR_ERR(obj->umem); return 0; diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 61475b5715312..ce05ea3df2750 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -64,8 +64,9 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK, - PAGE_SIZE, 0); + page->umem = + ib_umem_get_peer(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index db05b0e0a8d7a..28863dcc06ee6 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -51,6 +51,8 @@ void *xlt_emergency_page; static DEFINE_MUTEX(xlt_emergency_page_mutex); +static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv); + enum { MAX_PENDING_REG_MR = 8, }; @@ -1478,17 +1480,20 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, 
struct ib_umem *umem, int err; xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length); - if (xlt_with_umr) { + if (xlt_with_umr && !umem->is_peer) { mr = alloc_cacheable_mr(pd, umem, iova, access_flags); } else { unsigned int page_size = mlx5_umem_find_best_pgsz( umem, mkc, log_page_size, 0, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + !xlt_with_umr); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { + if (umem->is_peer) + ib_umem_stop_invalidation_notifier(umem); ib_umem_release(umem); return ERR_CAST(mr); } @@ -1509,6 +1514,11 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, return ERR_PTR(err); } } + + if (umem->is_peer) + ib_umem_activate_invalidation_notifier( + umem, mlx5_invalidate_umem, mr); + return &mr->ibmr; } @@ -1582,7 +1592,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (access_flags & IB_ACCESS_ON_DEMAND) return create_user_odp_mr(pd, start, length, iova, access_flags, udata); - umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); + umem = ib_umem_get_peer(&dev->ib_dev, start, length, access_flags, + IB_PEER_MEM_INVAL_SUPP); if (IS_ERR(umem)) return ERR_CAST(umem); return create_real_mr(pd, umem, iova, access_flags); @@ -1794,6 +1805,10 @@ static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, return err; } + if (new_umem->is_peer) + ib_umem_activate_invalidation_notifier( + new_umem, mlx5_invalidate_umem, mr); + atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); ib_umem_release(old_umem); atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); @@ -1836,8 +1851,13 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, return ERR_PTR(err); return NULL; } - /* DM or ODP MR's don't have a normal umem so we can't re-use it */ - if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) + /* + * DM or ODP MR's don't have a normal umem so we can't re-use + * it. Peer umems cannot have their MR's changed once created + * due to races with invalidation. + */ + if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || + mr->umem->is_peer) goto recreate; /* @@ -1856,10 +1876,11 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, } /* - * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does - * but the logic around releasing the umem is different + * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does but + * the logic around releasing the umem is different, peer memory + * invalidation semantics are incompatible. 
*/ - if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) + if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || mr->umem->is_peer) goto recreate; if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && @@ -1867,8 +1888,9 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct ib_umem *new_umem; unsigned long page_size; - new_umem = ib_umem_get(&dev->ib_dev, start, length, - new_access_flags); + new_umem = ib_umem_get_peer(&dev->ib_dev, start, length, + new_access_flags, + IB_PEER_MEM_INVAL_SUPP); if (IS_ERR(new_umem)) return ERR_CAST(new_umem); @@ -1959,6 +1981,11 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mr->sig = NULL; } + if (mr->umem && mr->umem->is_peer) { + mlx5_mr_cache_invalidate(mr); + ib_umem_stop_invalidation_notifier(mr->umem); + } + if (!mr->cache_ent) { destroy_mkey(dev, mr); mlx5_free_priv_descs(mr); @@ -2705,3 +2732,15 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, return n; } + +static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv) +{ + struct mlx5_ib_mr *mr = priv; + + /* + * DMA is turned off for the mkey, but the mkey remains otherwise + * untouched until the normal flow of dereg_mr happens. Any access to + * this mkey will generate CQEs. + */ + mlx5_mr_cache_invalidate(mr); +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f5a52a6fae437..2bc23944a86d1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -807,7 +807,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (!ucmd->buf_addr) return -EINVAL; - rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0); + rwq->umem = ib_umem_get_peer(&dev->ib_dev, ucmd->buf_addr, + rwq->buf_size, 0, 0); if (IS_ERR(rwq->umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); err = PTR_ERR(rwq->umem); @@ -917,8 +918,9 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (ucmd->buf_addr && ubuffer->buf_size) { ubuffer->buf_addr = ucmd->buf_addr; - ubuffer->umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr, - ubuffer->buf_size, 0); + ubuffer->umem = + ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); if (IS_ERR(ubuffer->umem)) { err = PTR_ERR(ubuffer->umem); goto err_bfreg; @@ -1259,8 +1261,8 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, if (ts_format < 0) return ts_format; - sq->ubuffer.umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr, - ubuffer->buf_size, 0); + sq->ubuffer.umem = ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); if (IS_ERR(sq->ubuffer.umem)) return PTR_ERR(sq->ubuffer.umem); page_size = mlx5_umem_find_best_quantized_pgoff( diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index fab6736e4d6a0..e2157fb5a7fd5 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -76,7 +76,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); - srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0); + srq->umem = ib_umem_get_peer(pd->device, ucmd.buf_addr, buf_size, 0, 0); if (IS_ERR(srq->umem)) { mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); err = PTR_ERR(srq->umem); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 676c57f5ca80c..c418f91c2886c 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -25,6 +25,8 @@ struct ib_umem { u32 writable : 
1; u32 is_odp : 1; u32 is_dmabuf : 1; + /* Placing at the end of the bitfield list is ABI preserving on LE */ + u32 is_peer : 1; struct work_struct work; struct sg_table sg_head; int nmap; @@ -47,6 +49,12 @@ static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem) return container_of(umem, struct ib_umem_dmabuf, umem); } +typedef void (*umem_invalidate_func_t)(struct ib_umem *umem, void *priv); +enum ib_peer_mem_flags { + IB_PEER_MEM_ALLOW = 1 << 0, + IB_PEER_MEM_INVAL_SUPP = 1 << 1, +}; + /* Returns the offset of the umem start relative to the first page. */ static inline int ib_umem_offset(struct ib_umem *umem) { @@ -143,6 +151,13 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, + size_t size, int access, + unsigned long peer_mem_flags); +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *cookie); +void ib_umem_stop_invalidation_notifier(struct ib_umem *umem); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -186,6 +201,20 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +static inline struct ib_umem *ib_umem_get_peer(struct ib_device *device, + unsigned long addr, size_t size, + int access, + unsigned long peer_mem_flags) +{ + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_activate_invalidation_notifier( + struct ib_umem *umem, umem_invalidate_func_t func, void *cookie) +{ +} +static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem) +{ +} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h new file mode 100644 index 0000000000000..aa29b3ffb1c4f --- /dev/null +++ b/include/rdma/peer_mem.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + * Copyright (C) 2020-2021 NVIDIA CORPORATION & AFFILIATES. All Rights Reserved. + */ +#ifndef RDMA_PEER_MEM_H +#define RDMA_PEER_MEM_H + +#include <linux/scatterlist.h> + +#define IB_PEER_MEMORY_NAME_MAX 64 +#define IB_PEER_MEMORY_VER_MAX 16 + +/* + * Prior versions used a void * for core_context, at some point this was + * switched to use u64. Be careful if compiling this as 32 bit. To help the + * value of core_context is limited to u32 so it should work OK despite the + * type change. + */ +#define PEER_MEM_U64_CORE_CONTEXT + +struct device; + +/** + * struct peer_memory_client - registration information for user virtual + * memory handlers + * + * The peer_memory_client scheme allows a driver to register with the ib_umem + * system that it has the ability to understand user virtual address ranges + * that are not compatible with get_user_pages(). For instance VMAs created + * with io_remap_pfn_range(), or other driver special VMA. + * + * For ranges the interface understands it can provide a DMA mapped sg_table + * for use by the ib_umem, allowing user virtual ranges that cannot be + * supported by get_user_pages() to be used as umems. 
+ */ +struct peer_memory_client { + char name[IB_PEER_MEMORY_NAME_MAX]; + char version[IB_PEER_MEMORY_VER_MAX]; + + /** + * acquire - Begin working with a user space virtual address range + * + * @addr - Virtual address to be checked whether belongs to peer. + * @size - Length of the virtual memory area starting at addr. + * @peer_mem_private_data - Obsolete, always NULL + * @peer_mem_name - Obsolete, always NULL + * @client_context - Returns an opaque value for this acquire use in + * other APIs + * + * Returns 1 if the peer_memory_client supports the entire virtual + * address range, 0 or -ERRNO otherwise. If 1 is returned then + * release() will be called to release the acquire(). + */ + int (*acquire)(unsigned long addr, size_t size, + void *peer_mem_private_data, char *peer_mem_name, + void **client_context); + /** + * get_pages - Fill in the first part of a sg_table for a virtual + * address range + * + * @addr - Virtual address to be checked whether belongs to peer. + * @size - Length of the virtual memory area starting at addr. + * @write - Always 1 + * @force - 1 if write is required + * @sg_head - Obsolete, always NULL + * @client_context - Value returned by acquire() + * @core_context - Value to be passed to invalidate_peer_memory for + * this get + * + * addr/size are passed as the raw virtual address range requested by + * the user, it is not aligned to any page size. get_pages() is always + * followed by dma_map(). + * + * Upon return the caller can call the invalidate_callback(). + * + * Returns 0 on success, -ERRNO on failure. After success put_pages() + * will be called to return the pages. + */ + int (*get_pages)(unsigned long addr, size_t size, int write, int force, + struct sg_table *sg_head, void *client_context, + u64 core_context); + /** + * dma_map - Create a DMA mapped sg_table + * + * @sg_head - The sg_table to allocate + * @client_context - Value returned by acquire() + * @dma_device - The device that will be doing DMA from these addresses + * @dmasync - Obsolete, always 0 + * @nmap - Returns the number of dma mapped entries in the sg_head + * + * Must be called after get_pages(). This must fill in the sg_head with + * DMA mapped SGLs for dma_device. Each SGL start and end must meet a + * minimum alignment of at least PAGE_SIZE, though individual sgls can + * be multiples of PAGE_SIZE, in any mixture. Since the user virtual + * address/size are not page aligned, the implementation must increase + * it to the logical alignment when building the SGLs. + * + * Returns 0 on success, -ERRNO on failure. After success dma_unmap() + * will be called to unmap the pages. On failure sg_head must be left + * untouched or point to a valid sg_table. + */ + int (*dma_map)(struct sg_table *sg_head, void *client_context, + struct device *dma_device, int dmasync, int *nmap); + /** + * dma_unmap - Unmap a DMA mapped sg_table + * + * @sg_head - The sg_table to unmap + * @client_context - Value returned by acquire() + * @dma_device - The device that will be doing DMA from these addresses + * + * sg_head will not be touched after this function returns. + * + * Must return 0. + */ + int (*dma_unmap)(struct sg_table *sg_head, void *client_context, + struct device *dma_device); + /** + * put_pages - Unpin a SGL + * + * @sg_head - The sg_table to unpin + * @client_context - Value returned by acquire() + * + * sg_head must be freed on return. 
+ */ + void (*put_pages)(struct sg_table *sg_head, void *client_context); + /* Client should always return PAGE_SIZE */ + unsigned long (*get_page_size)(void *client_context); + /** + * release - Undo acquire + * + * @client_context - Value returned by acquire() + * + * If acquire() returns 1 then release() must be called. All + * get_pages() and dma_map()'s must be undone before calling this + * function. + */ + void (*release)(void *client_context); +}; + +enum { + PEER_MEM_INVALIDATE_UNMAPS = 1 << 0, +}; + +struct peer_memory_client_ex { + struct peer_memory_client client; + size_t ex_size; + u32 flags; +}; + +/* + * If invalidate_callback() is non-NULL then the client will only support + * umems which can be invalidated. The caller may call the + * invalidate_callback() after acquire() on return the range will no longer + * have DMA active, and release() will have been called. + * + * Note: The implementation locking must ensure that get_pages(), and + * dma_map() do not have locking dependencies with invalidate_callback(). The + * ib_core will wait until any concurrent get_pages() or dma_map() completes + * before returning. + * + * Similarly, this can call dma_unmap(), put_pages() and release() from within + * the callback, or will wait for another thread doing those operations to + * complete. + * + * For these reasons the user of invalidate_callback() must be careful with + * locking. + */ +typedef int (*invalidate_peer_memory)(void *reg_handle, u64 core_context); + +void * +ib_register_peer_memory_client(const struct peer_memory_client *peer_client, + invalidate_peer_memory *invalidate_callback); +void ib_unregister_peer_memory_client(void *reg_handle); + +#endif |