author    Roland Dreier <roland@purestorage.com>  2012-05-29 12:46:35 -0700
committer Roland Dreier <roland@purestorage.com>  2015-04-02 09:43:50 -0700
commit    ac4233bcd208123030deb121cc33ea03baa78763 (patch)
tree      09f413a7ce0f134e0b649f9640bd3a7a81a8cbda
parent    e42391cd048809d903291d07f86ed3934ce138e9 (diff)
download  infiniband-ummunot.tar.gz

IB/uverbs: Add support for passing memory region invalidations to userspace (ummunot)
As discussed in <http://article.gmane.org/gmane.linux.drivers.openib/61925>
and follow-up messages, libraries using RDMA would like to track precisely
when application code changes memory mapping via free(), munmap(), etc.
Current pure-userspace solutions using malloc hooks and other tricks are
not robust, and the feeling among experts is that the issue is unfixable
without kernel help.

We solve this not by implementing the full API proposed in the email
linked above but rather with a simpler and more generic interface.
Unlike previous attempts at this, which implemented a new generic
character device, this patch works within the existing RDMA userspace
verbs support.  Specifically, we implement three new userspace operations:

 1. A new version of the "register MR" operation, which creates an MR
    for which the kernel will notify userspace when the virtual mapping
    changes.  We need a new operation for this (rather than, say, simply
    adding another flag to the access_flags field of the existing reg_mr
    operation) because we need to extend the ABI to allow userspace to
    pass in a cookie that will be returned as part of invalidation
    events.

 2. A new version of the "deregister MR" operation that returns the
    number of invalidate events passed to userspace.  Now that we
    generate events for MRs, we need this event count in the destroy
    operation to avoid unfixable races in userspace, for exactly
    analogous reasons to the existing destroy CQ, QP and SRQ operations.

 3. A new command to create an MMU notification file descriptor for a
    userspace verbs context.  We require this FD to be created before
    allowing any MRs with invalidation notification to be registered.
    When an invalidation event occurs, the kernel queues an event on
    this FD that userspace can retrieve with read().  We also allow
    userspace to mmap() one page at offset 0 to map a kernel page that
    contains a generation counter that is incremented each time an
    event is queued.  This allows userspace to have a fast path that
    checks that no events have occurred, without needing to do a system
    call.

Thanks to Jason Gunthorpe for suggestions on the interface design.  Also
thanks to Jeff Squyres for prototyping support for this in Open MPI, which
helped find several bugs during development.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
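[Editorial note: the intended userspace flow, sketched below, is not part
of the patch.  The event layout mirrors ib_uverbs_mmu_notify_event_desc
from the uapi header; the notify_fd plumbing and the O_NONBLOCK drain loop
are assumptions for illustration only.]

    /*
     * Editorial sketch: fast-path polling of the generation page.
     * Assumes notify_fd came from the new "create MMU notify channel"
     * command and was set O_NONBLOCK so the drain loop terminates.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    struct mmu_notify_event {       /* mirrors ib_uverbs_mmu_notify_event_desc */
        uint64_t cookie;            /* user_handle passed at registration */
    };

    static volatile uint64_t *gen;  /* kernel generation-counter page */
    static uint64_t last_gen;

    static int map_counter(int notify_fd)
    {
        /* Map the one page at offset 0 that holds the event counter. */
        gen = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
                   MAP_SHARED, notify_fd, 0);
        return gen == MAP_FAILED ? -1 : 0;
    }

    static void poll_invalidations(int notify_fd)
    {
        struct mmu_notify_event ev;

        if (*gen == last_gen)
            return;                 /* fast path: no system call */
        last_gen = *gen;

        /* Slow path: drain queued invalidation events. */
        while (read(notify_fd, &ev, sizeof ev) == sizeof ev)
            printf("MR cookie 0x%llx invalidated\n",
                   (unsigned long long) ev.cookie);
    }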
-rw-r--r--  drivers/infiniband/Kconfig            |   1
-rw-r--r--  drivers/infiniband/core/umem.c        | 141
-rw-r--r--  drivers/infiniband/core/uverbs.h      |  24
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c  | 306
-rw-r--r--  drivers/infiniband/core/uverbs_main.c | 121
-rw-r--r--  include/rdma/ib_umem.h                |  61
-rw-r--r--  include/uapi/rdma/ib_user_verbs.h     |  36
7 files changed, 581 insertions, 109 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index b899531498eb0d..6281d01483d7bf 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -24,6 +24,7 @@ config INFINIBAND_USER_MAD
config INFINIBAND_USER_ACCESS
tristate "InfiniBand userspace access (verbs and CM)"
select ANON_INODES
+ select MMU_NOTIFIER
---help---
Userspace InfiniBand access support. This enables the
kernel side of userspace verbs and the userspace
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index aec7a6aa2951db..89430ecabaf858 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -352,3 +352,144 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
+
+void ib_ummunotify_register_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range)
+{
+ struct ib_ummunotify_range *trange;
+ struct rb_node **n = &context->reg_tree.rb_node;
+ struct rb_node *pn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&context->lock, flags);
+
+ pn = NULL;
+ while (*n) {
+ pn = *n;
+ trange = rb_entry(pn, struct ib_ummunotify_range, node);
+
+ if (range->start <= trange->start)
+ n = &pn->rb_left;
+ else
+ n = &pn->rb_right;
+ }
+
+ rb_link_node(&range->node, pn, n);
+ rb_insert_color(&range->node, &context->reg_tree);
+
+ spin_unlock_irqrestore(&context->lock, flags);
+}
+EXPORT_SYMBOL(ib_ummunotify_register_range);
+
+void ib_ummunotify_unregister_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range)
+{
+ unsigned long flags;
+
+ if (!ib_ummunotify_context_used(context))
+ return;
+
+ if (RB_EMPTY_NODE(&range->node))
+ return;
+
+ spin_lock_irqsave(&context->lock, flags);
+ rb_erase(&range->node, &context->reg_tree);
+ spin_unlock_irqrestore(&context->lock, flags);
+}
+EXPORT_SYMBOL(ib_ummunotify_unregister_range);
+
+static void ib_ummunotify_handle_notify(struct mmu_notifier *mn,
+ unsigned long start, unsigned long end)
+{
+ struct ib_ummunotify_context *context =
+ container_of(mn, struct ib_ummunotify_context, mmu_notifier);
+ unsigned long flags;
+ struct rb_node *n;
+ struct ib_ummunotify_range *range;
+
+ spin_lock_irqsave(&context->lock, flags);
+
+ for (n = rb_first(&context->reg_tree); n; n = rb_next(n)) {
+ range = rb_entry(n, struct ib_ummunotify_range, node);
+
+ /*
+ * Ranges overlap if they're not disjoint; and they're
+ * disjoint if the end of one is before the start of
+ * the other one. So if both disjointness comparisons
+ * fail then the ranges overlap.
+ *
+ * Since we keep the tree of regions we're watching
+ * sorted by start address, we can end this loop as
+ * soon as we hit a region that starts past the end of
+ * the range for the event we're handling.
+ */
+ if (range->start >= end)
+ break;
+
+ /*
+ * Just go to the next region if the start of the
+ * range is after the end of the region -- there
+ * might still be more overlapping ranges that have a
+ * greater start.
+ */
+ if (start >= range->end)
+ continue;
+
+ context->callback(context, range);
+ }
+
+ spin_unlock_irqrestore(&context->lock, flags);
+}
+
+static void ib_ummunotify_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long addr)
+{
+ ib_ummunotify_handle_notify(mn, addr, addr + PAGE_SIZE);
+}
+
+static void ib_ummunotify_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ ib_ummunotify_handle_notify(mn, start, end);
+}
+
+static const struct mmu_notifier_ops ib_ummunotify_mmu_notifier_ops = {
+ .invalidate_page = ib_ummunotify_invalidate_page,
+ .invalidate_range_start = ib_ummunotify_invalidate_range_start,
+};
+
+int ib_ummunotify_init_context(struct ib_ummunotify_context *context,
+ void (*callback)(struct ib_ummunotify_context *,
+ struct ib_ummunotify_range *))
+{
+ int ret;
+
+ context->callback = callback;
+ context->reg_tree = RB_ROOT;
+ spin_lock_init(&context->lock);
+
+ context->mm = current->mm;
+ atomic_inc(&current->mm->mm_count);
+
+ context->mmu_notifier.ops = &ib_ummunotify_mmu_notifier_ops;
+ ret = mmu_notifier_register(&context->mmu_notifier, context->mm);
+ if (ret) {
+ mmdrop(context->mm);
+ context->mm = NULL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_ummunotify_init_context);
+
+void ib_ummunotify_cleanup_context(struct ib_ummunotify_context *context)
+{
+ if (!ib_ummunotify_context_used(context))
+ return;
+ mmu_notifier_unregister(&context->mmu_notifier, context->mm);
+ mmdrop(context->mm);
+}
+EXPORT_SYMBOL(ib_ummunotify_cleanup_context);
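[Editorial note: a minimal sketch of how a kernel-side consumer would use
the ib_ummunotify API added above -- the real consumer is uverbs, in the
files below.  The my_* names are hypothetical.]

    /* The callback runs under the context spinlock, from the MMU
     * notifier; it must not sleep. */
    static void my_invalidate_cb(struct ib_ummunotify_context *ctx,
                                 struct ib_ummunotify_range *range)
    {
        pr_info("range [%lx, %lx) invalidated\n",
                range->start, range->end);
    }

    static int my_watch(struct ib_ummunotify_context *ctx,
                        struct ib_ummunotify_range *r,
                        unsigned long start, unsigned long len)
    {
        int ret;

        ret = ib_ummunotify_init_context(ctx, my_invalidate_cb);
        if (ret)
            return ret;

        r->start = start;
        r->end   = start + len;
        ib_ummunotify_register_range(ctx, r);   /* watch this VA range */
        return 0;
    }

    static void my_unwatch(struct ib_ummunotify_context *ctx,
                           struct ib_ummunotify_range *r)
    {
        ib_ummunotify_unregister_range(ctx, r);
        ib_ummunotify_cleanup_context(ctx);
    }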
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b716b08156446e..dbb2b7f12b0e38 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -96,9 +96,15 @@ struct ib_uverbs_device {
struct mutex xrcd_tree_mutex;
};
+enum ib_uverbs_event_file_type {
+ IB_UVERBS_EVENT_FILE_ASYNC,
+ IB_UVERBS_EVENT_FILE_COMP,
+ IB_UVERBS_EVENT_FILE_MMU_NOTIFY,
+};
+
struct ib_uverbs_event_file {
struct kref ref;
- int is_async;
+ enum ib_uverbs_event_file_type type;
struct ib_uverbs_file *uverbs_file;
spinlock_t lock;
int is_closed;
@@ -113,13 +119,17 @@ struct ib_uverbs_file {
struct ib_uverbs_device *device;
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
+ struct ib_ummunotify_context mmu_notify_context;
+ u64 *mmu_notify_counter;
struct ib_uverbs_event_file *async_file;
+ struct ib_uverbs_event_file *mmu_notify_file;
};
struct ib_uverbs_event {
union {
struct ib_uverbs_async_event_desc async;
struct ib_uverbs_comp_event_desc comp;
+ struct ib_uverbs_mmu_notify_event_desc mmu_notify;
} desc;
struct list_head list;
struct list_head obj_list;
@@ -148,6 +158,11 @@ struct ib_usrq_object {
struct ib_uxrcd_object *uxrcd;
};
+struct ib_umr_object {
+ struct ib_uevent_object uevent;
+ struct ib_ummunotify_range range;
+};
+
struct ib_uqp_object {
struct ib_uevent_object uevent;
struct list_head mcast_list;
@@ -177,7 +192,7 @@ extern struct idr ib_uverbs_rule_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
- int is_async);
+ enum ib_uverbs_event_file_type type);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -187,6 +202,8 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
struct ib_uevent_object *uobj);
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_mr_event_handler(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range);
void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
@@ -250,6 +267,9 @@ IB_UVERBS_DECLARE_CMD(destroy_srq);
IB_UVERBS_DECLARE_CMD(create_xsrq);
IB_UVERBS_DECLARE_CMD(open_xrcd);
IB_UVERBS_DECLARE_CMD(close_xrcd);
+IB_UVERBS_DECLARE_CMD(create_mmu_notify_channel);
+IB_UVERBS_DECLARE_CMD(reg_mmu_notify_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mmu_notify_mr);
#define IB_UVERBS_DECLARE_EX_CMD(name) \
int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a9f048990dfcd8..e3707de3e1b90b 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_free;
resp.async_fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 1);
+ filp = ib_uverbs_alloc_event_file(file, IB_UVERBS_EVENT_FILE_ASYNC);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
@@ -933,49 +933,38 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
xrcd_table_delete(dev, inode);
}
-ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static ssize_t ib_uverbs_reg_mr_common(struct ib_uverbs_file *file,
+ struct ib_uverbs_reg_mmu_notify_mr *cmd,
+ struct ib_uverbs_reg_mr_resp *resp,
+ struct ib_udata *udata,
+ bool do_notify)
{
- struct ib_uverbs_reg_mr cmd;
- struct ib_uverbs_reg_mr_resp resp;
- struct ib_udata udata;
- struct ib_uobject *uobj;
- struct ib_pd *pd;
- struct ib_mr *mr;
- int ret;
-
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ struct ib_umr_object *obj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
- if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ if ((cmd->start & ~PAGE_MASK) != (cmd->hca_va & ~PAGE_MASK))
return -EINVAL;
- ret = ib_check_mr_access(cmd.access_flags);
+ ret = ib_check_mr_access(cmd->access_flags);
if (ret)
return ret;
- uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
- if (!uobj)
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
return -ENOMEM;
- init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
- down_write(&uobj->mutex);
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &mr_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
goto err_free;
}
- if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+ if (cmd->access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_device_attr attr;
ret = ib_query_device(pd->device, &attr);
@@ -987,8 +976,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
}
}
- mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags, &udata);
+ mr = pd->device->reg_user_mr(pd, cmd->start, cmd->length, cmd->hca_va,
+ cmd->access_flags, udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
goto err_put;
@@ -996,22 +985,22 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
mr->device = pd->device;
mr->pd = pd;
- mr->uobject = uobj;
+ mr->uobject = &obj->uevent.uobject;
atomic_inc(&pd->usecnt);
atomic_set(&mr->usecnt, 0);
- uobj->object = mr;
- ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ obj->uevent.uobject.object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, &obj->uevent.uobject);
if (ret)
goto err_unreg;
- memset(&resp, 0, sizeof resp);
+ memset(resp, 0, sizeof *resp);
- resp.lkey = mr->lkey;
- resp.rkey = mr->rkey;
- resp.mr_handle = uobj->id;
+ resp->lkey = mr->lkey;
+ resp->rkey = mr->rkey;
+ resp->mr_handle = obj->uevent.uobject.id;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
+ if (copy_to_user((void __user *) (unsigned long) cmd->response,
+ resp, sizeof *resp)) {
ret = -EFAULT;
goto err_copy;
}
@@ -1019,17 +1008,23 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
put_pd_read(pd);
mutex_lock(&file->mutex);
- list_add_tail(&uobj->list, &file->ucontext->mr_list);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->mr_list);
mutex_unlock(&file->mutex);
- uobj->live = 1;
+ obj->uevent.uobject.live = 1;
- up_write(&uobj->mutex);
+ if (do_notify)
+ ib_ummunotify_register_range(&file->mmu_notify_context,
+ &obj->range);
+ else
+ ib_ummunotify_clear_range(&obj->range);
- return in_len;
+ up_write(&obj->uevent.uobject.mutex);
+
+ return 0;
err_copy:
- idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+ idr_remove_uobj(&ib_uverbs_mr_idr, &obj->uevent.uobject);
err_unreg:
ib_dereg_mr(mr);
@@ -1038,7 +1033,7 @@ err_put:
put_pd_read(pd);
err_free:
- put_uobj_write(uobj);
+ put_uobj_write(&obj->uevent.uobject);
return ret;
}
@@ -1135,23 +1130,79 @@ put_uobjs:
return ret;
}
-ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
{
- struct ib_uverbs_dereg_mr cmd;
- struct ib_mr *mr;
- struct ib_uobject *uobj;
- int ret = -EINVAL;
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uverbs_reg_mmu_notify_mr not_cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ not_cmd.response = cmd.response;
+ not_cmd.user_handle = 0;
+ not_cmd.start = cmd.start;
+ not_cmd.length = cmd.length;
+ not_cmd.hca_va = cmd.hca_va;
+ not_cmd.pd_handle = cmd.pd_handle;
+ not_cmd.access_flags = cmd.access_flags;
+
+ ret = ib_uverbs_reg_mr_common(file, &not_cmd, &resp, &udata, false);
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_reg_mmu_notify_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_mmu_notify_mr cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (!ib_ummunotify_context_used(&file->mmu_notify_context))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ret = ib_uverbs_reg_mr_common(file, &cmd, &resp, &udata, true);
+ return ret ? ret : in_len;
+}
+
+static ssize_t ib_uverbs_dereg_mr_common(struct ib_uverbs_file *file,
+ int mr_handle,
+ u32 *events_reported)
+{
+ struct ib_uobject *uobj;
+ struct ib_mr *mr;
+ struct ib_umr_object *obj;
+ int ret;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, mr_handle, file->ucontext);
if (!uobj)
return -EINVAL;
mr = uobj->object;
+ obj = container_of(uobj, struct ib_umr_object, uevent.uobject);
ret = ib_dereg_mr(mr);
if (!ret)
@@ -1162,15 +1213,60 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_ummunotify_unregister_range(&file->mmu_notify_context,
+ &obj->range);
+
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
mutex_lock(&file->mutex);
list_del(&uobj->list);
mutex_unlock(&file->mutex);
+ ib_uverbs_release_uevent(file, &obj->uevent);
+
+ if (events_reported)
+ *events_reported = obj->uevent.events_reported;
+
put_uobj(uobj);
- return in_len;
+ return 0;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dereg_mr cmd;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_uverbs_dereg_mr_common(file, cmd.mr_handle, NULL);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_dereg_mmu_notify_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dereg_mmu_notify_mr cmd;
+ struct ib_uverbs_dereg_mmu_notify_mr_resp resp;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_uverbs_dereg_mr_common(file, cmd.mr_handle,
+ &resp.events_reported);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
}
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
@@ -1313,7 +1409,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return ret;
resp.fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 0);
+ filp = ib_uverbs_alloc_event_file(file, IB_UVERBS_EVENT_FILE_COMP);
if (IS_ERR(filp)) {
put_unused_fd(resp.fd);
return PTR_ERR(filp);
@@ -3295,63 +3391,73 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
return ret ? ret : in_len;
}
-int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+ssize_t ib_uverbs_create_mmu_notify_channel(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
{
- struct ib_uverbs_ex_query_device_resp resp;
- struct ib_uverbs_ex_query_device cmd;
- struct ib_device_attr attr;
- struct ib_device *device;
- int err;
+ struct ib_uverbs_create_mmu_notify_channel cmd;
+ struct ib_uverbs_create_mmu_notify_channel_resp resp;
+ struct file *filp;
+ int ret;
- device = file->device->ib_dev;
- if (ucore->inlen < sizeof(cmd))
- return -EINVAL;
+ if (out_len < sizeof resp)
+ return -ENOSPC;
- err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
- if (err)
- return err;
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
- if (cmd.comp_mask)
- return -EINVAL;
+ mutex_lock(&file->mutex);
- if (cmd.reserved)
- return -EINVAL;
+ if (file->mmu_notify_file) {
+ ret = -EINVAL;
+ goto err;
+ }
- resp.response_length = offsetof(typeof(resp), odp_caps);
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ goto err;
+ resp.fd = ret;
- if (ucore->outlen < resp.response_length)
- return -ENOSPC;
+ filp = ib_uverbs_alloc_event_file(file, IB_UVERBS_EVENT_FILE_MMU_NOTIFY);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_put_fd;
+ }
- err = device->query_device(device, &attr);
- if (err)
- return err;
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_fput;
+ }
- copy_query_dev_fields(file, &resp.base, &attr);
- resp.comp_mask = 0;
+ ret = ib_ummunotify_init_context(&file->mmu_notify_context,
+ ib_uverbs_mr_event_handler);
+ if (ret)
+ goto err_fput;
- if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
- goto end;
+ file->mmu_notify_counter = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!file->mmu_notify_counter) {
+ ret = -ENOMEM;
+ goto err_context;
+ }
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- resp.odp_caps.general_caps = attr.odp_caps.general_caps;
- resp.odp_caps.per_transport_caps.rc_odp_caps =
- attr.odp_caps.per_transport_caps.rc_odp_caps;
- resp.odp_caps.per_transport_caps.uc_odp_caps =
- attr.odp_caps.per_transport_caps.uc_odp_caps;
- resp.odp_caps.per_transport_caps.ud_odp_caps =
- attr.odp_caps.per_transport_caps.ud_odp_caps;
- resp.odp_caps.reserved = 0;
-#else
- memset(&resp.odp_caps, 0, sizeof(resp.odp_caps));
-#endif
- resp.response_length += sizeof(resp.odp_caps);
+ file->mmu_notify_file = filp->private_data;
+ fd_install(resp.fd, filp);
-end:
- err = ib_copy_to_udata(ucore, &resp, resp.response_length);
- if (err)
- return err;
+ mutex_unlock(&file->mutex);
- return 0;
+ return in_len;
+
+err_context:
+ ib_ummunotify_cleanup_context(&file->mmu_notify_context);
+
+err_fput:
+ fput(filp);
+
+err_put_fd:
+ put_unused_fd(resp.fd);
+
+err:
+ mutex_unlock(&file->mutex);
+ return ret;
}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 259dcc7779f5e0..1aae7ed09d2f8f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -116,6 +116,9 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
[IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
[IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
[IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
+ [IB_USER_VERBS_CMD_CREATE_MMU_NOTIFY_CHANNEL] = ib_uverbs_create_mmu_notify_channel,
+ [IB_USER_VERBS_CMD_REG_MMU_NOTIFY_MR] = ib_uverbs_reg_mmu_notify_mr,
+ [IB_USER_VERBS_CMD_DEREG_MMU_NOTIFY_MR] = ib_uverbs_dereg_mmu_notify_mr,
};
static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
@@ -271,9 +274,15 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
struct ib_mr *mr = uobj->object;
+ struct ib_umr_object *umr =
+ container_of(uobj, struct ib_umr_object, uevent.uobject);
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ib_ummunotify_context_used(&file->mmu_notify_context))
+ ib_ummunotify_unregister_range(&file->mmu_notify_context,
+ &umr->range);
ib_dereg_mr(mr);
+ ib_uverbs_release_uevent(file, &umr->uevent);
kfree(uobj);
}
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
}
put_pid(context->tgid);
+ ib_ummunotify_cleanup_context(&file->mmu_notify_context);
+ free_page((unsigned long) file->mmu_notify_counter);
return context->device->dealloc_ucontext(context);
}
@@ -318,7 +329,7 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
{
struct ib_uverbs_event_file *file = filp->private_data;
struct ib_uverbs_event *event;
- int eventsz;
+ int uninitialized_var(eventsz);
int ret = 0;
spin_lock_irq(&file->lock);
@@ -338,10 +349,17 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
- if (file->is_async)
+ switch (file->type) {
+ case IB_UVERBS_EVENT_FILE_ASYNC:
eventsz = sizeof (struct ib_uverbs_async_event_desc);
- else
+ break;
+ case IB_UVERBS_EVENT_FILE_COMP:
eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+ break;
+ case IB_UVERBS_EVENT_FILE_MMU_NOTIFY:
+ eventsz = sizeof (struct ib_uverbs_mmu_notify_event_desc);
+ break;
+ }
if (eventsz > count) {
ret = -EINVAL;
@@ -368,6 +386,37 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
return ret;
}
+static int uverbs_mmu_notify_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct ib_uverbs_file *file = vma->vm_private_data;
+
+ if (vmf->pgoff != 0)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(file->mmu_notify_counter);
+ get_page(vmf->page);
+
+ return 0;
+}
+
+static const struct vm_operations_struct uverbs_mmu_notify_vm_ops = {
+ .fault = uverbs_mmu_notify_fault,
+};
+
+static int ib_uverbs_event_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ib_uverbs_event_file *ev_file = filp->private_data;
+ struct ib_uverbs_file *file = ev_file->uverbs_file;
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ vma->vm_ops = &uverbs_mmu_notify_vm_ops;
+ vma->vm_private_data = file;
+
+ return 0;
+}
+
static unsigned int ib_uverbs_event_poll(struct file *filp,
struct poll_table_struct *wait)
{
@@ -405,10 +454,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
}
spin_unlock_irq(&file->lock);
- if (file->is_async) {
+ if (file->type == IB_UVERBS_EVENT_FILE_ASYNC) {
ib_unregister_event_handler(&file->uverbs_file->event_handler);
kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
}
+
+ if (file->type == IB_UVERBS_EVENT_FILE_MMU_NOTIFY) {
+ /* XXX */
+ }
+
kref_put(&file->ref, ib_uverbs_release_event_file);
return 0;
@@ -423,6 +477,16 @@ static const struct file_operations uverbs_event_fops = {
.llseek = no_llseek,
};
+static const struct file_operations uverbs_event_mmap_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_event_read,
+ .mmap = ib_uverbs_event_mmap,
+ .poll = ib_uverbs_event_poll,
+ .release = ib_uverbs_event_close,
+ .fasync = ib_uverbs_event_fasync,
+ .llseek = no_llseek,
+};
+
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
{
struct ib_uverbs_event_file *file = cq_context;
@@ -458,6 +522,47 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
kill_fasync(&file->async_queue, SIGIO, POLL_IN);
}
+void ib_uverbs_mr_event_handler(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range)
+{
+ struct ib_uverbs_event_file *file =
+ container_of(context, struct ib_uverbs_file,
+ mmu_notify_context)->mmu_notify_file;
+ struct ib_umr_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!file)
+ return;
+
+ spin_lock_irqsave(&file->lock, flags);
+ if (file->is_closed) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ uobj = container_of(range, struct ib_umr_object, range);
+
+ entry->desc.mmu_notify.cq_handle = uobj->uevent.uobject.user_handle;
+ entry->counter = &uobj->uevent.events_reported;
+
+ list_add_tail(&entry->list, &file->event_list);
+ list_add_tail(&entry->obj_list, &uobj->uevent.event_list);
+
+ ++(*file->uverbs_file->mmu_notify_counter);
+
+ spin_unlock_irqrestore(&file->lock, flags);
+
+ wake_up_interruptible(&file->poll_wait);
+ kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
__u64 element, __u64 event,
struct list_head *obj_list,
@@ -541,7 +646,7 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
}
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
- int is_async)
+ enum ib_uverbs_event_file_type type)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
@@ -556,7 +661,7 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
ev_file->async_queue = NULL;
- ev_file->is_async = is_async;
+ ev_file->type = type;
ev_file->is_closed = 0;
- filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
+ filp = anon_inode_getfile("[infinibandevent]",
+ type == IB_UVERBS_EVENT_FILE_MMU_NOTIFY ?
+ &uverbs_event_mmap_fops : &uverbs_event_fops,
@@ -584,7 +689,7 @@ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
goto out;
ev_file = f.file->private_data;
- if (ev_file->is_async) {
+ if (ev_file->type != IB_UVERBS_EVENT_FILE_COMP) {
ev_file = NULL;
goto out;
}
@@ -763,6 +868,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
file->async_file = NULL;
kref_init(&file->ref);
mutex_init(&file->mutex);
+ ib_ummunotify_clear_context(&file->mmu_notify_context);
+ file->mmu_notify_counter = NULL;
filp->private_data = file;
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 2d83cfd7e6ce20..b2e1c0808056d4 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -34,6 +34,8 @@
#define IB_UMEM_H
#include <linux/list.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rbtree.h>
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
@@ -80,8 +82,23 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem)
return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
}
+struct ib_ummunotify_range {
+ unsigned long start;
+ unsigned long end;
+ struct rb_node node;
+};
+
#ifdef CONFIG_INFINIBAND_USER_MEM
+struct ib_ummunotify_context {
+ struct mmu_notifier mmu_notifier;
+ void (*callback)(struct ib_ummunotify_context *,
+ struct ib_ummunotify_range *);
+ struct mm_struct *mm;
+ struct rb_root reg_tree;
+ spinlock_t lock;
+};
+
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
size_t size, int access, int dmasync);
void ib_umem_release(struct ib_umem *umem);
@@ -89,10 +106,37 @@ int ib_umem_page_count(struct ib_umem *umem);
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
size_t length);
+void ib_ummunotify_register_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range);
+void ib_ummunotify_unregister_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range);
+
+int ib_ummunotify_init_context(struct ib_ummunotify_context *context,
+ void (*callback)(struct ib_ummunotify_context *,
+ struct ib_ummunotify_range *));
+void ib_ummunotify_cleanup_context(struct ib_ummunotify_context *context);
+
+static inline void ib_ummunotify_clear_range(struct ib_ummunotify_range *range)
+{
+ RB_CLEAR_NODE(&range->node);
+}
+
+static inline void ib_ummunotify_clear_context(struct ib_ummunotify_context *context)
+{
+ context->mm = NULL;
+}
+
+static inline int ib_ummunotify_context_used(struct ib_ummunotify_context *context)
+{
+ return !!context->mm;
+}
+
#else /* CONFIG_INFINIBAND_USER_MEM */
#include <linux/err.h>
+struct ib_ummunotify_context;
+
static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
unsigned long addr, size_t size,
int access, int dmasync) {
@@ -104,6 +148,23 @@ static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offs
size_t length) {
return -EINVAL;
}
+
+static inline void ib_ummunotify_register_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range) { }
+static inline void ib_ummunotify_unregister_range(struct ib_ummunotify_context *context,
+ struct ib_ummunotify_range *range) { }
+
+static inline int ib_ummunotify_init_context(struct ib_ummunotify_context *context,
+ void (*callback)(struct ib_ummunotify_context *,
+ struct ib_ummunotify_range *)) { return 0; }
+static inline void ib_ummunotify_cleanup_context(struct ib_ummunotify_context *context) { }
+
+static inline void ib_ummunotify_clear_range(struct ib_ummunotify_range *range) { }
+
+static inline void ib_ummunotify_clear_context(struct ib_ummunotify_context *context) { }
+
+static inline int ib_ummunotify_context_used(struct ib_ummunotify_context *context) { return 0; }
+
#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index b513e662d8e499..e3c7f162f97b1f 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -87,6 +87,9 @@ enum {
IB_USER_VERBS_CMD_CLOSE_XRCD,
IB_USER_VERBS_CMD_CREATE_XSRQ,
IB_USER_VERBS_CMD_OPEN_QP,
+ IB_USER_VERBS_CMD_CREATE_MMU_NOTIFY_CHANNEL,
+ IB_USER_VERBS_CMD_REG_MMU_NOTIFY_MR,
+ IB_USER_VERBS_CMD_DEREG_MMU_NOTIFY_MR,
};
enum {
@@ -116,6 +119,10 @@ struct ib_uverbs_comp_event_desc {
__u64 cq_handle;
};
+struct ib_uverbs_mmu_notify_event_desc {
+ __u64 cq_handle;
+};
+
/*
* All commands from userspace should start with a __u32 command field
* followed by __u16 in_words and out_words fields (which give the
@@ -900,4 +907,33 @@ struct ib_uverbs_destroy_srq_resp {
__u32 events_reported;
};
+struct ib_uverbs_create_mmu_notify_channel {
+ __u64 response;
+};
+
+struct ib_uverbs_create_mmu_notify_channel_resp {
+ __u32 fd;
+};
+
+struct ib_uverbs_reg_mmu_notify_mr {
+ __u64 response;
+ __u64 user_handle;
+ __u64 start;
+ __u64 length;
+ __u64 hca_va;
+ __u32 pd_handle;
+ __u32 access_flags;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_dereg_mmu_notify_mr {
+ __u64 response;
+ __u32 mr_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_dereg_mmu_notify_mr_resp {
+ __u32 events_reported;
+};
+
#endif /* IB_USER_VERBS_H */
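[Editorial note: for completeness, a rough sketch of how userspace would
issue the new channel-creation command through the existing verbs write()
ABI described above ("All commands from userspace should start with a
__u32 command field followed by __u16 in_words and out_words fields").
The header layout mirrors ib_uverbs_cmd_hdr; the command value is passed
in as a parameter rather than hard-coding an enum value that depends on
the kernel.]

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    struct cmd_hdr {                /* mirrors ib_uverbs_cmd_hdr */
        uint32_t command;
        uint16_t in_words;          /* 4-byte words, including header */
        uint16_t out_words;         /* 4-byte words of response */
    };

    int create_mmu_notify_channel(int uverbs_fd, uint32_t cmd_val)
    {
        struct {
            struct cmd_hdr hdr;
            uint64_t response;      /* ib_uverbs_create_mmu_notify_channel */
        } cmd;
        struct {
            uint32_t fd;            /* ..._channel_resp */
        } resp;

        memset(&cmd, 0, sizeof cmd);
        cmd.hdr.command   = cmd_val;    /* ..._CREATE_MMU_NOTIFY_CHANNEL */
        cmd.hdr.in_words  = sizeof cmd / 4;
        cmd.hdr.out_words = sizeof resp / 4;
        cmd.response      = (uintptr_t) &resp;

        if (write(uverbs_fd, &cmd, sizeof cmd) != sizeof cmd)
            return -1;

        return resp.fd;     /* read() events; mmap() offset 0 for counter */
    }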