aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMathieu Desnoyers <mathieu.desnoyers@efficios.com>2024-02-29 17:24:33 -0500
committerMathieu Desnoyers <mathieu.desnoyers@efficios.com>2024-03-01 20:32:11 -0500
commit367e559c27a9c9637ff26c9531fd44ef22991aea (patch)
tree2aa028dc936e29749e98132c36fd6366e298f34e
parentef6695f10502099b68005a8248a0c90a3323a0e6 (diff)
downloadlibrseq-367e559c27a9c9637ff26c9531fd44ef22991aea.tar.gz
rseq percpu alloc: Implement numa support
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Change-Id: I732b632f476ffef362a1ab486bcf425e4ded6644
-rw-r--r--configure.ac25
-rw-r--r--include/rseq/percpu-alloc.h3
-rw-r--r--src/Makefile.am4
-rw-r--r--src/rseq-percpu-alloc.c130
4 files changed, 123 insertions, 39 deletions
diff --git a/configure.ac b/configure.ac
index ac0883f..4f58d02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -196,11 +196,32 @@ PKG_CHECK_MODULES([SECCOMP], [libseccomp],
## Optional features selection ##
## ##
+# Enabled by default
+AE_FEATURE_DEFAULT_ENABLE
+AE_FEATURE([numa],[disable NUMA support])
+
# When given, add -Werror to WARN_CFLAGS and WARN_CXXFLAGS.
# Disabled by default
AE_FEATURE_DEFAULT_DISABLE
AE_FEATURE([Werror], [Treat compiler warnings as errors.])
+## ##
+## Check for optional features dependencies ##
+## ##
+
+# The numa integration requires libnuma
+AE_IF_FEATURE_ENABLED([numa], [
+ AC_CHECK_LIB([numa], [numa_available], [
+ AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if libnuma is available.])
+ ], [
+ AC_MSG_ERROR([dnl
+libnuma is not available. Please either install it (e.g. libnuma-dev) or use
+[LDFLAGS]=-Ldir to specify the right location, or use --disable-numa configure
+argument to disable NUMA support.
+ ])
+ ])
+])
+
## ##
## Set automake variables for optional feature conditionnals in Makefile.am ##
@@ -208,7 +229,7 @@ AE_FEATURE([Werror], [Treat compiler warnings as errors.])
AM_CONDITIONAL([ENABLE_SHARED], [test "x${enable_shared}" = "xyes"])
AM_CONDITIONAL([ENABLE_SECCOMP], [test "x${have_seccomp}" = "xyes"])
-
+AM_CONDITIONAL([ENABLE_NUMA], AE_IS_FEATURE_ENABLED([numa]))
## ##
## Substitute variables for use in Makefile.am ##
@@ -264,6 +285,8 @@ AS_ECHO
PPRINT_SUBTITLE([Features])
PPRINT_PROP_STRING([Target architecture], $host_cpu)
+AE_IS_FEATURE_ENABLED([numa]) && value=1 || value=0
+PPRINT_PROP_BOOL([NUMA], $value)
report_bindir="`eval eval echo $bindir`"
report_libdir="`eval eval echo $libdir`"
diff --git a/include/rseq/percpu-alloc.h b/include/rseq/percpu-alloc.h
index c1ea96a..546f5c0 100644
--- a/include/rseq/percpu-alloc.h
+++ b/include/rseq/percpu-alloc.h
@@ -15,7 +15,8 @@ struct rseq_percpu_pool;
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
size_t percpu_len, int max_nr_cpus,
- int prot, int flags, int fd, off_t offset);
+ int mmap_prot, int mmap_flags, int mmap_fd, off_t mmap_offset,
+ int numa_flags);
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool);
void *rseq_percpu_malloc(struct rseq_percpu_pool *pool);
diff --git a/src/Makefile.am b/src/Makefile.am
index c9e134c..9ef6cfb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -9,5 +9,9 @@ librseq_la_SOURCES = \
librseq_la_LDFLAGS = -no-undefined -version-info $(RSEQ_LIBRARY_VERSION)
librseq_la_LIBADD = $(DL_LIBS)
+if ENABLE_NUMA
+librseq_la_LIBADD += -lnuma
+endif
+
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = librseq.pc
diff --git a/src/rseq-percpu-alloc.c b/src/rseq-percpu-alloc.c
index 12f8b2b..3b48d51 100644
--- a/src/rseq-percpu-alloc.c
+++ b/src/rseq-percpu-alloc.c
@@ -12,6 +12,12 @@
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
+#include <stdio.h>
+
+#ifdef HAVE_LIBNUMA
+# include <numa.h>
+# include <numaif.h>
+#endif
/*
* rseq-percpu-alloc.c: rseq per-cpu memory allocator.
@@ -176,15 +182,96 @@ int get_count_order_ulong(unsigned long x)
return fls_ulong(x - 1);
}
+/*
+ * Return the page size reported by sysconf(_SC_PAGE_SIZE), or
+ * DEFAULT_PAGE_SIZE when sysconf() returns a negative value.
+ */
+static
+long rseq_get_page_len(void)
+{
+	long len = sysconf(_SC_PAGE_SIZE);
+
+	return (len < 0) ? DEFAULT_PAGE_SIZE : len;
+}
+
+/*
+ * Return the address located @item_offset bytes into the range of @cpu:
+ * the pool base, plus @cpu strides of pool->percpu_len bytes, plus
+ * @item_offset.
+ */
+static
+void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
+{
+	void *cpu_base = pool->base + (pool->percpu_len * cpu);
+
+	return cpu_base + item_offset;
+}
+
+/*
+ * Decode an encoded per-cpu pointer and return the matching address for
+ * @cpu. The bits selected by POOL_MASK index the rseq_percpu_pool array;
+ * the value shifted right by OFFSET_SHIFT is the item offset within the
+ * cpu range. @cpu must be non-negative (asserted).
+ */
+void *__rseq_percpu_ptr(void *_ptr, int cpu)
+{
+	uintptr_t encoded = (uintptr_t) _ptr;
+	struct rseq_percpu_pool *pool = &rseq_percpu_pool[encoded & POOL_MASK];
+
+	assert(cpu >= 0);
+	return __rseq_pool_percpu_ptr(pool, cpu, encoded >> OFFSET_SHIFT);
+}
+
+/*
+ * Zero pool->item_len bytes at @item_offset in the range of each of the
+ * pool->max_nr_cpus cpus.
+ */
+static
+void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
+{
+	int cpu = 0;
+
+	while (cpu < pool->max_nr_cpus) {
+		memset(__rseq_pool_percpu_ptr(pool, cpu, item_offset), 0, pool->item_len);
+		cpu++;
+	}
+}
+
+#ifdef HAVE_LIBNUMA
+/*
+ * Ask the kernel to move every page of every cpu range of @pool to the
+ * NUMA node that numa_node_of_cpu() reports for that cpu.
+ *
+ * @numa_flags is handed unchanged to move_pages(2); when it is 0 the
+ * function returns without touching anything.
+ *
+ * This function reads pool->base, pool->percpu_len and
+ * pool->max_nr_cpus, so they must hold their final values on entry.
+ * NOTE(review): rseq_percpu_pool_create() currently calls this before
+ * it assigns pool->base and pool->percpu_len -- confirm and reorder.
+ *
+ * Error policy: a failing numa_node_of_cpu() or a negative return from
+ * move_pages() prints the errno message and aborts. A positive return
+ * from move_pages() (page left where it was) does not set errno, so it
+ * is reported with the per-page status and the loop carries on.
+ */
+static
+void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
+		int numa_flags)
+{
+	unsigned long nr_pages, page;
+	long ret, page_len;
+	int cpu;
+
+	if (!numa_flags)
+		return;
+	page_len = rseq_get_page_len();
+	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
+	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
+		int node = numa_node_of_cpu(cpu);
+
+		/* Do not feed an invalid node number to move_pages(). */
+		if (node < 0) {
+			perror("numa_node_of_cpu");
+			abort();
+		}
+
+		/* TODO: batch move_pages() call with an array of pages. */
+		for (page = 0; page < nr_pages; page++) {
+			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
+			int status = -EPERM;
+
+			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
+			if (ret < 0) {
+				/* Only a negative return sets errno. */
+				perror("move_pages");
+				abort();
+			}
+			if (ret > 0) {
+				/* Page not migrated; the reason is in status. */
+				fprintf(stderr,
+					"move_pages: page %p not migrated to node %d (status %d)\n",
+					pageptr, node, status);
+			}
+		}
+	}
+}
+#else
+/* Built without libnuma: NUMA placement is a no-op. */
+static
+void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
+		int numa_flags __attribute__((unused)))
+{
+}
+#endif
+
+/*
+ * Expected numa_flags:
+ * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
+ * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
+ * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
+ */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
size_t percpu_len, int max_nr_cpus,
- int prot, int flags, int fd, off_t offset)
+ int mmap_prot, int mmap_flags, int mmap_fd,
+ off_t mmap_offset, int numa_flags)
{
struct rseq_percpu_pool *pool;
void *base;
unsigned int i;
int order;
- long page_len;
/* Make sure each item is large enough to contain free list pointers. */
if (item_len < sizeof(void *))
@@ -199,10 +286,7 @@ struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
item_len = 1UL << order;
/* Align percpu_len on page size. */
- page_len = sysconf(_SC_PAGE_SIZE);
- if (page_len < 0)
- page_len = DEFAULT_PAGE_SIZE;
- percpu_len = rseq_align(percpu_len, page_len);
+ percpu_len = rseq_align(percpu_len, rseq_get_page_len());
if (max_nr_cpus < 0 || item_len > percpu_len ||
percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
@@ -222,13 +306,13 @@ struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
goto end;
found_empty:
- base = mmap(NULL, percpu_len * max_nr_cpus, prot, flags, fd, offset);
+ base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
+ mmap_flags, mmap_fd, mmap_offset);
if (base == MAP_FAILED) {
pool = NULL;
goto end;
}
- // TODO: integrate with libnuma to provide NUMA placement hints.
- // See move_pages(2).
+ rseq_percpu_pool_init_numa(pool, numa_flags);
pthread_mutex_init(&pool->lock, NULL);
pool->base = base;
pool->percpu_len = percpu_len;
@@ -262,34 +346,6 @@ end:
}
static
-void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
-{
- return pool->base + (pool->percpu_len * cpu) + item_offset;
-}
-
-void *__rseq_percpu_ptr(void *_ptr, int cpu)
-{
- uintptr_t ptr = (uintptr_t) _ptr;
- uintptr_t item_offset = ptr >> OFFSET_SHIFT;
- uintptr_t pool_index = ptr & POOL_MASK;
- struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
-
- assert(cpu >= 0);
- return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
-}
-
-static
-void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
-{
- int i;
-
- for (i = 0; i < pool->max_nr_cpus; i++) {
- char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
- memset(p, 0, pool->item_len);
- }
-}
-
-static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
struct free_list_node *node;