author     Chris Mason <clm@fb.com>  2023-04-11 11:19:27 -0700
committer  Chris Mason <clm@fb.com>  2023-04-13 12:52:58 -0700
commit     5a425ee35b75f4bf001c1a16ad3fd5970f1a62a8 (patch)
tree       5d6d3a7c8f5ae37bd70d1d96a664508cabce5d12
parent     bad164cd303626007b2439010773b95ec16523c7 (diff)
download   schbench-5a425ee35b75f4bf001c1a16ad3fd5970f1a62a8.tar.gz
schbench: add per-cpu spin locking during the work
Real workloads take a performance hit when the process is scheduled away during a critical section. We could spend a lot of time perfecting the right set of cache misses to reflect those costs, or we can make a per-cpu spinlock that gets held while we're doing the matrix math. This is a pretty blunt hammer, but it accurately reflects the impact of preemption on our workloads.

Signed-off-by: Chris Mason <clm@fb.com>
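For orientation, here is a minimal standalone sketch of the per-cpu trylock scheme this patch adds, assuming glibc's sched_getcpu() and get_nprocs(). The setup_per_cpu_locks() helper and the main() harness are illustrative only, and the bare spin loop stands in for schbench's nop pause macro, which is defined outside this hunk.

/* build: gcc -O2 -pthread percpu_lock_sketch.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysinfo.h>

/* one mutex per CPU; the aligned attribute keeps neighbouring locks apart */
struct per_cpu_lock {
	pthread_mutex_t lock;
} __attribute__((aligned));

static struct per_cpu_lock *per_cpu_locks;
static int num_cpu_locks;

/* illustrative helper: allocate and init one lock per online CPU */
static void setup_per_cpu_locks(void)
{
	int i;

	num_cpu_locks = get_nprocs();
	per_cpu_locks = calloc(num_cpu_locks, sizeof(struct per_cpu_lock));
	if (!per_cpu_locks) {
		perror("calloc");
		exit(1);
	}
	for (i = 0; i < num_cpu_locks; i++)
		pthread_mutex_init(&per_cpu_locks[i].lock, NULL);
}

/*
 * take the lock belonging to the CPU we are currently running on,
 * spinning with trylock so a preempted holder directly stalls us
 */
static pthread_mutex_t *lock_this_cpu(void)
{
	int cpu = sched_getcpu();
	pthread_mutex_t *lock;

	if (cpu < 0) {
		perror("sched_getcpu");
		exit(1);
	}
	lock = &per_cpu_locks[cpu].lock;
	while (pthread_mutex_trylock(lock) != 0)
		; /* busy-wait; schbench uses a cpu-relax style nop here */
	return lock;
}

/* the critical section: in schbench this wraps the matrix math */
static void do_work(void)
{
	pthread_mutex_t *lock = lock_this_cpu();
	/* ... matrix arithmetic would run here ... */
	pthread_mutex_unlock(lock);
}

int main(void)
{
	setup_per_cpu_locks();
	do_work();
	printf("initialized %d per-cpu locks\n", num_cpu_locks);
	return 0;
}

Spinning on trylock rather than blocking is the point: if the lock holder is scheduled away mid-critical-section, every other worker that lands on that CPU's lock burns time until the holder runs again, which is the preemption cost the commit message wants the benchmark to feel.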
-rw-r--r--  schbench.c  60
1 file changed, 58 insertions(+), 2 deletions(-)
diff --git a/schbench.c b/schbench.c
index 8be7af0..8af5a28 100644
--- a/schbench.c
+++ b/schbench.c
@@ -68,6 +68,13 @@ static volatile unsigned long stopping = 0;
/* size of matrices to multiply */
static unsigned long matrix_size = 0;
+struct per_cpu_lock {
+	pthread_mutex_t lock;
+} __attribute__((aligned));
+
+static struct per_cpu_lock *per_cpu_locks;
+static int num_cpu_locks;
+
/*
 * one stat struct per thread data, when the workers sleep this records the
 * latency between when they are woken up and when they actually get the
@@ -988,15 +995,35 @@ static void do_some_math(struct thread_data *thread_data)
}
}
+static pthread_mutex_t *lock_this_cpu(void)
+{
+	int cpu = sched_getcpu();
+	pthread_mutex_t *lock;
+	if (cpu < 0) {
+		perror("sched_getcpu failed\n");
+		exit(1);
+	}
+	lock = &per_cpu_locks[cpu].lock;
+	while (pthread_mutex_trylock(lock) != 0)
+		nop;
+	return lock;
+
+}
+
/*
 * spin or do some matrix arithmetic
 */
static void do_work(struct thread_data *td)
{
+	pthread_mutex_t *lock = NULL;
	unsigned long i;
+	if (!calibrate_only)
+		lock = lock_this_cpu();
	for (i = 0; i < operations; i++)
		do_some_math(td);
+	if (!calibrate_only)
+		pthread_mutex_unlock(lock);
}
/*
@@ -1024,7 +1051,21 @@ void *worker_thread(void *arg)
	do {
		struct request *tmp;
-		gettimeofday(&work_start, NULL);
+		if (calibrate_only) {
+			/*
+			 * in calibration mode, don't include the
+			 * usleep in the timing
+			 */
+			usleep(100);
+			gettimeofday(&work_start, NULL);
+		} else {
+			/*
+			 * lets start off with some simulated networking,
+			 * and also make sure we get a fresh clean timeslice
+			 */
+			gettimeofday(&work_start, NULL);
+			usleep(100);
+		}
		do_work(td);
@@ -1275,6 +1316,22 @@ int main(int ac, char **av)
	matrix_size = sqrt(cache_footprint_kb * 1024 / 3 / sizeof(unsigned long));
+	num_cpu_locks = get_nprocs();
+	per_cpu_locks = calloc(num_cpu_locks, sizeof(struct per_cpu_lock));
+	if (!per_cpu_locks) {
+		perror("unable to allocate memory for per cpu locks\n");
+		exit(1);
+	}
+
+	for (i = 0; i < num_cpu_locks; i++) {
+		pthread_mutex_t *lock = &per_cpu_locks[i].lock;
+		ret = pthread_mutex_init(lock, NULL);
+		if (ret) {
+			perror("mutex init failed\n");
+			exit(1);
+		}
+	}
+
again:
	requests_per_sec /= message_threads;
	loops_per_sec = 0;
@@ -1352,6 +1409,5 @@ again:
	if (!auto_rps)
		fprintf(stdout, "average rps: %.2f\n",
			(double)(loop_count) / runtime);
-
	return 0;
}