diff options
author | Chris Mason <clm@fb.com> | 2023-04-11 11:19:27 -0700 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2023-04-13 12:52:58 -0700 |
commit | 5a425ee35b75f4bf001c1a16ad3fd5970f1a62a8 (patch) | |
tree | 5d6d3a7c8f5ae37bd70d1d96a664508cabce5d12 | |
parent | bad164cd303626007b2439010773b95ec16523c7 (diff) | |
download | schbench-5a425ee35b75f4bf001c1a16ad3fd5970f1a62a8.tar.gz |
schbench: add per-cpu spin locking during the work
Real workloads pay a performance penalty when a process is scheduled away
in the middle of a critical section. We could spend a lot of time perfecting
the right set of cache misses to reflect those costs.
Or, we could make a per-cpu lock (a pthread mutex taken by spinning on
trylock) that gets held while we're doing the matrix math. This is a pretty
blunt hammer, but it accurately reflects the impact of preemption on our
workloads.
Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- | schbench.c | 60 |
1 files changed, 58 insertions, 2 deletions
@@ -68,6 +68,13 @@ static volatile unsigned long stopping = 0; /* size of matrices to multiply */ static unsigned long matrix_size = 0; +struct per_cpu_lock { + pthread_mutex_t lock; +} __attribute__((aligned)); + +static struct per_cpu_lock *per_cpu_locks; +static int num_cpu_locks; + /* * one stat struct per thread data, when the workers sleep this records the * latency between when they are woken up and when they actually get the @@ -988,15 +995,35 @@ static void do_some_math(struct thread_data *thread_data) } } +static pthread_mutex_t *lock_this_cpu(void) +{ + int cpu = sched_getcpu(); + pthread_mutex_t *lock; + if (cpu < 0) { + perror("sched_getcpu failed\n"); + exit(1); + } + lock = &per_cpu_locks[cpu].lock; + while (pthread_mutex_trylock(lock) != 0) + nop; + return lock; + +} + /* * spin or do some matrix arithmetic */ static void do_work(struct thread_data *td) { + pthread_mutex_t *lock = NULL; unsigned long i; + if (!calibrate_only) + lock = lock_this_cpu(); for (i = 0; i < operations; i++) do_some_math(td); + if (!calibrate_only) + pthread_mutex_unlock(lock); } /* @@ -1024,7 +1051,21 @@ void *worker_thread(void *arg) do { struct request *tmp; - gettimeofday(&work_start, NULL); + if (calibrate_only) { + /* + * in calibration mode, don't include the + * usleep in the timing + */ + usleep(100); + gettimeofday(&work_start, NULL); + } else { + /* + * lets start off with some simulated networking, + * and also make sure we get a fresh clean timeslice + */ + gettimeofday(&work_start, NULL); + usleep(100); + } do_work(td); @@ -1275,6 +1316,22 @@ int main(int ac, char **av) matrix_size = sqrt(cache_footprint_kb * 1024 / 3 / sizeof(unsigned long)); + num_cpu_locks = get_nprocs(); + per_cpu_locks = calloc(num_cpu_locks, sizeof(struct per_cpu_lock)); + if (!per_cpu_locks) { + perror("unable to allocate memory for per cpu locks\n"); + exit(1); + } + + for (i = 0; i < num_cpu_locks; i++) { + pthread_mutex_t *lock = &per_cpu_locks[i].lock; + ret = 
pthread_mutex_init(lock, NULL); + if (ret) { + perror("mutex init failed\n"); + exit(1); + } + } + again: requests_per_sec /= message_threads; loops_per_sec = 0; @@ -1352,6 +1409,5 @@ again: if (!auto_rps) fprintf(stdout, "average rps: %.2f\n", (double)(loop_count) / runtime); - return 0; } |