diff options
author | Chris Mason <clm@fb.com> | 2016-04-04 08:00:11 -0700 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2016-04-04 08:00:11 -0700 |
commit | 36d859a9a857ab0524dce8d4eaf8e328ef53f9c2 (patch) | |
tree | 20074e603d5ad14474a6d948b4320cd538f5e20f | |
download | schbench-36d859a9a857ab0524dce8d4eaf8e328ef53f9c2.tar.gz |
schbench: measure scheduler wakeup latencies
Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- | Makefile | 27 | ||||
-rw-r--r-- | schbench.c | 706 |
2 files changed, 733 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..94d042e --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +CC = gcc +CFLAGS = -Wall -O0 -g -W +ALL_CFLAGS = $(CFLAGS) -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 + +PROGS = schbench +ALL = $(PROGS) + +$(PROGS): | depend + +all: $(ALL) + +%.o: %.c + $(CC) -o $*.o -c $(ALL_CFLAGS) $< + +schbench: schbench.o + $(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^) -lm -lpthread + +depend: + @$(CC) -MM $(ALL_CFLAGS) *.c 1> .depend + +clean: + -rm -f *.o $(PROGS) .depend + +ifneq ($(wildcard .depend),) +include .depend +endif + diff --git a/schbench.c b/schbench.c new file mode 100644 index 0000000..3c8a21f --- /dev/null +++ b/schbench.c @@ -0,0 +1,706 @@ +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <getopt.h> +#include <sys/time.h> +#include <time.h> +#include <string.h> +#include <linux/futex.h> +#include <sys/syscall.h> + +#define PLAT_BITS 8 +#define PLAT_VAL (1 << PLAT_BITS) +#define PLAT_GROUP_NR 19 +#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL) +#define PLAT_LIST_MAX 20 + +/* -m number of message threads */ +static int message_threads = 2; +/* -t number of workers per message thread */ +static int worker_threads = 16; +/* -r seconds */ +static int runtime = 30; +/* -s usec */ +static int sleeptime = 10000; +/* -c usec */ +static unsigned long long cputime = 10000; +/* -a, bool */ +static int autobench = 0; + +/* the latency histogram uses this to pitch outliers */ +static unsigned int max_us = 50000; + +/* main() sets this to the time when we should all stop doing work */ +static struct timeval global_stop; + +/* the message threads flip this to true when they decide runtime is up */ +static unsigned long stopping = 0; + + +/* + * one stat struct per thread data, when the workers sleep this records the + * latency between when they are woken up and when they actually get the + * CPU again. The message threads sum up the stats of all the workers and + * then bubble them up to main() for printing + */ +struct stats { + unsigned int plat[PLAT_NR]; + unsigned int nr_samples; + unsigned int max; + unsigned int min; + unsigned int over; +}; + +/* this defines which latency profiles get printed */ +#define PLIST_P99 4 +static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9 }; + +enum { + HELP_LONG_OPT = 1, +}; + +char *option_string = "am:t:s:c:r:"; +static struct option long_options[] = { + {"auto", no_argument, 0, 'a'}, + {"message-threads", required_argument, 0, 'm'}, + {"threads", required_argument, 0, 't'}, + {"runtime", required_argument, 0, 'r'}, + {"sleeptime", required_argument, 0, 's'}, + {"cputime", required_argument, 0, 'c'}, + {"help", no_argument, 0, HELP_LONG_OPT}, + {0, 0, 0, 0} +}; + +static void print_usage(void) +{ + fprintf(stderr, "schbench usage:\n" + "\t-d (--dispatch-threads): number of message threads (def: 2)\n" + "\t-t (--threads): worker threads per message thread (def: 16)\n" + "\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n" + "\t-s (--sleeptime): Message thread latency (usec, def: 10000\n" + "\t-c (--cputime): How long to think during loop (usec, def: 10000\n" + ); + exit(1); +} + +static void parse_options(int ac, char **av) +{ + int c; + + while (1) { + int option_index = 0; + + c = getopt_long(ac, av, option_string, + long_options, &option_index); + + if (c == -1) + break; + + switch(c) { + case 'a': + autobench = 1; + break; + case 's': + sleeptime = atoi(optarg); + break; + case 'c': + cputime = atoi(optarg); + break; + case 'd': + message_threads = atoi(optarg); + break; + case 't': + worker_threads = atoi(optarg); + break; + case 'r': + runtime = atoi(optarg); + break; + case '?': + case HELP_LONG_OPT: + print_usage(); + break; + default: + break; + } + } + + if (optind < ac) { + fprintf(stderr, "Error Extra arguments '%s'\n", av[optind]); + exit(1); + } +} + +void tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0) +{ + tdiff->tv_sec = t1->tv_sec - t0->tv_sec; + tdiff->tv_usec = t1->tv_usec - t0->tv_usec; + if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) { + tdiff->tv_sec--; + tdiff->tv_usec += 1000000; + if (tdiff->tv_usec < 0) { + fprintf(stderr, "lat_fs: tvsub shows test time ran backwards!\n"); + exit(1); + } + } + + /* time shouldn't go backwards!!! */ + if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) { + tdiff->tv_sec = 0; + tdiff->tv_usec = 0; + } +} + +/* + * returns the difference between start and stop in usecs. Negative values + * are turned into 0 + */ +unsigned long long tvdelta(struct timeval *start, struct timeval *stop) +{ + struct timeval td; + unsigned long long usecs; + + tvsub(&td, stop, start); + usecs = td.tv_sec; + usecs *= 1000000; + usecs += td.tv_usec; + return (usecs); +} + +/* mr axboe's magic latency histogram */ +static unsigned int plat_val_to_idx(unsigned int val) +{ + unsigned int msb, error_bits, base, offset; + + /* Find MSB starting from bit 0 */ + if (val == 0) + msb = 0; + else + msb = sizeof(val)*8 - __builtin_clz(val) - 1; + + /* + * MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index + */ + if (msb <= PLAT_BITS) + return val; + + /* Compute the number of error bits to discard*/ + error_bits = msb - PLAT_BITS; + + /* Compute the number of buckets before the group */ + base = (error_bits + 1) << PLAT_BITS; + + /* + * Discard the error bits and apply the mask to find the + * index for the buckets in the group + */ + offset = (PLAT_VAL - 1) & (val >> error_bits); + + /* Make sure the index does not exceed (array size - 1) */ + return (base + offset) < (PLAT_NR - 1) ? + (base + offset) : (PLAT_NR - 1); +} + +/* + * Convert the given index of the bucket array to the value + * represented by the bucket + */ +static unsigned int plat_idx_to_val(unsigned int idx) +{ + unsigned int error_bits, k, base; + + if (idx >= PLAT_NR) { + fprintf(stderr, "idx %u is too large\n", idx); + exit(1); + } + + /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index */ + if (idx < (PLAT_VAL << 1)) + return idx; + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> PLAT_BITS) - 1; + base = 1 << (error_bits + PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return base + ((k + 0.5) * (1 << error_bits)); +} + + +static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr, + unsigned int **output) +{ + unsigned long sum = 0; + unsigned int len, i, j = 0; + unsigned int oval_len = 0; + unsigned int *ovals = NULL; + int is_last; + + len = 0; + while (len < PLAT_LIST_MAX && plist[len] != 0.0) + len++; + + if (!len) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = 0; + for (i = 0; i < PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= (plist[j] / 100.0 * nr)) { + if (j == oval_len) { + oval_len += 100; + ovals = realloc(ovals, oval_len * sizeof(unsigned int)); + } + + ovals[j] = plat_idx_to_val(i); + is_last = (j == len - 1); + if (is_last) + break; + + j++; + } + } + + *output = ovals; + return len; +} + +static int calc_p99(struct stats *s) +{ + unsigned int *ovals = NULL; + int ret = 0; + int len; + + len = calc_percentiles(s->plat, s->nr_samples, &ovals); + if (len && len > PLIST_P99) + ret = ovals[PLIST_P99]; + if (ovals) + free(ovals); + return ret; +} + +static void show_latencies(struct stats *s) +{ + unsigned int *ovals = NULL; + unsigned int len, i; + + len = calc_percentiles(s->plat, s->nr_samples, &ovals); + if (len) { + fprintf(stderr, "Latency percentiles (usec)\n"); + for (i = 0; i < len; i++) + fprintf(stderr, "\t%s%2.4fth: %u\n", + i == PLIST_P99 ? "*" : "", + plist[i], ovals[i]); + } + + if (ovals) + free(ovals); + + fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max); +} + +/* fold latency info from s into d */ +void combine_stats(struct stats *d, struct stats *s) +{ + int i; + for (i = 0; i < PLAT_NR; i++) + d->plat[i] += s->plat[i]; + d->nr_samples += s->nr_samples; + d->over += s->over; + if (s->max > d->max) + d->max = s->max; + if (s->min < d->min) + d->min = s->min; +} + +/* record a latency result into the histogram */ +static void add_lat(struct stats *s, unsigned int us) +{ + int lat_index = 0; + + if (us > s->max) + s->max = us; + if (us < s->min) + s->min = us; + + if (us > max_us) { + fprintf(stderr, "latency=%u usec\n", us); + s->over++; + } + + lat_index = plat_val_to_idx(us); + __sync_fetch_and_add(&s->plat[lat_index], 1); + __sync_fetch_and_add(&s->nr_samples, 1); +} + +/* + * every thread has one of these, it comes out to about 19K thanks to the + * giant stats struct + */ +struct thread_data { + pthread_t tid; + /* ->next is for placing us on the msg_thread's list for waking */ + struct thread_data *next; + + /* our parent thread and messaging partner */ + struct thread_data *msg_thread; + + /* + * the msg thread stuffs gtod in here before waking us, so we can + * measure scheduler latency + */ + struct timeval wake_time; + + /* keep the futex and the wake_time in the same cacheline */ + int futex; + + /* mr axboe's magic latency histogram */ + struct stats stats; +}; + +/* we're so fancy we make our own futex wrappers */ +#define FUTEX_BLOCKED 0 +#define FUTEX_RUNNING 1 + +static int futex(int *uaddr, int futex_op, int val, + const struct timespec *timeout, int *uaddr2, int val3) +{ + return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3); +} + +/* + * wakeup a process waiting on a futex, making sure they are really waiting + * first + */ +static void fpost(int *futexp) +{ + int s; + + if (__sync_bool_compare_and_swap(futexp, FUTEX_BLOCKED, + FUTEX_RUNNING)) { + s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0); + if (s == -1) { + perror("FUTEX_WAKE"); + exit(1); + } + } +} + +/* + * wait on a futex, with an optional timeout. Make sure to set + * the futex to FUTEX_BLOCKED beforehand. + * + * This will return zero if all went well, or return -ETIMEDOUT if you + * hit the timeout without getting posted + */ +static int fwait(int *futexp, struct timespec *timeout) +{ + int s; + while (1) { + /* Is the futex available? */ + if (__sync_bool_compare_and_swap(futexp, FUTEX_RUNNING, + FUTEX_BLOCKED)) { + break; /* Yes */ + } + /* Futex is not available; wait */ + s = futex(futexp, FUTEX_WAIT, FUTEX_BLOCKED, timeout, NULL, 0); + if (s == -1 && errno != EAGAIN) { + if (errno == ETIMEDOUT) + return -ETIMEDOUT; + perror("futex-FUTEX_WAIT"); + exit(1); + } + } + return 0; +} + +/* + * cmpxchg based list prepend + */ +static void xlist_add(struct thread_data *head, struct thread_data *add) +{ + struct thread_data *old; + struct thread_data *ret; + + while (1) { + old = head->next; + add->next = old; + ret = __sync_val_compare_and_swap(&head->next, old, add); + if (ret == old) + break; + } +} + +/* + * xchg based list splicing. This returns the entire list and + * replaces the head->next with NULL + */ +static struct thread_data *xlist_splice(struct thread_data *head) +{ + struct thread_data *old; + struct thread_data *ret; + + while (1) { + old = head->next; + ret = __sync_val_compare_and_swap(&head->next, old, NULL); + if (ret == old) + break; + } + return ret; +} + +/* + * Wake everyone currently waiting on the message list, filling in their + * thread_data->wake_time with the current time. + * + * It's not exactly the current time, it's really the time at the start of + * the list run. We want to detect when the scheduler is just preempting the + * waker and giving away the rest of its timeslice. So we gtod once at + * the start of the loop and use that for all the threads we wake. + */ +static void xlist_wake_all(struct thread_data *td) +{ + struct thread_data *list; + struct thread_data *next; + struct timeval now; + + list = xlist_splice(td); + gettimeofday(&now, NULL); + while (list) { + next = list->next; + list->next = NULL; + memcpy(&list->wake_time, &now, sizeof(now)); + fpost(&list->futex); + list = next; + } +} + +/* + * called by worker threads to send a message and wait for the answer. + * In reality we're just trading one cacheline with the gtod and futex in + * it, but that's good enough. We gtod after waking and use that to + * record scheduler latency. + */ +static void msg_and_wait(struct thread_data *td) +{ + struct timeval now; + unsigned long long delta; + struct timespec timeout; + + timeout.tv_sec = 0; + timeout.tv_nsec = 5000 * 1000; + + /* set ourselves to blocked */ + td->futex = FUTEX_BLOCKED; + gettimeofday(&td->wake_time, NULL); + + /* add us to the list */ + xlist_add(td->msg_thread, td); + + fpost(&td->msg_thread->futex); + + /* + * don't wait if the main threads are shutting down, + * they will never kick us fpost has a full barrier, so as long + * as the message thread walks his list after setting stopping, + * we shouldn't miss the wakeup + */ + if (!stopping) { + /* if he hasn't already woken us up, wait */ + fwait(&td->futex, NULL); + } + + gettimeofday(&now, NULL); + delta = tvdelta(&td->wake_time, &now); + if (delta > 0) + add_lat(&td->stats, delta); +} + +/* + * once the message thread starts all his children, this is where he + * loops until our runtime is up. Basically this sits around waiting + * for posting by the worker threads, replying to their messages after + * a delay of 'sleeptime' + some jitter. + */ +static void run_msg_thread(struct thread_data *td) +{ + struct timeval now; + struct timespec timeout; + unsigned int seed = pthread_self(); + int max_jitter = sleeptime / 4; + int jitter; + + jitter = rand_r(&seed) % max_jitter; + timeout.tv_sec = 0; + timeout.tv_nsec = (sleeptime + jitter) * 1000; + + while (1) { + td->futex = FUTEX_BLOCKED; + xlist_wake_all(td); + + gettimeofday(&now, NULL); + if (now.tv_sec > global_stop.tv_sec) { + stopping = 1; + __sync_synchronize(); + xlist_wake_all(td); + break; + } + fwait(&td->futex, &timeout); + + /* + * messages shouldn't be instant, sleep a little to make them + * wait + */ + jitter = rand_r(&seed) % max_jitter; + usleep(sleeptime + jitter); + } +} + +#define nop __asm__ __volatile__("rep;nop": : :"memory") + +static void usec_spin(unsigned long spin_time) +{ + struct timeval now; + struct timeval start; + unsigned long long delta; + + gettimeofday(&start, NULL); + while (1) { + gettimeofday(&now, NULL); + delta = tvdelta(&start, &now); + if (delta > spin_time) + return; + nop; + } +} + +/* + * the worker thread is pretty simple, it just does a single spin and + * then waits on a message from the message thread + */ +void *worker_thread(void *arg) +{ + struct thread_data *td = arg; + + while(1) { + if (stopping) + break; + + usec_spin(cputime); + msg_and_wait(td); + } + return NULL; +} + +/* + * the message thread starts his own gaggle of workers and then sits around + * replying when they post him. He collects latency stats as all the threads + * exit + */ +void *message_thread(void *arg) +{ + struct thread_data *td = arg; + struct thread_data *worker_threads_mem = NULL; + int i; + int ret; + + worker_threads_mem = calloc(worker_threads, sizeof(struct thread_data)); + + if (!worker_threads_mem) { + perror("unable to allocate ram"); + pthread_exit((void *)-ENOMEM); + } + + for (i = 0; i < worker_threads; i++) { + pthread_t tid; + + worker_threads_mem[i].msg_thread = td; + ret = pthread_create(&tid, NULL, worker_thread, + worker_threads_mem + i); + if (ret) { + fprintf(stderr, "error %d from pthread_create\n", ret); + exit(1); + } + worker_threads_mem[i].tid = tid; + } + + run_msg_thread(td); + + for (i = 0; i < worker_threads; i++) { + pthread_join(worker_threads_mem[i].tid, NULL); + combine_stats(&td->stats, &worker_threads_mem[i].stats); + } + free(worker_threads_mem); + + return NULL; +} + +int main(int ac, char **av) +{ + int i; + int ret; + struct thread_data *message_threads_mem = NULL; + struct stats stats; + + parse_options(ac, av); +again: + stopping = 0; + memset(&stats, 0, sizeof(stats)); + + message_threads_mem = calloc(message_threads, + sizeof(struct thread_data)); + + + if (!message_threads_mem) { + perror("unable to allocate ram"); + exit(1); + } + gettimeofday(&global_stop, NULL); + global_stop.tv_sec += runtime; + + /* start our message threads, each one starts its own workers */ + for (i = 0; i < message_threads; i++) { + pthread_t tid; + ret = pthread_create(&tid, NULL, message_thread, + message_threads_mem + i); + if (ret) { + fprintf(stderr, "error %d from pthread_create\n", ret); + exit(1); + } + message_threads_mem[i].tid = tid; + } + for (i = 0; i < message_threads; i++) { + pthread_join(message_threads_mem[i].tid, NULL); + combine_stats(&stats, &message_threads_mem[i].stats); + } + + free(message_threads_mem); + + /* + * in auto bench mode, keep adding workers until our latencies get + * horrible + */ + if (autobench) { + int p99 = calc_p99(&stats); + fprintf(stderr, "cputime %Lu threads %d p99 %d\n", + cputime, worker_threads, p99); + if (p99 < 2000) { + worker_threads++; + goto again; + } + } + + show_latencies(&stats); + + return 0; +} |