author     Andy Lutomirski <luto@kernel.org>  2023-06-09 11:26:58 -0700
committer  Andy Lutomirski <luto@kernel.org>  2023-06-09 11:26:58 -0700
commit     debe447e454d182ac06c049d78e92372b38a4dd9 (patch)
tree       b74660bc2eab06f29aff1d3a53538c717a7a30db
parent     47c81ffebffd1d03f3ed2e990e6d3e553dfcf3d4 (diff)
download   misc-tests-debe447e454d182ac06c049d78e92372b38a4dd9.tar.gz
Add how_bad_is_smap, which is a hack
-rw-r--r--  tight_loop/how_bad_is_smap.c  383
1 file changed, 383 insertions(+), 0 deletions(-)
diff --git a/tight_loop/how_bad_is_smap.c b/tight_loop/how_bad_is_smap.c
new file mode 100644
index 0000000..c2df0e8
--- /dev/null
+++ b/tight_loop/how_bad_is_smap.c
@@ -0,0 +1,383 @@
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <err.h>
+#include <fcntl.h>
+
+struct psm_counter {
+ int fd;
+ struct perf_event_mmap_page *metadata;
+};
+
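+/*
+ * Opens a self-monitoring CPU-cycle perf counter and maps its
+ * metadata page so that the counter can be read from userspace with
+ * RDPMC.
+ */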
+struct psm_counter *psm_counter_create(void)
+{
+ struct psm_counter *counter = malloc(sizeof(struct psm_counter));
+ if (!counter)
+ err(1, "malloc");
+
+ struct perf_event_attr attr;
+ memset(&attr, 0, sizeof(attr));
+
+ attr.type = PERF_TYPE_HARDWARE;
+ attr.size = sizeof(struct perf_event_attr);
+ attr.config = PERF_COUNT_HW_CPU_CYCLES;
+
+ counter->fd = syscall(
+ SYS_perf_event_open,
+ &attr, /* attributes */
+ 0, /* monitor me */
+ -1, /* all CPUs */
+ -1, /* group leader */
+ PERF_FLAG_FD_CLOEXEC /* flags */
+ );
+
+ if (counter->fd == -1)
+ err(1, "perf_event_open");
+
+ counter->metadata = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED,
+ counter->fd, 0);
+ if (counter->metadata == MAP_FAILED) {
+ err(1, "mmap");
+ }
+
+ if (!counter->metadata->cap_user_rdpmc)
+ errx(1, "RDPMC not supported (cap_user_rdpmc == 0)");
+
+ if (!counter->metadata->index)
+ errx(1, "RDPMC not supported (no index assigned)");
+
+ return counter;
+}
+
+void psm_counter_destroy(struct psm_counter *counter)
+{
+ munmap(counter->metadata, PAGE_SIZE);
+ counter->metadata = MAP_FAILED;
+ close(counter->fd);
+ counter->fd = -1;
+}
+
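+/* Compiler barrier: forbids reordering memory accesses across it. */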
+static inline void psm_barrier(void)
+{
+ asm volatile ("" : : : "memory");
+}
+
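+/*
+ * CPUID is a serializing instruction: it drains the pipeline, which
+ * keeps measured code from overlapping with its surroundings.
+ */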
+static inline void psm_serialize(void)
+{
+ unsigned int eax = 0, ecx = 0;
+ asm volatile ("cpuid"
+ : "+a" (eax), "+c" (ecx) : : "ebx", "edx", "memory");
+}
+
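+/* Reads hardware performance counter ecx; RDPMC returns it in edx:eax. */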
+static inline uint64_t psm_rdpmc(unsigned int ecx)
+{
+ unsigned int eax, edx;
+ asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ return (((uint64_t)edx << 32) | eax);
+}
+
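+/*
+ * A sample function runs its workload 'reps' times inside one
+ * measurement interval, stores the elapsed cycle count in *count,
+ * and returns false if the sample had to be rejected.
+ */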
+typedef bool (*psm_sample_fn)(uint64_t *count,
+ const struct psm_counter *counter,
+ int reps,
+ void *opaque);
+
+/*
+ * psm_atomic is a mechanism for very precise counter measurement of
+ * very short operations. A psm_atomic interval will fail if the
+ * process is preempted during the measurement interval.
+ */
+
+struct psm_atomic {
+ /* Copied from metadata in psm_atomic_start. */
+ uint32_t perf_lock; /* If odd, this sample is bad. */
+ uint64_t perf_time_offset;
+
+ uint64_t initial_raw_count;
+
+ /*
+ * This is here to improve code generation. struct psm_atomic
+ * is unlikely to be referenced by an escaped pointer and hence
+ * be affected by a "memory" clobber.
+ */
+ unsigned int rdpmc_ecx;
+};
+
+/*
+ * Starts measuring an uninterrupted duration.
+ */
+static inline struct psm_atomic psm_atomic_start(const struct psm_counter *ctr)
+{
+ struct psm_atomic state;
+ state.perf_lock = ctr->metadata->lock;
+ psm_barrier();
+ state.perf_time_offset = ctr->metadata->time_offset;
+ state.rdpmc_ecx = ctr->metadata->index - 1;
+ psm_barrier(); /* Do rdpmc last to reduce noise */
+ state.initial_raw_count = psm_rdpmc(state.rdpmc_ecx);
+ return state;
+}
+
+static inline bool psm_atomic_elapsed(uint64_t *elapsed_count,
+ const struct psm_atomic *state,
+ const struct psm_counter *ctr)
+{
+ /* Do the RDPMC first to reduce noise. */
+ uint64_t count_now = psm_rdpmc(state->rdpmc_ecx);
+ psm_barrier(); /* No, really, do it first. */
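+ /*
+ * The counter is only pmc_width bits wide. Shift both readings
+ * to the top of a 64-bit word so the subtraction wraps correctly,
+ * then shift the difference back down.
+ */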
+ unsigned int shift = 64 - ctr->metadata->pmc_width;
+ count_now <<= shift;
+ uint64_t initial_count = state->initial_raw_count;
+ initial_count <<= shift;
+ *elapsed_count = (count_now - initial_count) >> shift;
+ psm_barrier();
+
+ if (ctr->metadata->time_offset != state->perf_time_offset)
+ return false; /* We were interrupted. */
+
+ /* Now check the lock. */
+ psm_barrier();
+ if (ctr->metadata->lock != state->perf_lock || (state->perf_lock & 1))
+ return false;
+
+ return true;
+}
+
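+/* Measures an empty interval: pure self-monitoring overhead. */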
+bool psm_atomic_sample_empty(uint64_t *count,
+ const struct psm_counter *ctr,
+ int reps, void *opaque)
+{
+ struct psm_atomic duration = psm_atomic_start(ctr);
+ return psm_atomic_elapsed(count, &duration, ctr);
+}
+
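+/*
+ * Measures bare syscall entry/exit: the syscall number is bogus, so
+ * the kernel returns -ENOSYS without doing any real work.
+ */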
+bool psm_atomic_sample_enosys(uint64_t *count,
+ const struct psm_counter *ctr,
+ int reps, void *opaque)
+{
+ struct psm_atomic duration = psm_atomic_start(ctr);
+#ifdef __x86_64__
+ unsigned long rax;
+ for (int i = 0; i < reps; i++) {
+ rax = 0xbfffffff;
+ asm volatile ("syscall" : "+a" (rax) : : "rcx", "r11");
+ }
+#else
+ for (int i = 0; i < reps; i++)
+ syscall(0x3fffffff);
+#endif
+ return psm_atomic_elapsed(count, &duration, ctr);
+}
+
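+/*
+ * Measures a syscall that is dispatched to a real handler: prctl()
+ * with an invalid option fails after reaching the syscall table.
+ */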
+bool psm_atomic_sample_bad_prctl(uint64_t *count,
+ const struct psm_counter *ctr,
+ int reps, void *opaque)
+{
+ struct psm_atomic duration = psm_atomic_start(ctr);
+#ifdef __x86_64__
+ unsigned long rax;
+ register unsigned long rdi asm("rdi");
+ for (int i = 0; i < reps; i++) {
+ rax = SYS_prctl;
+ rdi = -1;
+ asm volatile ("syscall" : "+a" (rax), "+r" (rdi) : : "rcx", "r11");
+ }
+#else
+ for (int i = 0; i < reps; i++)
+ syscall(SYS_prctl, -1);
+#endif
+ return psm_atomic_elapsed(count, &duration, ctr);
+}
+
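+/* Measures pread() from the file descriptor passed via opaque. */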
+bool psm_atomic_sample_pread_fd(uint64_t *count,
+ const struct psm_counter *ctr,
+ int reps, void *opaque)
+{
+ int fd = (int)(intptr_t)opaque;
+ char buf[16];
+
+ struct psm_atomic duration = psm_atomic_start(ctr);
+ for (int i = 0; i < reps; i++)
+ if (pread(fd, buf, sizeof(buf), 0) == -1)
+ err(1, "pread");
+ return psm_atomic_elapsed(count, &duration, ctr);
+}
+
+static int compare_ui64(const void *a_void, const void *b_void)
+{
+ const uint64_t a = *(uint64_t*)a_void;
+ const uint64_t b = *(uint64_t*)b_void;
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ else
+ return 0;
+}
+
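+/*
+ * Collects n accepted samples, sorts them, and returns the q-th
+ * smallest, i.e. the q/n quantile.
+ */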
+uint64_t psm_integer_quantile(const struct psm_counter *ctr, psm_sample_fn fn,
+ int reps, void *opaque,
+ size_t q, size_t n)
+{
+ if (q >= n)
+ abort();
+
+ uint64_t *array = calloc(n, sizeof(uint64_t));
+ if (!array)
+ err(1, "calloc");
+ for (size_t i = 0; i < n; ) {
+ uint64_t sample;
+ if (!fn(&sample, ctr, reps, opaque))
+ continue;
+ array[i++] = sample;
+ }
+
+ qsort(array, n, sizeof(uint64_t), compare_ui64);
+
+ uint64_t ret = array[q];
+ free(array);
+ return ret;
+}
+
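+/*
+ * Warm-up helper: takes the best of 10 accepted samples per batch and
+ * returns once that minimum has been identical for 10 consecutive
+ * batches, or ~0ULL if 500 batches pass without settling.
+ */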
+uint64_t psm_settle(const struct psm_counter *ctr, psm_sample_fn fn,
+ int reps, void *opaque)
+{
+ uint64_t val = UINT64_MAX;
+ int good_iters = 0;
+ for (int total_iters = 0; total_iters < 500; total_iters++) {
+ uint64_t best = UINT64_MAX;
+ for (int inner = 0; inner < 10; ) {
+ uint64_t count;
+ if (!fn(&count, ctr, reps, opaque))
+ continue; /* Rejected sample */
+ if (count < best)
+ best = count;
+ inner++;
+ }
+
+ if (best != val) {
+ good_iters = 1;
+ val = best;
+ } else {
+ good_iters++;
+ if (good_iters == 10)
+ return val;
+ }
+ }
+
+ return ~0ULL;
+}
+
+int reparray[] = {1, 200};
+
+struct psm_costestimate
+{
+ double baseline;
+ double per_rep;
+};
+
+struct pair
+{
+ int x;
+ double y;
+};
+
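+/*
+ * Estimates the per-rep cost of a sample function: measures rep
+ * counts 0..99, each 50 times in random order, and fits a
+ * least-squares line. The slope is the per-rep cost; the intercept
+ * is the fixed self-monitoring overhead.
+ */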
+bool psm_estimate_cost(struct psm_costestimate *est,
+ const struct psm_counter *ctr,
+ psm_sample_fn fn, void *opaque)
+{
+ int rounds = 50;
+ int scale = 1;
+ int steps = 100;
+ int n = rounds * steps;
+ int i = 0;
+ uint64_t sample;
+
+ struct pair *array = calloc(n, sizeof(struct pair));
+ if (!array)
+ err(1, "calloc");
+ for (int r = 0; r < rounds; r++) {
+ for (int s = 0; s < steps; s++) {
+ array[i].x = s * scale;
+ i++;
+ }
+ }
+
+ /* Now randomly permute it. */
+ for (i = n - 1; i >= 1; i--) {
+ int j = rand() % (i + 1); /* Fisher-Yates: j uniform in [0, i] */
+ int tmp = array[i].x;
+ array[i].x = array[j].x;
+ array[j].x = tmp;
+ }
+
+ /* Burn a big sample. */
+ fn(&sample, ctr, scale * steps * (rounds / 2), opaque);
+
+ /* Now get all the samples and accumulate. */
+ double sum_xy = 0.0, sum_x = 0.0, sum_xx = 0.0, sum_y = 0.0;
+ for (i = 0; i < n; i++) {
+ while (!fn(&sample, ctr, array[i].x, opaque))
+ ;
+ array[i].y = sample;
+
+ sum_x += array[i].x;
+ sum_xy += array[i].x * array[i].y;
+ sum_xx += (double)array[i].x * (double)array[i].x;
+ sum_y += array[i].y;
+ }
+
+ /* Calculate a simple linear regression. */
+ est->per_rep = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
+ est->baseline = (sum_y - est->per_rep * sum_x) / n;
+
+ free(array);
+ return true;
+}
+
+int main(void)
+{
+ struct psm_counter *ctr = psm_counter_create();
+
+ uint64_t baseline = psm_settle(ctr, psm_atomic_sample_empty, 1, NULL);
+ if (baseline == (uint64_t)~0ULL)
+ printf("Self-monitoring warm-up didn't settle down\n");
+ else
+ printf("Self-monitoring warmed up: overhead is %llu cycles\n",
+ (unsigned long long)baseline);
+
+ /*
+ for (int repidx = 0; repidx < 2; repidx++) {
+ int reps = reparray[repidx];
+ for (int i = 0; i < 10; i++) {
+ uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_enosys, reps, NULL, 250, 500) - baseline;
+ printf("%dx ENOSYS: %llu\n", reps, (unsigned long long)cost/reps);
+ }
+ }
+
+ for (int repidx = 0; repidx < 2; repidx++) {
+ int reps = reparray[repidx];
+ for (int i = 0; i < 10; i++) {
+ uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_bad_prctl, reps, NULL, 250, 500) - baseline;
+ printf("%dx bad prctl: %llu\n", reps, (unsigned long long)cost/reps);
+ }
+ }
+ */
+
+ struct psm_costestimate est;
+
+ psm_estimate_cost(&est, ctr, psm_atomic_sample_enosys, NULL);
+ printf("ENOSYS:\t\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+ psm_estimate_cost(&est, ctr, psm_atomic_sample_bad_prctl, NULL);
+ printf("bad prctl:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+ int fd = open("/dev/zero", O_RDONLY);
+ if (fd == -1)
+ err(1, "open /dev/zero");
+
+ psm_estimate_cost(&est, ctr, psm_atomic_sample_pread_fd, (void *)(intptr_t)fd);
+ printf("pread /dev/zero:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+ psm_counter_destroy(ctr);
+}