From: David Mosberger

Somebody recently pointed out a performance anomaly to me where an
unusual amount of time was being spent reading from /dev/urandom.  The
problem isn't really surprising, as it happened only on >= 4-way
machines and the random driver isn't terribly scalable the way it is
written today.

If scalability _really_ mattered, I suppose per-CPU data structures
would be the way to go.  However, I found that at least for 4-way
machines, performance can be improved considerably with the attached
patch.  In particular, I saw the following performance on a 4-way ia64
machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

                        throughput:
                        -----------
  original driver:      2.2 MB/sec
  patched driver:       2.3 MB/sec

Test: 4 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

                        throughput:
                        -----------
  original driver:      0.4 MB/sec
  patched driver:       1.9 MB/sec

In words: a slight improvement when there is little lock contention and
a huge improvement when there is significant lock contention.

One reason for the scalability improvement comes from the
reorganization of "struct entropy_store".  Basically, the patch
separates the mostly-read data from the read-write data.  I also tried
putting the spinlock in its own cacheline, but that reduced performance
slightly.  My theory is that co-locating the other read-write data with
the lock improves overall throughput at the cost of some extra bus
traffic (every time any of the read-write data is updated, the other
CPUs spinning on the lock will re-fetch the cacheline, even though the
lock remains taken).

The other reason for the scalability improvement is the prefetching of
the pool[] data: the underlying cachelines almost certainly will have
been dirtied by the other CPUs, so they'll miss in the local cache.

The rest of the patch is fairly obvious streamlining of the code (such
as read-ahead of the input data, etc.).  I didn't measure the impact of
these changes separately, but since they make life easier for the
compiler, they should help most, if not all, platforms.
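To illustrate the layout idea in isolation, here is a minimal userspace
sketch of the same mostly-read/read-write split.  This is not driver
code: the struct and field names are made up, and alignas(64) merely
stands in for ____cacheline_aligned_in_smp, assuming a 64-byte
cacheline.

	#include <pthread.h>
	#include <stdalign.h>

	struct split_state {
		/* mostly-read data: fetched freely, rarely written */
		const unsigned *table;
		int nwords;

		/*
		 * read-write data: starts on its own cacheline, and the
		 * counters deliberately share that line with the lock
		 * (see the bus-traffic argument above).
		 */
		alignas(64) pthread_mutex_t lock;
		unsigned cursor;
		int count;
	};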
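Similarly, here is a stand-alone sketch of the input read-ahead idiom
used in the rewritten loop.  The rotate and XOR accumulation are just
stand-ins for the real mixing; the point is the load scheduling: the
next input word is fetched before the current one is processed, so the
load latency overlaps with the computation.

	#include <stdio.h>

	static unsigned rotate_left(int i, unsigned word)
	{
		return (word << i) | (word >> (32 - i));
	}

	static unsigned mix_words(const unsigned *in, int nwords)
	{
		unsigned acc = 0, w, next_w = *in++;	/* read ahead */

		while (nwords--) {
			w = rotate_left(7, next_w);
			if (nwords > 0)
				next_w = *in++;	/* start the next load early */
			acc ^= w;
		}
		return acc;
	}

	int main(void)
	{
		unsigned data[] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };

		printf("%08x\n", mix_words(data, 4));
		return 0;
	}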
---

 25-akpm/drivers/char/random.c    |   51 ++++++++++++++++++++++++++-------------
 25-akpm/include/linux/prefetch.h |   12 +++++++++
 2 files changed, 47 insertions(+), 16 deletions(-)

diff -puN drivers/char/random.c~urandom-scalability-fix drivers/char/random.c
--- 25/drivers/char/random.c~urandom-scalability-fix	2004-04-03 03:00:04.198593272 -0800
+++ 25-akpm/drivers/char/random.c	2004-04-03 03:00:04.205592208 -0800
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32
  **********************************************************************/

 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32 *pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned add_ptr;
 	int entropy_count;
 	int input_rotate;
-	struct poolinfo poolinfo;
-	__u32 *pool;
-	spinlock_t lock;
 };

 /*
@@ -571,38 +574,54 @@ static void add_entropy_words(struct ent
 	static __u32 const twist_table[8] = {
 		         0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;

+	/* Taps are constant, so we can load them without holding r->lock.  */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;

 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;

 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;

 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }

diff -puN include/linux/prefetch.h~urandom-scalability-fix include/linux/prefetch.h
--- 25/include/linux/prefetch.h~urandom-scalability-fix	2004-04-03 03:00:04.200592968 -0800
+++ 25-akpm/include/linux/prefetch.h	2004-04-03 03:00:04.205592208 -0800
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H

+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>

@@ -54,4 +55,15 @@ static inline void prefetchw(const void
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif

+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
+
 #endif
_