From: Manfred Spraul

The memmove implementation on i386 is not optimized: it uses movsb, which is far slower than movsd. The optimization is trivial: if dest is less than source, then call memcpy(). markw tried it on a 4x Xeon with dbt2; it saved around 300 million CPU ticks in cache_flusharray():

oprofile, GLOBAL_POWER_EVENTS, count 100k

Before:

c0144ed1 <cache_flusharray>: /* cache_flusharray total: 21823 0.0165 */
    6 4.5e-06 :c0144f8e:  cmp    %esi,%ebx
   11 8.3e-06 :c0144f90:  jae    c0144f9e
    3 2.3e-06 :c0144f92:  mov    %ebx,%edi
 7305 0.0055  :c0144f94:  repz movsb %ds:(%esi),%es:(%edi)
  201 1.5e-04 :c0144f96:  add    $0x10,%esp

After:

c0144f1d <cache_flusharray>: /* cache_flusharray total: 17959 0.0136 */
 1270 9.6e-04 :c0144f1d:  push   %ebp
[snip]
    6 4.6e-06 :c0144fdc:  cmp    %esi,%ebx
   13 9.9e-06 :c0144fde:  jae    c0145000
    2 1.5e-06 :c0144fe0:  mov    %edx,%eax
    1 7.6e-07 :c0144fe2:  mov    %ebx,%edi
   11 8.4e-06 :c0144fe4:  shr    $0x2,%eax
    1 7.6e-07 :c0144fe7:  mov    %eax,%ecx
 4129 0.0031  :c0144fe9:  repz movsl %ds:(%esi),%es:(%edi)
  261 2.0e-04 :c0144feb:  test   $0x2,%dl
   27 2.1e-05 :c0144fee:  je     c0144ff2
              :c0144ff0:  movsw  %ds:(%esi),%es:(%edi)
   95 7.2e-05 :c0144ff2:  test   $0x1,%dl
   96 7.3e-05 :c0144ff5:  je     c0144ff8
              :c0144ff7:  movsb  %ds:(%esi),%es:(%edi)
  121 9.2e-05 :c0144ff8:  add    $0x1c,%esp

 25-akpm/include/asm-i386/string.h |   11 +++--------
 1 files changed, 3 insertions(+), 8 deletions(-)

diff -puN include/asm-i386/string.h~memmove-speedup include/asm-i386/string.h
--- 25/include/asm-i386/string.h~memmove-speedup	Fri Nov  7 15:10:26 2003
+++ 25-akpm/include/asm-i386/string.h	Fri Nov  7 15:10:26 2003
@@ -299,14 +299,9 @@ extern void __struct_cpy_bug (void);
 static inline void * memmove(void * dest,const void * src, size_t n)
 {
 int d0, d1, d2;
-if (dest
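
The patch hunk is truncated above. For illustration only, here is a minimal C sketch of the idea the patch describes, not the kernel's inline-asm code: when dest is below src, an overlapping copy can safely run forward, so the fast word-at-a-time memcpy() can be used; only the dest >= src case needs the slow backward byte copy. The name memmove_sketch and the plain-C backward loop are placeholders for this example.

#include <stddef.h>
#include <string.h>

/*
 * Illustrative sketch (hypothetical helper, not the patch's
 * inline-asm implementation): copy forward via memcpy() when it is
 * safe, fall back to a backward byte copy otherwise.
 */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
	if ((const char *)dest < (const char *)src) {
		/* dest below src: a forward copy never clobbers source
		 * bytes before they are read, so the fast memcpy()
		 * (rep movsl on i386) is safe even for overlap. */
		return memcpy(dest, src, n);
	}

	/* dest at or above src: copy backward, byte by byte, so each
	 * byte is read before its location is overwritten. */
	{
		const char *s = (const char *)src + n;
		char *d = (char *)dest + n;

		while (n--)
			*--d = *--s;
	}
	return dest;
}

The win comes entirely from the dest < src path: rep movsl moves four bytes per iteration where rep movsb moves one, which is what the before/after profiles above show.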