From: Manfred Spraul

The memmove implementation on i386 is not optimized: it uses movsb, which is far slower than movsd. The optimization is trivial: if dest is less than source, then call memcpy(). markw tried it on a 4x Xeon with dbt2; it saved around 300 million CPU ticks in cache_flusharray():

oprofile, GLOBAL_POWER_EVENTS, count 100k

Before:

c0144ed1 <cache_flusharray>: /* cache_flusharray total: 21823 0.0165 */
    6 4.5e-06 :c0144f8e:  cmp    %esi,%ebx
   11 8.3e-06 :c0144f90:  jae    c0144f9e
    3 2.3e-06 :c0144f92:  mov    %ebx,%edi
 7305 0.0055  :c0144f94:  repz movsb %ds:(%esi),%es:(%edi)
  201 1.5e-04 :c0144f96:  add    $0x10,%esp

After:

c0144f1d <cache_flusharray>: /* cache_flusharray total: 17959 0.0136 */
 1270 9.6e-04 :c0144f1d:  push   %ebp
[snip]
    6 4.6e-06 :c0144fdc:  cmp    %esi,%ebx
   13 9.9e-06 :c0144fde:  jae    c0145000
    2 1.5e-06 :c0144fe0:  mov    %edx,%eax
    1 7.6e-07 :c0144fe2:  mov    %ebx,%edi
   11 8.4e-06 :c0144fe4:  shr    $0x2,%eax
    1 7.6e-07 :c0144fe7:  mov    %eax,%ecx
 4129 0.0031  :c0144fe9:  repz movsl %ds:(%esi),%es:(%edi)
  261 2.0e-04 :c0144feb:  test   $0x2,%dl
   27 2.1e-05 :c0144fee:  je     c0144ff2
              :c0144ff0:  movsw  %ds:(%esi),%es:(%edi)
   95 7.2e-05 :c0144ff2:  test   $0x1,%dl
   96 7.3e-05 :c0144ff5:  je     c0144ff8
              :c0144ff7:  movsb  %ds:(%esi),%es:(%edi)
  121 9.2e-05 :c0144ff8:  add    $0x1c,%esp

 25-akpm/include/asm-i386/string.h |   11 +++--------
 1 files changed, 3 insertions(+), 8 deletions(-)

diff -puN include/asm-i386/string.h~memmove-speedup include/asm-i386/string.h
--- 25/include/asm-i386/string.h~memmove-speedup	Fri Nov  7 15:10:26 2003
+++ 25-akpm/include/asm-i386/string.h	Fri Nov  7 15:10:26 2003
@@ -299,14 +299,9 @@ extern void __struct_cpy_bug (void);
 static inline void * memmove(void * dest,const void * src, size_t n)
 {
 int d0, d1, d2;
-if (dest
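
The patch hunk is truncated above. For illustration only, here is a minimal C sketch of the idea the patch describes, not the kernel's inline-asm code: when dest is below src, an overlapping copy can safely run forward, so the fast word-at-a-time memcpy() can be used; only the dest >= src case needs the slow backward byte copy. The name memmove_sketch and the plain-C backward loop are placeholders for this example.

#include <stddef.h>
#include <string.h>

/*
 * Illustrative sketch (hypothetical helper, not the patch's
 * inline-asm implementation): copy forward via memcpy() when it is
 * safe, fall back to a backward byte copy otherwise.
 */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
	if ((const char *)dest < (const char *)src) {
		/* dest below src: a forward copy never clobbers source
		 * bytes before they are read, so the fast memcpy()
		 * (rep movsl on i386) is safe even for overlap. */
		return memcpy(dest, src, n);
	}

	/* dest at or above src: copy backward, byte by byte, so each
	 * byte is read before its location is overwritten. */
	{
		const char *s = (const char *)src + n;
		char *d = (char *)dest + n;

		while (n--)
			*--d = *--s;
	}
	return dest;
}

The win comes entirely from the dest < src path: rep movsl moves four bytes per iteration where rep movsb moves one, which is what the before/after profiles above show.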