From: Manfred Spraul Real 80386 cpus ignore the write protected bit in the page tables when running in supervisory mode. Thus the write protected bit must be checked by software. The current implementation does that check during access_ok(). This can result in data corruptions, if kswapd starts a swap-out between the access_ok and the actual write operation. To fix this, the patch moves the check from access_ok() into __copy_to_user_ll(), and redirects all user space writes into __copy_to_user_ll(). The patch only affects kernels build for 80386 cpus. Additionally, the patch removes the dead prototypes for __put_user_{1,2,4,8}. Due to the uninlining of access_ok, the .text segment is now ~ 8 kB shorter. arch/i386/lib/usercopy.c | 50 +++++++++++++++++++++++++++ arch/i386/mm/fault.c | 81 --------------------------------------------- include/asm-i386/uaccess.h | 30 +++++++--------- 3 files changed, 64 insertions(+), 97 deletions(-) diff -puN arch/i386/lib/usercopy.c~386-access_ok-race-fix arch/i386/lib/usercopy.c --- 25/arch/i386/lib/usercopy.c~386-access_ok-race-fix 2003-05-05 19:04:28.000000000 -0700 +++ 25-akpm/arch/i386/lib/usercopy.c 2003-05-05 19:04:28.000000000 -0700 @@ -6,6 +6,9 @@ * Copyright 1997 Linus Torvalds */ #include +#include +#include +#include #include #include @@ -483,6 +486,53 @@ do { \ unsigned long __copy_to_user_ll(void *to, const void *from, unsigned long n) { +#ifndef CONFIG_X86_WP_WORKS_OK + if (unlikely(boot_cpu_data.wp_works_ok == 0) && + ((unsigned long )to) < TASK_SIZE) { + /* + * CPU does not honor the WP bit when writing + * from supervisory mode, and due to preemption or SMP, + * the page tables can change at any time. + * Do it manually. Manfred + */ + while (n) { + unsigned long offset = ((unsigned long)to)%PAGE_SIZE; + unsigned long len = PAGE_SIZE - offset; + int retval; + struct page *pg; + void *maddr; + + if (len > n) + len = n; + +survive: + down_read(¤t->mm->mmap_sem); + retval = get_user_pages(current, current->mm, + (unsigned long )to, 1, 1, 0, &pg, NULL); + + if (retval == -ENOMEM && current->pid == 1) { + up_read(¤t->mm->mmap_sem); + blk_congestion_wait(WRITE, HZ/50); + goto survive; + } + + if (retval != 1) + break; + + maddr = kmap_atomic(pg, KM_USER0); + memcpy(maddr + offset, from, len); + kunmap_atomic(maddr, KM_USER0); + set_page_dirty_lock(pg); + put_page(pg); + up_read(¤t->mm->mmap_sem); + + from += len; + to += len; + n -= len; + } + return n; + } +#endif if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else diff -puN arch/i386/mm/fault.c~386-access_ok-race-fix arch/i386/mm/fault.c --- 25/arch/i386/mm/fault.c~386-access_ok-race-fix 2003-05-05 19:04:28.000000000 -0700 +++ 25-akpm/arch/i386/mm/fault.c 2003-05-05 19:04:28.000000000 -0700 @@ -29,87 +29,6 @@ extern void die(const char *,struct pt_regs *,long); -#ifndef CONFIG_X86_WP_WORKS_OK -/* - * Ugly, ugly, but the goto's result in better assembly.. - */ -int __verify_write(const void * addr, unsigned long size) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct * vma; - unsigned long start = (unsigned long) addr; - - if (!size || segment_eq(get_fs(),KERNEL_DS)) - return 1; - - down_read(&mm->mmap_sem); - vma = find_vma(current->mm, start); - if (!vma) - goto bad_area; - if (vma->vm_start > start) - goto check_stack; - -good_area: - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - size--; - size += start & ~PAGE_MASK; - size >>= PAGE_SHIFT; - start &= PAGE_MASK; - - for (;;) { - survive: - switch (handle_mm_fault(current->mm, vma, start, 1)) { - case VM_FAULT_SIGBUS: - goto bad_area; - case VM_FAULT_OOM: - goto out_of_memory; - case VM_FAULT_MINOR: - case VM_FAULT_MAJOR: - break; - default: - BUG(); - } - if (!size) - break; - size--; - start += PAGE_SIZE; - if (start < vma->vm_end) - continue; - vma = vma->vm_next; - if (!vma || vma->vm_start != start) - goto bad_area; - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area;; - } - /* - * We really need to hold mmap_sem over the whole access to - * userspace, else another thread could change permissions. - * This is unfixable, so don't use i386-class machines for - * critical servers. - */ - up_read(&mm->mmap_sem); - return 1; - -check_stack: - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, start) == 0) - goto good_area; - -bad_area: - up_read(&mm->mmap_sem); - return 0; - -out_of_memory: - if (current->pid == 1) { - yield(); - goto survive; - } - goto bad_area; -} -#endif - /* * Unlock any spinlocks which will prevent us from getting the * message out diff -puN include/asm-i386/uaccess.h~386-access_ok-race-fix include/asm-i386/uaccess.h --- 25/include/asm-i386/uaccess.h~386-access_ok-race-fix 2003-05-05 19:04:28.000000000 -0700 +++ 25-akpm/include/asm-i386/uaccess.h 2003-05-05 19:04:28.000000000 -0700 @@ -62,8 +62,6 @@ int __verify_write(const void *, unsigne :"1" (addr),"g" ((int)(size)),"g" (current_thread_info()->addr_limit.seg)); \ flag; }) -#ifdef CONFIG_X86_WP_WORKS_OK - /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -85,14 +83,6 @@ int __verify_write(const void *, unsigne */ #define access_ok(type,addr,size) (__range_ok(addr,size) == 0) -#else - -#define access_ok(type,addr,size) ( (__range_ok(addr,size) == 0) && \ - ((type) == VERIFY_READ || boot_cpu_data.wp_works_ok || \ - __verify_write((void *)(addr),(size)))) - -#endif - /** * verify_area: - Obsolete, use access_ok() * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE @@ -191,14 +181,8 @@ extern void __get_user_4(void); __ret_gu; \ }) -extern void __put_user_1(void); -extern void __put_user_2(void); -extern void __put_user_4(void); -extern void __put_user_8(void); - extern void __put_user_bad(void); - /** * put_user: - Write a simple value into user space. * @x: Value to copy to user space. @@ -299,6 +283,8 @@ extern void __put_user_bad(void); : "=r"(err) \ : "A" (x), "r" (addr), "i"(-EFAULT), "0"(err)) +#ifdef CONFIG_X86_WP_WORKS_OK + #define __put_user_size(x,ptr,size,retval,errret) \ do { \ retval = 0; \ @@ -311,6 +297,18 @@ do { \ } \ } while (0) +#else + +#define __put_user_size(x,ptr,size,retval,errret) \ +do { \ + __typeof__(*(ptr)) __pus_tmp = x; \ + retval = 0; \ + \ + if(unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \ + retval = errret; \ +} while (0) + +#endif struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct *)(x)) _