From: Ingo Molnar /dev/null | 0 Makefile | 0 arch/i386/Kconfig | 37 +++ arch/i386/boot/setup.S | 2 arch/i386/kernel/Makefile | 2 arch/i386/kernel/asm-offsets.c | 15 + arch/i386/kernel/cpu/common.c | 10 - arch/i386/kernel/cpu/intel.c | 4 arch/i386/kernel/doublefault.c | 7 arch/i386/kernel/entry.S | 222 ++++++++++++++++++----- arch/i386/kernel/entry_trampoline.c | 75 +++++++ arch/i386/kernel/head.S | 49 +++-- arch/i386/kernel/i386_ksyms.c | 7 arch/i386/kernel/i387.c | 24 +- arch/i386/kernel/init_task.c | 4 arch/i386/kernel/ldt.c | 132 +++++++++----- arch/i386/kernel/mpparse.c | 2 arch/i386/kernel/process.c | 37 +++ arch/i386/kernel/reboot.c | 13 - arch/i386/kernel/signal.c | 95 ++++------ arch/i386/kernel/smp.c | 22 -- arch/i386/kernel/sysenter.c | 5 arch/i386/kernel/traps.c | 75 ++++++- arch/i386/kernel/vm86.c | 5 arch/i386/kernel/vsyscall-sysenter.S | 5 arch/i386/kernel/vsyscall.lds | 2 arch/i386/lib/checksum.S | 6 arch/i386/lib/getuser.S | 7 arch/i386/lib/usercopy.c | 11 - arch/i386/mm/extable.c | 56 +++++ arch/i386/mm/fault.c | 8 arch/i386/mm/init.c | 329 +++++++++++++++++++---------------- arch/i386/mm/pgtable.c | 90 ++++++--- drivers/block/scsi_ioctl.c | 2 include/asm-i386/atomic_kmap.h | 95 ++++++++++ include/asm-i386/checksum.h | 30 ++- include/asm-i386/desc.h | 37 +-- include/asm-i386/fixmap.h | 39 ++-- include/asm-i386/highmem.h | 11 - include/asm-i386/kmap_types.h | 50 ++--- include/asm-i386/mmu.h | 5 include/asm-i386/mmu_context.h | 12 + include/asm-i386/page.h | 37 ++- include/asm-i386/pgtable.h | 14 + include/asm-i386/processor.h | 13 - include/asm-i386/string.h | 23 ++ include/asm-i386/thread_info.h | 20 -- include/asm-i386/tlbflush.h | 9 include/asm-i386/uaccess.h | 127 +++++++++++-- kernel/futex.c | 18 + kernel/ksyms.c | 1 mm/Makefile | 3 mm/memory.c | 19 +- mm/usercopy.c | 269 ++++++++++++++++++++++++++++ 54 files changed, 1644 insertions(+), 548 deletions(-) diff -puN arch/i386/boot/setup.S~4g-2.6.0-test2-mm2-A5 arch/i386/boot/setup.S --- 
25/arch/i386/boot/setup.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/boot/setup.S 2003-08-07 15:24:50.000000000 -0700 @@ -162,7 +162,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd diff -puN arch/i386/Kconfig~4g-2.6.0-test2-mm2-A5 arch/i386/Kconfig --- 25/arch/i386/Kconfig~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/Kconfig 2003-08-07 15:24:50.000000000 -0700 @@ -397,6 +397,43 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_4G + bool "4 GB kernel-space and 4 GB user-space virtual memory support" + help + This option is only useful for systems that have more than 1 GB + of RAM. + + The default kernel VM layout leaves 1 GB of virtual memory for + kernel-space mappings, and 3 GB of VM for user-space applications. + This option ups both the kernel-space VM and the user-space VM to + 4 GB. + + The cost of this option is additional TLB flushes done at + system-entry points that transition from user-mode into kernel-mode. + I.e. system calls and page faults, and IRQs that interrupt user-mode + code. There's also additional overhead to kernel operations that copy + memory to/from user-space. The overhead from this is hard to tell and + depends on the workload - it can be anything from no visible overhead + to 20-30% overhead. A good rule of thumb is to count with a runtime + overhead of 20%. + + The upside is the much increased kernel-space VM, which more than + quadruples the maximum amount of RAM supported. Kernels compiled with + this option boot on 64GB of RAM and still have more than 3.1 GB of + 'lowmem' left. 
Another bonus is that highmem IO bouncing decreases, + if used with drivers that still use bounce-buffers. + + There's also a 33% increase in user-space VM size - database + applications might see a boost from this. + + But the cost of the TLB flushes and the runtime overhead has to be + weighed against the bonuses offered by the larger VM spaces. The + dividing line depends on the actual workload - there might be 4 GB + systems that benefit from this option. Systems with less than 4 GB + of RAM will rarely see a benefit from this option - but it's not + out of question, the exact circumstances have to be considered. + + config HUGETLB_PAGE bool "Huge TLB Page Support" help diff -puN arch/i386/kernel/asm-offsets.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/asm-offsets.c --- 25/arch/i386/kernel/asm-offsets.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/asm-offsets.c 2003-08-07 15:24:50.000000000 -0700 @@ -4,9 +4,11 @@ * to extract and format the required data. 
*/ +#include #include #include #include "sigframe.h" +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -28,4 +30,17 @@ void foo(void) DEFINE(RT_SIGFRAME_sigcontext, offsetof (struct rt_sigframe, uc.uc_mcontext)); + DEFINE(TI_task, offsetof (struct thread_info, exec_domain)); + DEFINE(TI_exec_domain, offsetof (struct thread_info, exec_domain)); + DEFINE(TI_flags, offsetof (struct thread_info, flags)); + DEFINE(TI_preempt_count, offsetof (struct thread_info, preempt_count)); + DEFINE(TI_addr_limit, offsetof (struct thread_info, addr_limit)); + DEFINE(TI_real_stack, offsetof (struct thread_info, real_stack)); + DEFINE(TI_virtual_stack, offsetof (struct thread_info, virtual_stack)); + DEFINE(TI_user_pgd, offsetof (struct thread_info, user_pgd)); + + DEFINE(FIX_ENTRY_TRAMPOLINE_0_addr, __fix_to_virt(FIX_ENTRY_TRAMPOLINE_0)); + DEFINE(FIX_VSYSCALL_addr, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(task_thread_db7, offsetof (struct task_struct, thread.debugreg[7])); } diff -puN arch/i386/kernel/cpu/common.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/cpu/common.c --- 25/arch/i386/kernel/cpu/common.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/common.c 2003-08-07 15:24:50.000000000 -0700 @@ -510,16 +510,20 @@ void __init cpu_init (void) BUG(); enter_lazy_tlb(&init_mm, current); - load_esp0(t, thread->esp0); - set_tss_desc(cpu,t); + t->esp0 = thread->esp0; + set_tss_desc(cpu, t); cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); - load_LDT(&init_mm.context); + if (cpu) + load_LDT(&init_mm.context); /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + if (cpu) + trap_init_virtual_GDT(); + /* Clear %fs and %gs. 
*/ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); diff -puN arch/i386/kernel/cpu/intel.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/cpu/intel.c --- 25/arch/i386/kernel/cpu/intel.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/intel.c 2003-08-07 15:24:50.000000000 -0700 @@ -8,11 +8,11 @@ #include #include #include +#include #include "cpu.h" static int disable_P4_HT __initdata = 0; -extern int trap_init_f00f_bug(void); #ifdef CONFIG_X86_INTEL_USERCOPY /* @@ -165,7 +165,7 @@ static void __init init_intel(struct cpu c->f00f_bug = 1; if ( !f00f_workaround_enabled ) { - trap_init_f00f_bug(); + trap_init_virtual_IDT(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } diff -puN arch/i386/kernel/doublefault.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/doublefault.c --- 25/arch/i386/kernel/doublefault.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/doublefault.c 2003-08-07 15:24:50.000000000 -0700 @@ -7,12 +7,13 @@ #include #include #include +#include #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) -#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) +#define ptr_ok(x) (((x) > __PAGE_OFFSET && (x) < (__PAGE_OFFSET + 0x01000000)) || ((x) >= FIXADDR_START)) static void doublefault_fn(void) { @@ -38,8 +39,8 @@ static void doublefault_fn(void) printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", t->eax, t->ebx, t->ecx, t->edx); - printk("esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + printk("esi = %08lx, edi = %08lx, ebp = %08lx\n", + t->esi, t->edi, t->ebp); } } diff -puN arch/i386/kernel/entry.S~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/entry.S --- 25/arch/i386/kernel/entry.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/entry.S 2003-08-07 
15:24:50.000000000 -0700 @@ -43,8 +43,10 @@ #include #include #include +#include #include #include +#include #include #include #include "irq_vectors.h" @@ -97,7 +99,87 @@ TSS_ESP0_OFFSET = (4 - 0x200) #define resume_kernel restore_all #endif -#define SAVE_ALL \ +#ifdef CONFIG_X86_HIGH_ENTRY + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +/* clobbers ebx, edx and ebp */ + +#define __SWITCH_KERNELSPACE \ + cmpl $0xff000000, %esp; \ + jb 1f; \ + \ + /* \ + * switch pagetables and load the real stack, \ + * keep the stack offset: \ + */ \ + \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + \ + /* GET_THREAD_INFO(%ebp) intermixed */ \ + \ + movl %esp, %ebp; \ + movl %esp, %ebx; \ + andl $0xffffe000, %ebp; \ + andl $0x00001fff, %ebx; \ + orl TI_real_stack(%ebp), %ebx; \ + \ + movl %edx, %cr3; \ + movl %ebx, %esp; \ +1: + +#endif + + +#define __SWITCH_USERSPACE \ + /* interrupted any of the user return paths? */ \ + \ + movl EIP(%esp), %eax; \ + \ + cmpl $int80_ret_start_marker, %eax; \ + jb 33f; /* nope - continue with sysexit check */\ + cmpl $int80_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + cmpl $sysexit_ret_start_marker, %eax; \ + jb 44f; /* nope - continue with user check */ \ + cmpl $sysexit_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ + /* return to userspace? */ \ +44: \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* \ + * switch to the virtual stack, then switch to \ + * the userspace pagetables. 
\ + */ \ + \ + GET_THREAD_INFO(%ebp); \ + movl TI_virtual_stack(%ebp), %edx; \ + movl TI_user_pgd(%ebp), %ecx; \ + \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ +int80_ret_start_marker: \ + movl %edx, %esp; \ + movl %ecx, %cr3; \ + \ + __RESTORE_ALL; \ +int80_ret_end_marker: \ +2: + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define __SWITCH_KERNELSPACE +#define __SWITCH_USERSPACE + +#endif + +#define __SAVE_ALL \ cld; \ pushl %es; \ pushl %ds; \ @@ -110,10 +192,9 @@ TSS_ESP0_OFFSET = (4 - 0x200) pushl %ebx; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; \ - STACK_OVERFLOW_TEST + movl %edx, %es; -#define RESTORE_INT_REGS \ +#define __RESTORE_INT_REGS \ popl %ebx; \ popl %ecx; \ popl %edx; \ @@ -122,29 +203,28 @@ TSS_ESP0_OFFSET = (4 - 0x200) popl %ebp; \ popl %eax -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ -2: popl %es; \ +#define __RESTORE_REGS \ + __RESTORE_INT_REGS; \ +111: popl %ds; \ +222: popl %es; \ .section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ -4: movl $0,(%esp); \ - jmp 2b; \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ + .long 111b,444b;\ + .long 222b,555b;\ .previous - -#define RESTORE_ALL \ - RESTORE_REGS \ +#define __RESTORE_ALL \ + __RESTORE_REGS \ addl $4, %esp; \ -1: iret; \ +333: iret; \ .section .fixup,"ax"; \ -2: sti; \ +666: sti; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ @@ -153,10 +233,19 @@ TSS_ESP0_OFFSET = (4 - 0x200) .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,2b; \ + .long 333b,666b;\ .previous +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH_KERNELSPACE; \ + STACK_OVERFLOW_TEST; + +#define RESTORE_ALL \ + __SWITCH_USERSPACE; \ + __RESTORE_ALL; +.section .entry.text,"ax" ENTRY(lcall7) pushfl # We get a different stack layout with call @@ -174,7 +263,7 @@ do_lcall: movl %edx,EIP(%ebp) # Now we move them to their "normal" 
places movl %ecx,CS(%ebp) # andl $-8192, %ebp # GET_THREAD_INFO - movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain + movl TI_exec_domain(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp popl %eax @@ -219,7 +308,7 @@ ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? jne work_pending @@ -227,18 +316,18 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cmpl $0,TI_PRE_COUNT(%ebp) # non-zero preempt_count ? + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all need_resched: - movl TI_FLAGS(%ebp), %ecx # need_resched set ? + movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all - movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) + movl $PREEMPT_ACTIVE,TI_preempt_count(%ebp) sti call schedule - movl $0,TI_PRE_COUNT(%ebp) + movl $0,TI_preempt_count(%ebp) cli jmp need_resched #endif @@ -257,37 +346,50 @@ sysenter_past_esp: pushl $(__USER_CS) pushl $SYSENTER_RETURN -/* - * Load the potential sixth argument from user stack. - * Careful about security. 
- */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault -1: movl (%ebp),%ebp -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous - pushl %eax SAVE_ALL GET_THREAD_INFO(%ebp) cmpl $(nr_syscalls), %eax jae syscall_badsys - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $_TIF_SYSCALL_TRACE,TI_flags(%ebp) jnz syscall_trace_entry call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) cli - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + + GET_THREAD_INFO(%ebp) + movl TI_virtual_stack(%ebp), %edx + movl TI_user_pgd(%ebp), %ecx + movl %esp, %ebx + andl $0x1fff, %ebx + orl %ebx, %edx +sysexit_ret_start_marker: + movl %edx, %esp + movl %ecx, %cr3 +#endif + /* + * only ebx is not restored by the userspace sysenter vsyscall + * code, it assumes it to be callee-saved. + */ + movl EBX(%esp), %ebx + /* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx movl OLDESP(%esp), %ecx + sti sysexit +#ifdef CONFIG_X86_SWITCH_PAGETABLES +sysexit_ret_end_marker: + nop +#endif # system call handler stub @@ -298,7 +400,7 @@ ENTRY(system_call) cmpl $(nr_syscalls), %eax jae syscall_badsys # system call tracing in operation - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $_TIF_SYSCALL_TRACE,TI_flags(%ebp) jnz syscall_trace_entry syscall_call: call *sys_call_table(,%eax,4) @@ -307,7 +409,7 @@ syscall_exit: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: @@ -317,7 +419,7 @@ restore_all: testl $(VM_MASK | 3), %eax jz resume_kernelX # returning to kernel or vm86-space - cmpl $0,TI_PRE_COUNT(%ebx) # non-zero preempt_count ? + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 
jz resume_kernelX int $3 @@ -336,7 +438,7 @@ work_resched: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all @@ -351,6 +453,21 @@ work_notifysig: # deal with pending s # vm86-space xorl %edx, %edx call do_notify_resume + +#if CONFIG_X86_HIGH_ENTRY + /* + * Reload db7 if necessary: + */ + testb $_TIF_DB7, %cl + jnz work_db7 + + jmp restore_all + +work_db7: + movl TI_task(%ebp), %edx; + movl task_thread_db7(%edx), %edx; + movl %edx, %db7; +#endif jmp restore_all ALIGN @@ -406,7 +523,7 @@ syscall_badsys: */ .data ENTRY(interrupt) -.text +.previous vector=0 ENTRY(irq_entries_start) @@ -416,7 +533,7 @@ ENTRY(irq_entries_start) jmp common_interrupt .data .long 1b -.text +.previous vector=vector+1 .endr @@ -457,12 +574,17 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp, %edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es + +/* clobbers edx, ebx and ebp */ + __SWITCH_KERNELSPACE + + leal 4(%esp), %edx # prepare pt_regs + pushl %edx # push pt_regs + call *%edi addl $8, %esp jmp ret_from_exception @@ -553,7 +675,7 @@ nmi_stack_correct: pushl %edx call do_nmi addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -630,6 +752,8 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code +.previous + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -puN /dev/null arch/i386/kernel/entry_trampoline.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/arch/i386/kernel/entry_trampoline.c 2003-08-07 15:24:50.000000000 -0700 @@ -0,0 +1,75 @@ +/* + * 
linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char __entry_tramp_start, __entry_tramp_end, __start___entry_text; + +void __init init_entry_mappings(void) +{ +#ifdef CONFIG_X86_HIGH_ENTRY + void *tramp; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE_0, __pa((unsigned long)&__entry_tramp_start), PAGE_KERNEL); + __set_fixmap(FIX_ENTRY_TRAMPOLINE_1, __pa((unsigned long)&__entry_tramp_start) + PAGE_SIZE, PAGE_KERNEL); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE_0); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + BUG_ON((void *)&__start___entry_text != tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK0) & 8191); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&__entry_tramp_end - (unsigned int)&__entry_tramp_start > 2*PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + current->thread.stack_page0 = virt_to_page((char *)current->thread_info); + current->thread.stack_page1 = virt_to_page((char *)current->thread_info + PAGE_SIZE); + current->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); + + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(current->thread.stack_page0, KM_VSTACK0); + __kmap_atomic(current->thread.stack_page1, KM_VSTACK1); + +#endif + printk("current: %p\n", current); + printk("current->thread_info: %p\n", current->thread_info); + current->thread_info->real_stack = (void *)current->thread_info; + current->thread_info->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->thread_info->real_stack + THREAD_SIZE; +} + + + +void __init entry_trampoline_setup(void) +{ + /* 
+ * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} diff -puN arch/i386/kernel/head.S~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/head.S --- 25/arch/i386/kernel/head.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/head.S 2003-08-07 15:24:50.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -330,7 +331,7 @@ ENTRY(stack_start) /* This is the default interrupt "handler" :-) */ int_msg: - .asciz "Unknown interrupt\n" + .asciz "Unknown interrupt or fault at EIP %p %p %p\n" ALIGN ignore_int: cld @@ -342,9 +343,17 @@ ignore_int: movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) pushl $int_msg call printk popl %eax + popl %eax + popl %eax + popl %eax + popl %eax popl %ds popl %es popl %edx @@ -377,23 +386,27 @@ cpu_gdt_descr: .fill NR_CPUS-1,8,0 # space for the other GDT descriptors /* - * This is initialized to create an identity-mapping at 0-8M (for bootup - * purposes) and another mapping of the 0-8M area at virtual address + * This is initialized to create an identity-mapping at 0-16M (for bootup + * purposes) and another mapping of the 0-16M area at virtual address * PAGE_OFFSET. 
*/ .org 0x1000 ENTRY(swapper_pg_dir) .long 0x00102007 .long 0x00103007 - .fill BOOT_USER_PGD_PTRS-2,4,0 - /* default: 766 entries */ + .long 0x00104007 + .long 0x00105007 + .fill BOOT_USER_PGD_PTRS-4,4,0 + /* default: 764 entries */ .long 0x00102007 .long 0x00103007 - /* default: 254 entries */ - .fill BOOT_KERNEL_PGD_PTRS-2,4,0 + .long 0x00104007 + .long 0x00105007 + /* default: 252 entries */ + .fill BOOT_KERNEL_PGD_PTRS-4,4,0 /* - * The page tables are initialized to only 8MB here - the final page + * The page tables are initialized to only 16MB here - the final page * tables are set up later depending on memory size. */ .org 0x2000 @@ -402,15 +415,21 @@ ENTRY(pg0) .org 0x3000 ENTRY(pg1) +.org 0x4000 +ENTRY(pg2) + +.org 0x5000 +ENTRY(pg3) + /* * empty_zero_page must immediately follow the page tables ! (The * initialization loop counts until empty_zero_page) */ -.org 0x4000 +.org 0x6000 ENTRY(empty_zero_page) -.org 0x5000 +.org 0x7000 /* * Real beginning of normal "text" segment @@ -419,12 +438,12 @@ ENTRY(stext) ENTRY(_stext) /* - * This starts the data section. Note that the above is all - * in the text section because it has alignment requirements - * that we cannot fulfill any other way. + * This starts the data section. */ .data +.align PAGE_SIZE_asm + /* * The Global Descriptor Table contains 28 quadwords, per-CPU. 
*/ @@ -439,7 +458,9 @@ ENTRY(boot_gdt_table) .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ #endif - .align L1_CACHE_BYTES + +.align PAGE_SIZE_asm + ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ diff -puN arch/i386/kernel/i386_ksyms.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/i386_ksyms.c --- 25/arch/i386/kernel/i386_ksyms.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/i386_ksyms.c 2003-08-07 15:24:50.000000000 -0700 @@ -100,7 +100,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -114,13 +113,17 @@ EXPORT_SYMBOL_NOVERS(__get_user_4); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !defined(CONFIG_X86_UACCESS_INDIRECT) EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__direct_strncpy_from_user); EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_from_user_ll); EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); diff -puN arch/i386/kernel/i387.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/i387.c --- 25/arch/i386/kernel/i387.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/i387.c 2003-08-07 15:24:50.000000000 -0700 @@ -219,6 +219,7 @@ void set_fpu_mxcsr( struct task_struct * static int convert_fxsr_to_user( struct _fpstate __user *buf, struct i387_fxsave_struct *fxsave ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpreg __user *to; struct _fpxreg *from; 
@@ -235,23 +236,25 @@ static int convert_fxsr_to_user( struct if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) return 1; - to = &buf->_st[0]; + to = tmp; from = (struct _fpxreg *) &fxsave->st_space[0]; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f+1); + to->exponent = from->exponent; } + if (copy_to_user(buf->_st, tmp, sizeof(struct _fpreg [8]))) + return 1; return 0; } static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, struct _fpstate __user *buf ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpxreg *to; struct _fpreg __user *from; @@ -259,6 +262,8 @@ static int convert_fxsr_from_user( struc if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) return 1; + if (copy_from_user(tmp, buf->_st, sizeof(struct _fpreg [8]))) + return 1; fxsave->cwd = (unsigned short)(env[0] & 0xffff); fxsave->swd = (unsigned short)(env[1] & 0xffff); @@ -270,15 +275,14 @@ static int convert_fxsr_from_user( struc fxsave->fos = env[6]; to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; + from = tmp; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f + 1); + to->exponent = from->exponent; } return 0; } diff -puN arch/i386/kernel/init_task.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/init_task.c --- 25/arch/i386/kernel/init_task.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/init_task.c 2003-08-07 15:24:50.000000000 -0700 @@ -23,7 +23,7 @@ struct mm_struct init_mm = INIT_MM(init_ */ union thread_union init_thread_union 
__attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union) }; /* * Initial task structure. @@ -39,5 +39,5 @@ struct task_struct init_task = INIT_TASK * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; diff -puN arch/i386/kernel/ldt.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/ldt.c --- 25/arch/i386/kernel/ldt.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/ldt.c 2003-08-07 15:24:50.000000000 -0700 @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -29,34 +31,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; - wmb(); - if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -67,21 +66,20 @@ static int alloc_ldt(mm_context_t *pc, i load_LDT(pc); #endif } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); - if (err < 0) + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); + if (err < 0) { + new->size = 0; return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + } + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } @@ -96,6 +94,7 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); @@ -107,23 +106,21 @@ int init_new_context(struct task_struct /* * No need to lock the MM as we are the last user + * Do not touch the ldt register, we are already 
+ * in the next thread. */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void __user * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -138,8 +135,25 @@ static int read_ldt(void __user * ptr, u size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. + */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + if (copy_to_user(ptr + i, kaddr, size - i)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -158,7 +172,7 @@ static int read_default_ldt(void __user err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -200,7 +214,15 @@ static int write_ldt(void __user * ptr, goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. 
+ */ + preempt_disable(); + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -221,6 +243,7 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + preempt_enable(); out_unlock: up(&mm->context.sem); @@ -248,3 +271,26 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (likely(!count)) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(cpu, (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} diff -puN arch/i386/kernel/Makefile~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/Makefile --- 25/arch/i386/kernel/Makefile~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/Makefile 2003-08-07 15:24:50.000000000 -0700 @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o + doublefault.o entry_trampoline.o obj-y += cpu/ obj-y += timers/ diff -puN arch/i386/kernel/mpparse.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/mpparse.c --- 25/arch/i386/kernel/mpparse.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/mpparse.c 2003-08-07 15:24:50.000000000 -0700 @@ -656,7 +656,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. 
*/ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); diff -puN arch/i386/kernel/process.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/process.c --- 25/arch/i386/kernel/process.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/process.c 2003-08-07 15:24:50.000000000 -0700 @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -252,6 +253,9 @@ void flush_thread(void) struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); +#ifdef CONFIG_X86_HIGH_ENTRY + clear_thread_flag(TIF_DB7); +#endif memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -265,9 +269,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? <%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -302,7 +305,17 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); + /* + * get the two stack pages, for the virtual stack. + * + * IMPORTANT: this code relies on the fact that the task + * structure is an 8K aligned piece of physical memory. 
+ */ + p->thread.stack_page0 = virt_to_page((unsigned long)p->thread_info); + p->thread.stack_page1 = virt_to_page((unsigned long)p->thread_info + PAGE_SIZE); + p->thread.eip = (unsigned long) ret_from_fork; + p->thread_info->real_stack = p->thread_info; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -454,10 +467,26 @@ struct task_struct * __switch_to(struct unlazy_fpu(prev_p); +#ifdef CONFIG_X86_HIGH_ENTRY + /* + * Set the ptes of the virtual stack. (NOTE: a one-page TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) + */ + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(next->stack_page0, KM_VSTACK0); + __kmap_atomic(next->stack_page1, KM_VSTACK1); + /* - * Reload esp0, LDT and the page table pointer: + * Reload esp0: */ - load_esp0(tss, next->esp0); + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); +#endif + load_esp0(tss, virtual_esp0(next_p)); /* * Load the per-thread Thread-Local Storage descriptor. diff -puN arch/i386/kernel/reboot.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/reboot.c --- 25/arch/i386/kernel/reboot.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/reboot.c 2003-08-07 15:24:50.000000000 -0700 @@ -151,12 +151,11 @@ void machine_real_restart(unsigned char CMOS_WRITE(0x00, 0x8f); spin_unlock_irqrestore(&rtc_lock, flags); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. 
*/ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + /* + * Remap the first 16 MB of RAM (which includes the kernel image) + * at virtual address zero: + */ + setup_identity_mappings(swapper_pg_dir, 0, 16*1024*1024); /* * Use `swapper_pg_dir' as our page directory. @@ -258,7 +257,7 @@ void machine_restart(char * __unused) for (;;) { mach_reboot(); /* That didn't work - force a triple fault.. */ - __asm__ __volatile__("lidt %0": :"m" (no_idt)); + __asm__ __volatile__("lidt %0": :"m" (no_idt[0])); __asm__ __volatile__("int3"); } } diff -puN arch/i386/kernel/signal.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/signal.c --- 25/arch/i386/kernel/signal.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/signal.c 2003-08-07 15:24:50.000000000 -0700 @@ -128,25 +128,25 @@ sys_sigaltstack(const stack_t __user *us */ static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) +restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *__sc, int *peax) { - unsigned int err = 0; + struct sigcontext scratch; /* 88 bytes of scratch area */ -#define COPY(x) err |= __get_user(regs->x, &sc->x) + if (copy_from_user(&scratch, __sc, sizeof(scratch))) + return -EFAULT; + +#define COPY(x) regs->x = scratch.x #define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp; } #define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp|3; } #define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ loadsegment(seg,tmp); } GET_SEG(gs); @@ -165,27 +165,23 @@ restore_sigcontext(struct pt_regs *regs, COPY_SEG_STRICT(ss); { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); + unsigned int tmpflags = 
scratch.eflags; regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); regs->orig_eax = -1; /* disable syscall checks */ } { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); + struct _fpstate * buf = scratch.fpstate; if (buf) { if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); + return -EFAULT; + if (restore_i387(buf)) + return -EFAULT; } } - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; + *peax = scratch.eax; + return 0; } asmlinkage int sys_sigreturn(unsigned long __unused) @@ -263,46 +259,47 @@ badframe: */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *__sc, struct _fpstate __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + struct sigcontext sc; /* 88 bytes of scratch area */ + int tmp; tmp = 0; __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->gs); + *(unsigned int *)&sc.gs = tmp; __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int *)&sc->es); - err |= __put_user(regs->xds, (unsigned int *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int *)&sc->ss); + 
*(unsigned int *)&sc.fs = tmp; + *(unsigned int *)&sc.es = regs->xes; + *(unsigned int *)&sc.ds = regs->xds; + sc.edi = regs->edi; + sc.esi = regs->esi; + sc.ebp = regs->ebp; + sc.esp = regs->esp; + sc.ebx = regs->ebx; + sc.edx = regs->edx; + sc.ecx = regs->ecx; + sc.eax = regs->eax; + sc.trapno = current->thread.trap_no; + sc.err = current->thread.error_code; + sc.eip = regs->eip; + *(unsigned int *)&sc.cs = regs->xcs; + sc.eflags = regs->eflags; + sc.esp_at_signal = regs->esp; + *(unsigned int *)&sc.ss = regs->xss; tmp = save_i387(fpstate); if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + return 1; + sc.fpstate = tmp ? fpstate : NULL; /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); + sc.oldmask = mask; + sc.cr2 = current->thread.cr2; - return err; + if (copy_to_user(__sc, &sc, sizeof(sc))) + return 1; + return 0; } /* @@ -440,7 +437,7 @@ static void setup_rt_frame(int sig, stru /* Create the ucontext. 
*/ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(current->sas_ss_sp, (unsigned long *)&frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->esp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); diff -puN arch/i386/kernel/smp.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/smp.c --- 25/arch/i386/kernel/smp.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/smp.c 2003-08-07 15:24:50.000000000 -0700 @@ -327,10 +327,12 @@ asmlinkage void smp_invalidate_interrupt if (flush_mm == cpu_tlbstate[cpu].active_mm) { if (cpu_tlbstate[cpu].state == TLBSTATE_OK) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (flush_va == FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); +#endif } else leave_mm(cpu); } @@ -396,21 +398,6 @@ static void flush_tlb_others(cpumask_t c spin_unlock(&tlbstate_lock); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - void flush_tlb_mm (struct mm_struct * mm) { cpumask_t cpu_mask; @@ -442,7 +429,10 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) - __flush_tlb_one(va); +#ifndef CONFIG_X86_SWITCH_PAGETABLES + __flush_tlb_one(va) +#endif + ; else leave_mm(smp_processor_id()); } diff -puN arch/i386/kernel/sysenter.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/sysenter.c --- 25/arch/i386/kernel/sysenter.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/sysenter.c 2003-08-07 15:24:50.000000000 -0700 @@ -18,13 +18,18 @@ #include #include #include +#include extern asmlinkage void sysenter_entry(void); 
void enable_sep_cpu(void *info) { int cpu = get_cpu(); +#ifdef CONFIG_X86_HIGH_ENTRY + struct tss_struct *tss = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else struct tss_struct *tss = init_tss + cpu; +#endif tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; diff -puN arch/i386/kernel/traps.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/traps.c --- 25/arch/i386/kernel/traps.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/traps.c 2003-08-07 15:24:50.000000000 -0700 @@ -53,12 +53,8 @@ #include "mach_traps.h" -asmlinkage int system_call(void); -asmlinkage void lcall7(void); -asmlinkage void lcall27(void); - -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; +struct desc_struct default_ldt[] __attribute__((__section__(".data.default_ldt"))) = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; +struct page *default_ldt_page; /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; @@ -231,6 +227,7 @@ void show_registers(struct pt_regs *regs printk("\nStack: "); show_stack(NULL, (unsigned long*)esp); +#if 0 printk("Code: "); eip = (u8 *)regs->eip - 43; @@ -246,6 +243,7 @@ void show_registers(struct pt_regs *regs else printk("%02x ", c); } +#endif } printk("\n"); } @@ -595,10 +593,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); - /* Mask out spurious debug traps due to lazy DR7 setting */ + /* + * Mask out spurious debug traps due to lazy DR7 setting or + * due to 4G/4G kernel mode: + */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) goto clear_dr7; + if (!user_mode(regs)) { + // restore upon return-to-userspace: + set_thread_flag(TIF_DB7); + goto clear_dr7; + } } if (regs->eflags & VM_MASK) @@ -860,19 +866,53 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) 
+void __init trap_init_virtual_IDT(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. + * "idt" is magic - it overlaps the idt_descr + * variable so that updating idt will automatically + * update the idt descriptor.. */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __set_fixmap(FIX_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + idt_descr.address = __fix_to_virt(FIX_IDT); + __asm__ __volatile__("lidt %0": "=m" (idt_descr)); } + +void __init trap_init_virtual_GDT(void) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *gdt_desc = cpu_gdt_descr + cpu; + struct Xgt_desc_struct tmp_desc = {0, 0}; + struct tss_struct * t; + + __asm__ __volatile__("sgdt %0": "=m" (tmp_desc): :"memory"); + +#ifdef CONFIG_X86_HIGH_ENTRY + if (!cpu) { + __set_fixmap(FIX_GDT_0, __pa(cpu_gdt_table), PAGE_KERNEL); + __set_fixmap(FIX_GDT_1, __pa(cpu_gdt_table) + PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_0, __pa(init_tss), PAGE_KERNEL); + __set_fixmap(FIX_TSS_1, __pa(init_tss) + 1*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_2, __pa(init_tss) + 2*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_3, __pa(init_tss) + 3*PAGE_SIZE, PAGE_KERNEL); + } + + gdt_desc->address = __fix_to_virt(FIX_GDT_0) + sizeof(cpu_gdt_table[0]) * cpu; +#else + gdt_desc->address = (unsigned long)cpu_gdt_table[cpu]; #endif + __asm__ __volatile__("lgdt %0": "=m" (*gdt_desc)); + +#ifdef CONFIG_X86_HIGH_ENTRY + t = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else + t = init_tss + cpu; +#endif + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); +} #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ @@ -899,17 +939,17 @@ void set_intr_gate(unsigned int n, void _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); } -static void __init set_trap_gate(unsigned int n, void *addr) +void __init set_trap_gate(unsigned int n, void *addr) 
{ _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); } -static void __init set_system_gate(unsigned int n, void *addr) +void __init set_system_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); } -static void __init set_call_gate(void *a, void *addr) +void __init set_call_gate(void *a, void *addr) { _set_gate(a,12,3,addr,__KERNEL_CS); } @@ -941,6 +981,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif + init_entry_mappings(); set_trap_gate(0,÷_error); set_intr_gate(1,&debug); diff -puN arch/i386/kernel/vm86.c~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/vm86.c --- 25/arch/i386/kernel/vm86.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/vm86.c 2003-08-07 15:24:50.000000000 -0700 @@ -117,7 +117,7 @@ struct pt_regs * save_v86_state(struct k tss = init_tss + get_cpu(); current->thread.esp0 = current->thread.saved_esp0; - load_esp0(tss, current->thread.esp0); + load_esp0(tss, virtual_esp0(current)); current->thread.saved_esp0 = 0; put_cpu(); @@ -294,7 +294,8 @@ static void do_sys_vm86(struct kernel_vm asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); tss = init_tss + get_cpu(); - tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tss->esp0 = virtual_esp0(tsk); disable_sysenter(tss); put_cpu(); diff -puN arch/i386/kernel/vsyscall.lds~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/vsyscall.lds --- 25/arch/i386/kernel/vsyscall.lds~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/vsyscall.lds 2003-08-07 15:24:50.000000000 -0700 @@ -5,7 +5,7 @@ */ /* This must match . 
*/ -VSYSCALL_BASE = 0xffffe000; +VSYSCALL_BASE = 0xffffd000; SECTIONS { diff -puN arch/i386/kernel/vsyscall-sysenter.S~4g-2.6.0-test2-mm2-A5 arch/i386/kernel/vsyscall-sysenter.S --- 25/arch/i386/kernel/vsyscall-sysenter.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/kernel/vsyscall-sysenter.S 2003-08-07 15:24:50.000000000 -0700 @@ -7,6 +7,11 @@ .type __kernel_vsyscall,@function __kernel_vsyscall: .LSTART_vsyscall: + cmpl $192, %eax + jne 1f + int $0x80 + ret +1: push %ecx .Lpush_ecx: push %edx diff -puN arch/i386/lib/checksum.S~4g-2.6.0-test2-mm2-A5 arch/i386/lib/checksum.S --- 25/arch/i386/lib/checksum.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/lib/checksum.S 2003-08-07 15:24:50.000000000 -0700 @@ -280,14 +280,14 @@ unsigned int csum_partial_copy_generic ( .previous .align 4 -.globl csum_partial_copy_generic +.globl direct_csum_partial_copy_generic #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: subl $4,%esp pushl %edi pushl %esi @@ -422,7 +422,7 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: pushl %ebx pushl %edi pushl %esi diff -puN arch/i386/lib/getuser.S~4g-2.6.0-test2-mm2-A5 arch/i386/lib/getuser.S --- 25/arch/i386/lib/getuser.S~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/lib/getuser.S 2003-08-07 15:24:50.000000000 -0700 @@ -9,6 +9,7 @@ * return value. 
*/ #include +#include /* @@ -28,7 +29,7 @@ .globl __get_user_1 __get_user_1: GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 1: movzbl (%eax),%edx xorl %eax,%eax @@ -40,7 +41,7 @@ __get_user_2: addl $1,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 2: movzwl -1(%eax),%edx xorl %eax,%eax @@ -52,7 +53,7 @@ __get_user_4: addl $3,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 3: movl -3(%eax),%edx xorl %eax,%eax diff -puN arch/i386/lib/usercopy.c~4g-2.6.0-test2-mm2-A5 arch/i386/lib/usercopy.c --- 25/arch/i386/lib/usercopy.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/lib/usercopy.c 2003-08-07 15:24:50.000000000 -0700 @@ -76,7 +76,7 @@ do { \ * and returns @count. */ long -__strncpy_from_user(char *dst, const char __user *src, long count) +__direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res; __do_strncpy_from_user(dst, src, count, res); @@ -102,7 +102,7 @@ __strncpy_from_user(char *dst, const cha * and returns @count. */ long -strncpy_from_user(char *dst, const char __user *src, long count) +direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) @@ -147,7 +147,7 @@ do { \ * On success, this will be zero. */ unsigned long -clear_user(void __user *to, unsigned long n) +direct_clear_user(void __user *to, unsigned long n) { might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) @@ -167,7 +167,7 @@ clear_user(void __user *to, unsigned lon * On success, this will be zero. */ unsigned long -__clear_user(void __user *to, unsigned long n) +__direct_clear_user(void __user *to, unsigned long n) { __do_clear_user(to, n); return n; @@ -184,7 +184,7 @@ __clear_user(void __user *to, unsigned l * On exception, returns 0. 
* If the string is too long, returns a value greater than @n. */ -long strnlen_user(const char __user *s, long n) +long direct_strnlen_user(const char __user *s, long n) { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; @@ -553,3 +553,4 @@ unsigned long __copy_from_user_ll(void * n = __copy_user_zeroing_intel(to, (const void *) from, n); return n; } + diff -puN arch/i386/mm/extable.c~4g-2.6.0-test2-mm2-A5 arch/i386/mm/extable.c --- 25/arch/i386/mm/extable.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/mm/extable.c 2003-08-07 15:24:50.000000000 -0700 @@ -6,6 +6,52 @@ #include #include #include +#include + +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +/* + * The exception table needs to be sorted because we use the macros + * which put things into the exception table in a variety of sections + * as well as the init section and the main kernel text section. + */ +static inline void +sort_ex_table(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + struct exception_table_entry el, *p, *q; + + /* insertion sort */ + for (p = start + 1; p < finish; ++p) { + /* start .. 
p-1 is sorted */ + if (p[0].insn < p[-1].insn) { + /* move element p down to its right place */ + el = *p; + q = p; + do { + /* el comes before q[-1], move q[-1] up one */ + q[0] = q[-1]; + --q; + } while (q > start && el.insn < q[-1].insn); + *q = el; + } + } +} + +void fixup_sort_exception_table(void) +{ + struct exception_table_entry *p; + + /* + * Fix up the trampoline exception addresses: + */ + for (p = __start___ex_table; p < __stop___ex_table; p++) { + p->insn = (unsigned long)(void *)p->insn; + p->fixup = (unsigned long)(void *)p->fixup; + } + sort_ex_table(__start___ex_table, __stop___ex_table); +} /* Simple binary search */ const struct exception_table_entry * @@ -15,13 +61,15 @@ search_extable(const struct exception_ta { while (first <= last) { const struct exception_table_entry *mid; - long diff; mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) + /* + * careful, the distance between entries can be + * larger than 2GB: + */ + if (mid->insn == value) return mid; - else if (diff < 0) + else if (mid->insn < value) first = mid+1; else last = mid-1; diff -puN arch/i386/mm/fault.c~4g-2.6.0-test2-mm2-A5 arch/i386/mm/fault.c --- 25/arch/i386/mm/fault.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/mm/fault.c 2003-08-07 15:24:50.000000000 -0700 @@ -79,6 +79,14 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); +#ifdef CONFIG_X86_SWITCH_PAGETABLES + if (!user_mode(regs)) { + console_verbose(); + printk("invalid kernel-mode pagefault %ld! 
[addr:%08lx, eip:%08lx]\n", error_code, address, regs->eip); + show_regs(regs); + BUG(); + } +#endif /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) diff -puN arch/i386/mm/init.c~4g-2.6.0-test2-mm2-A5 arch/i386/mm/init.c --- 25/arch/i386/mm/init.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/mm/init.c 2003-08-07 15:24:50.000000000 -0700 @@ -38,6 +38,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -83,80 +84,6 @@ static pte_t * __init one_page_table_ini return pte_offset_kernel(pmd, 0); } -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. - */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - - pmd = pmd_offset(pgd, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. 
- */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) - continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - /* Map with big pages if possible, otherwise create normal page tables. */ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - } - } -} - static inline int page_kills_ppro(unsigned long pagenr) { if (pagenr >= 0x70000 && pagenr <= 0x7003F) @@ -186,38 +113,14 @@ static inline int page_is_ram(unsigned l return 0; } -#ifdef CONFIG_HIGHMEM pte_t *kmap_pte; -pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + kmap_pte = kmap_get_fixmap_pte(__fix_to_virt(FIX_KMAP_BEGIN)); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -232,6 +135,8 @@ void __init one_highpage_init(struct pag SetPageReserved(page); } +#ifdef CONFIG_HIGHMEM + 
#ifndef CONFIG_DISCONTIGMEM void __init set_highmem_pages_init(int bad_ppro) { @@ -243,12 +148,9 @@ void __init set_highmem_pages_init(int b #else extern void set_highmem_pages_init(int); #endif /* !CONFIG_DISCONTIGMEM */ - #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* CONFIG_HIGHMEM */ +# define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif unsigned long __PAGE_KERNEL = _PAGE_KERNEL; @@ -258,30 +160,125 @@ unsigned long __PAGE_KERNEL = _PAGE_KERN extern void __init remap_numa_kva(void); #endif -static void __init pagetable_init (void) +static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(address); + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); + } +} + +static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; + for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE) + prepare_pagetables(pgd_base, vaddr); +} + +void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + int i, j, k; + pmd_t *pmd; + pte_t *pte, *pte_base; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + if (cpu_has_pse) { + unsigned long __pe; + + set_in_cr4(X86_CR4_PSE); + boot_cpu_data.wp_works_ok = 1; + __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start; + /* Make it "global" too if supported */ + if (cpu_has_pge) { + 
set_in_cr4(X86_CR4_PGE); +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) + __pe += _PAGE_GLOBAL; + __PAGE_KERNEL |= _PAGE_GLOBAL; +#endif + } + set_pmd(pmd, __pmd(__pe)); + continue; + } + if (!pmd_present(*pmd)) + pte_base = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + else + pte_base = (pte_t *) page_address(pmd_page(*pmd)); + pte = pte_base; + for (k = 0; k < PTRS_PER_PTE; pte++, k++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL); + } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + } + } +} + +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base; #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } + /* + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. + */ + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; + pgd_base = swapper_pg_dir; +#ifdef CONFIG_X86_PAE + /* + * It causes too many problems if there's no proper pmd set up + * for all 4 entries of the PGD - so we allocate all of them. + * PAE systems will not miss this extra 4-8K anyway ... + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1)); } +#endif + /* + * Set up lowmem-sized identity mappings at PAGE_OFFSET: + */ + setup_identity_mappings(pgd_base, PAGE_OFFSET, end); - kernel_physical_mapping_init(pgd_base); + /* + * Add flat-mode identity-mappings - SMP needs it when + * starting up on an AP from real-mode. 
(In the non-PAE + * case we already have these mappings through head.S.) + * All user-space mappings are explicitly cleared after + * SMP startup. + */ +#if CONFIG_SMP && CONFIG_X86_PAE + setup_identity_mappings(pgd_base, 0, 16*1024*1024); +#endif remap_numa_kva(); /* @@ -289,38 +286,64 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + fixrange_init(vaddr, 0, pgd_base); - permanent_kmaps_init(pgd_base); +#if CONFIG_HIGHMEM + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } #endif } -void zap_low_mappings (void) +/* + * Clear kernel pagetables in a PMD_SIZE-aligned range. + */ +static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) { - int i; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + pmd_clear(pmd); + } + } + flush_tlb_all(); +} + +void __init zap_low_mappings(void) +{ + printk("zapping low mappings.\n"); /* * Zap initial low-memory mappings. 
- * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); + clear_mappings(swapper_pg_dir, 0, 16*1024*1024); } #ifndef CONFIG_DISCONTIGMEM @@ -348,6 +371,21 @@ void __init zone_sizes_init(void) extern void zone_sizes_init(void); #endif /* !CONFIG_DISCONTIGMEM */ +void __init touch_all_pages(void) +{ + int i; + char *page; + volatile char data; + + printk("touching all pages ...\n"); + for (i = 0; i < max_low_pfn; i++) { + page = (char *)PAGE_OFFSET + i * PAGE_SIZE; + data = *page; + } + + printk("done.\n"); +} + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. @@ -373,6 +411,7 @@ void __init paging_init(void) kmap_init(); zone_sizes_init(); + touch_all_pages(); } /* @@ -424,6 +463,7 @@ extern void set_max_mapnr_init(void); #endif /* !CONFIG_DISCONTIGMEM */ static struct kcore_list kcore_mem, kcore_vmalloc; +extern void fixup_sort_exception_table(void); void __init mem_init(void) { @@ -432,6 +472,8 @@ void __init mem_init(void) int tmp; int bad_ppro; + fixup_sort_exception_table(); + #ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); @@ -507,6 +549,9 @@ void __init mem_init(void) #ifndef CONFIG_SMP zap_low_mappings(); #endif + entry_trampoline_setup(); + default_ldt_page = virt_to_page(default_ldt); + load_LDT(&init_mm.context); } kmem_cache_t *pgd_cache; @@ -529,7 +574,7 @@ void __init pgtable_cache_init(void) 0, SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pgd_ctor, - PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); + pgd_dtor); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff -puN arch/i386/mm/pgtable.c~4g-2.6.0-test2-mm2-A5 arch/i386/mm/pgtable.c --- 25/arch/i386/mm/pgtable.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/arch/i386/mm/pgtable.c 2003-08-07 15:24:50.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include void show_mem(void) { @@ -158,10 +159,8 @@ void pmd_ctor(void *pmd, kmem_cache_t *c } /* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking + * List of all pgd's needed so it can invalidate entries in both cached + * and uncached pgd's. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. 
@@ -174,23 +173,14 @@ void pmd_ctor(void *pmd, kmem_cache_t *c spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; LIST_HEAD(pgd_list); -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_ctor(void *__pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; + pgd_t *pgd0 = __pgd; - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); - - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - - if (PTRS_PER_PMD > 1) - return; - - list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_lock_irqsave(&pgd_lock, flags); + list_add(&virt_to_page(pgd0)->lru, &pgd_list); spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } /* never called when PTRS_PER_PMD > 1 */ @@ -203,22 +193,42 @@ void pgd_dtor(void *pgd, kmem_cache_t *c spin_unlock_irqrestore(&pgd_lock, flags); } +#ifdef CONFIG_X86_PAE + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + if (pgd) { +#ifdef CONFIG_X86_4G_VM_LAYOUT + pmd_t *pmd0, *kernel_pmd0; +#endif + pmd_t *pmd; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); + } - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); +#ifdef CONFIG_X86_4G_VM_LAYOUT + /* + * In the 4G userspace case alias the top 16 MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). 
+ */ + pmd0 = pmd; + kernel_pmd0 = (pmd_t *)__va(pgd_val(swapper_pg_dir[PTRS_PER_PGD-1]) & PAGE_MASK); + memcpy(pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, kernel_pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, sizeof(pmd_t) * NR_SHARED_PMDS); +#else + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif } return pgd; - out_oom: for (i--; i >= 0; i--) kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); @@ -226,6 +236,35 @@ out_oom: return NULL; } +#else /* ! PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (pgd) { +#ifdef CONFIG_X86_4G_VM_LAYOUT + memset(pgd, 0, PTRS_PER_PGD * sizeof(pgd_t)); + /* + * In the 4G userspace case alias the top 16 MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). + */ + memcpy(pgd + PTRS_PER_PGD-NR_SHARED_PMDS, + swapper_pg_dir + PTRS_PER_PGD-NR_SHARED_PMDS, + NR_SHARED_PMDS * sizeof(pgd_t)); +#else + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif + } + return pgd; +} + +#endif /* CONFIG_X86_PAE */ + void pgd_free(pgd_t *pgd) { int i; @@ -237,3 +276,4 @@ void pgd_free(pgd_t *pgd) /* in the non-PAE case, clear_page_tables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } + diff -puN -L arch/i386/vmlinux.lds.S /dev/null /dev/null diff -puN drivers/block/scsi_ioctl.c~4g-2.6.0-test2-mm2-A5 drivers/block/scsi_ioctl.c --- 25/drivers/block/scsi_ioctl.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/drivers/block/scsi_ioctl.c 2003-08-07 15:24:50.000000000 -0700 @@ -319,7 +319,7 @@ static int sg_scsi_ioctl(request_queue_t return -EFAULT; if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) return -EINVAL; - if (get_user(opcode, sic->data)) + if (get_user(opcode, (int 
*)sic->data)) return -EFAULT; bytes = max(in_len, out_len); diff -puN /dev/null include/asm-i386/atomic_kmap.h --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/include/asm-i386/atomic_kmap.h 2003-08-07 15:24:50.000000000 -0700 @@ -0,0 +1,95 @@ +/* + * atomic_kmap.h: temporary virtual kernel memory mappings + * + * Copyright (C) 2003 Ingo Molnar + */ + +#ifndef _ASM_ATOMIC_KMAP_H +#define _ASM_ATOMIC_KMAP_H + +#ifdef __KERNEL__ + +#include +#include + +#ifdef CONFIG_DEBUG_HIGHMEM +#define HIGHMEM_DEBUG 1 +#else +#define HIGHMEM_DEBUG 0 +#endif + +extern pte_t *kmap_pte; +#define kmap_prot PAGE_KERNEL + +#define PKMAP_BASE (0xfe000000UL) +#define NR_SHARED_PMDS ((0xffffffff-PKMAP_BASE+1)/PMD_SIZE) + +static inline unsigned long __kmap_atomic_vaddr(enum km_type type) +{ + enum fixed_addresses idx; + + idx = type + KM_TYPE_NR*smp_processor_id(); + return __fix_to_virt(FIX_KMAP_BEGIN + idx); +} + +static inline void *__kmap_atomic_noflush(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + /* + * NOTE: entries that rely on some secondary TLB-flush + * effect must not be global: + */ + set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + + return (void*) vaddr; +} + +static inline void *__kmap_atomic(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#if HIGHMEM_DEBUG + BUG_ON(!pte_none(*(kmap_pte-idx))); +#else + /* + * Performance optimization - do not flush if the new + * pte is the same as the old one: + */ + if (pte_val(*(kmap_pte-idx)) == pte_val(mk_pte(page, kmap_prot))) + return (void *) vaddr; +#endif + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +static inline void __kunmap_atomic(void *kvaddr, enum km_type type) +{ +#if 
HIGHMEM_DEBUG + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte-idx); + __flush_tlb_one(vaddr); +#endif +} + +#define __kunmap_atomic_type(type) \ + __kunmap_atomic((void *)__kmap_atomic_vaddr(type), (type)) + +#endif /* __KERNEL__ */ + +#endif /* _ASM_ATOMIC_KMAP_H */ diff -puN include/asm-i386/checksum.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/checksum.h --- 25/include/asm-i386/checksum.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/checksum.h 2003-08-07 15:24:50.000000000 -0700 @@ -25,7 +25,7 @@ asmlinkage unsigned int csum_partial(con * better 64-bit) boundary */ -asmlinkage unsigned int csum_partial_copy_generic( const char *src, char *dst, int len, int sum, +asmlinkage unsigned int direct_csum_partial_copy_generic( const char *src, char *dst, int len, int sum, int *src_err_ptr, int *dst_err_ptr); /* @@ -39,14 +39,19 @@ static __inline__ unsigned int csum_partial_copy_nocheck ( const char *src, char *dst, int len, int sum) { - return csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); + /* + * The direct function is OK for kernel-space => kernel-space copies: + */ + return direct_csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); } static __inline__ unsigned int csum_partial_copy_from_user ( const char *src, char *dst, int len, int sum, int *err_ptr) { - return csum_partial_copy_generic ( src, dst, len, sum, err_ptr, NULL); + if (copy_from_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(dst, len, sum); } /* @@ -171,11 +176,26 @@ static __inline__ unsigned short int csu * Copy and checksum to user */ #define HAVE_CSUM_COPY_USER -static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, +static __inline__ 
unsigned int direct_csum_and_copy_to_user(const char *src, char *dst, int len, int sum, int *err_ptr) { if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, dst, len, sum, NULL, err_ptr); + return direct_csum_partial_copy_generic(src, dst, len, sum, NULL, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return -1; /* invalid checksum */ +} + +static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, + int len, int sum, int *err_ptr) +{ + if (access_ok(VERIFY_WRITE, dst, len)) { + if (copy_to_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(src, len, sum); + } if (len) *err_ptr = -EFAULT; diff -puN include/asm-i386/desc.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/desc.h --- 25/include/asm-i386/desc.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/desc.h 2003-08-07 15:24:50.000000000 -0700 @@ -21,6 +21,13 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern void trap_init_virtual_IDT(void); +extern void trap_init_virtual_GDT(void); + +asmlinkage int system_call(void); +asmlinkage void lcall7(void); +asmlinkage void lcall27(void); + #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -30,6 +37,7 @@ extern struct Xgt_desc_struct idt_descr, */ extern struct desc_struct default_ldt[]; extern void set_intr_gate(unsigned int irq, void * addr); +extern void set_trap_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ @@ -90,31 +98,8 @@ static inline void load_TLS(struct threa #undef C } -static inline void clear_LDT(void) -{ - int cpu = get_cpu(); - - set_ldt_desc(cpu, &default_ldt[0], 5); - load_LDT_desc(); - put_cpu(); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc, int cpu) 
-{ - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) { - segments = &default_ldt[0]; - count = 5; - } - - set_ldt_desc(cpu, segments, count); - load_LDT_desc(); -} +extern struct page *default_ldt_page; +extern void load_LDT_nolock(mm_context_t *pc, int cpu); static inline void load_LDT(mm_context_t *pc) { @@ -123,6 +108,6 @@ static inline void load_LDT(mm_context_t put_cpu(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLY__ */ #endif diff -puN include/asm-i386/fixmap.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/fixmap.h --- 25/include/asm-i386/fixmap.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/fixmap.h 2003-08-07 15:24:50.000000000 -0700 @@ -18,17 +18,15 @@ #include #include #include -#ifdef CONFIG_HIGHMEM #include #include -#endif /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. + * in the boot process. We allocate these special addresses + * from the end of virtual memory (0xffffe000) backwards. * Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. @@ -41,11 +39,20 @@ * TLB entries of such buffers will not be flushed across * task switches. */ + +/* + * on UP currently we will have no trace of the fixmap mechanizm, + * no page table allocations, etc. This might change in the + * future, say framebuffers for the console driver(s) could be + * fix-mapped? 
+ */ enum fixed_addresses { FIX_HOLE, FIX_VSYSCALL, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#else + FIX_VSTACK_HOLE_1, #endif #ifdef CONFIG_X86_IO_APIC FIX_IO_APIC_BASE_0, @@ -57,16 +64,21 @@ enum fixed_addresses { FIX_LI_PCIA, /* Lithium PCI Bridge A */ FIX_LI_PCIB, /* Lithium PCI Bridge B */ #endif -#ifdef CONFIG_X86_F00F_BUG - FIX_F00F_IDT, /* Virtual mapping for IDT */ -#endif -#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_IDT, + FIX_GDT_1, + FIX_GDT_0, + FIX_TSS_3, + FIX_TSS_2, + FIX_TSS_1, + FIX_TSS_0, + FIX_ENTRY_TRAMPOLINE_1, + FIX_ENTRY_TRAMPOLINE_0, +#ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ + FIX_VSTACK_HOLE_2, #endif -#ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_ACPI_BOOT FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -95,12 +107,15 @@ extern void __set_fixmap (enum fixed_add __set_fixmap(idx, 0, __pgprot(0)) /* - * used by vmalloc.c. + * used by vmalloc.c and various other places. * * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. + * + * IMPORTANT: dont change FIXADDR_TOP without adjusting KM_VSTACK0 + * and KM_VSTACK1 so that the virtual stack is 8K aligned. 
*/ -#define FIXADDR_TOP (0xfffff000UL) +#define FIXADDR_TOP (0xffffe000UL) #define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) diff -puN include/asm-i386/highmem.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/highmem.h --- 25/include/asm-i386/highmem.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/highmem.h 2003-08-07 15:24:50.000000000 -0700 @@ -25,26 +25,19 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; - -extern void kmap_init(void); +extern void kmap_init(void) __init; /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. */ -#if NR_CPUS <= 32 -#define PKMAP_BASE (0xff800000UL) -#else -#define PKMAP_BASE (0xff600000UL) -#endif #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else diff -puN include/asm-i386/kmap_types.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/kmap_types.h --- 25/include/asm-i386/kmap_types.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/kmap_types.h 2003-08-07 15:24:50.000000000 -0700 @@ -3,30 +3,32 @@ #include -#ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) -#endif - enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR -}; - -#undef D + /* + * IMPORTANT: dont move these 3 entries, the virtual stack + * must be 8K aligned. 
+ */ + KM_BOUNCE_READ, + KM_VSTACK1, + KM_VSTACK0, + KM_LDT_PAGE15, + KM_LDT_PAGE0 = KM_LDT_PAGE15 + 16-1, + KM_USER_COPY, + KM_VSTACK_HOLE, + KM_SKB_SUNRPC_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, + KM_PTE0, + KM_PTE1, + KM_PTE2, + KM_IRQ0, + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, + KM_TYPE_NR +}; #endif diff -puN include/asm-i386/mmu_context.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/mmu_context.h --- 25/include/asm-i386/mmu_context.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/mmu_context.h 2003-08-07 15:24:50.000000000 -0700 @@ -29,6 +29,10 @@ static inline void switch_mm(struct mm_s { int cpu = smp_processor_id(); +#ifdef CONFIG_X86_SWITCH_PAGETABLES + if (tsk->mm) + tsk->thread_info->user_pgd = (void *)__pa(tsk->mm->pgd); +#endif if (likely(prev != next)) { /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); @@ -39,12 +43,14 @@ static inline void switch_mm(struct mm_s cpu_set(cpu, next->cpu_vm_mask); /* Re-load page tables */ +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) load_cr3(next->pgd); +#endif /* * load the LDT, if the LDT is different: */ - if (unlikely(prev->context.ldt != next->context.ldt)) + if (unlikely(prev->context.size + next->context.size)) load_LDT_nolock(&next->context, cpu); } #ifdef CONFIG_SMP @@ -56,7 +62,9 @@ static inline void switch_mm(struct mm_s /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload %cr3. 
*/ +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) load_cr3(next->pgd); +#endif load_LDT_nolock(&next->context, cpu); } } @@ -67,6 +75,6 @@ static inline void switch_mm(struct mm_s asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) #define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL) + switch_mm((prev),(next),current) #endif diff -puN include/asm-i386/mmu.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/mmu.h --- 25/include/asm-i386/mmu.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/mmu.h 2003-08-07 15:24:50.000000000 -0700 @@ -8,10 +8,13 @@ * * cpu_vm_mask is used to optimize ldt flushing. */ + +#define MAX_LDT_PAGES 16 + typedef struct { int size; struct semaphore sem; - void *ldt; + struct page *ldt_pages[MAX_LDT_PAGES]; } mm_context_t; #endif diff -puN include/asm-i386/page.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/page.h --- 25/include/asm-i386/page.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/page.h 2003-08-07 15:24:50.000000000 -0700 @@ -1,6 +1,8 @@ #ifndef _I386_PAGE_H #define _I386_PAGE_H +#include + /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 #define PAGE_SIZE (1UL << PAGE_SHIFT) @@ -9,11 +11,21 @@ #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +/* + * these will go away - right now they keep all the + * 4G/4G sub-features split up, for debugging: + */ +#ifdef CONFIG_X86_4G +#define CONFIG_X86_SWITCH_PAGETABLES y +#define CONFIG_X86_4G_VM_LAYOUT y +#define CONFIG_X86_UACCESS_INDIRECT y +#define CONFIG_X86_HIGH_ENTRY y +#endif #include +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_USE_3DNOW #include @@ -88,8 +100,19 @@ typedef struct { unsigned long pgprot; } * * If you want more physical memory than this then see the CONFIG_HIGHMEM4G * and CONFIG_HIGHMEM64G options in the kernel configuration. 
+ * + * Note: on PAE the kernel must never go below 32 MB, we use the + * first 8 entries of the 2-level boot pgd for PAE magic. */ +#ifdef CONFIG_X86_4G_VM_LAYOUT +#define __PAGE_OFFSET (0x02000000) +#define TASK_SIZE (0xff000000) +#else +#define __PAGE_OFFSET (0xc0000000) +#define TASK_SIZE (0xc0000000) +#endif + /* * This much address space is reserved for vmalloc() and iomap() * as well as fixmap mappings. @@ -114,16 +137,10 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ -#ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) -#else -#define __PAGE_OFFSET (0xC0000000UL) -#endif - - #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) diff -puN include/asm-i386/pgtable.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/pgtable.h --- 25/include/asm-i386/pgtable.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/pgtable.h 2003-08-07 15:24:50.000000000 -0700 @@ -42,6 +42,7 @@ void pgd_ctor(void *, kmem_cache_t *, un void pgd_dtor(void *, kmem_cache_t *, unsigned long); void pgtable_cache_init(void); void paging_init(void); +void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end); #endif /* !__ASSEMBLY__ */ @@ -51,6 +52,11 @@ void paging_init(void); * newer 3-level PAE-mode page tables. 
*/ #ifndef __ASSEMBLY__ + +extern void set_system_gate(unsigned int n, void *addr); +extern void init_entry_mappings(void); +extern void entry_trampoline_setup(void); + #ifdef CONFIG_X86_PAE # include #else @@ -63,7 +69,12 @@ void paging_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#if defined(CONFIG_X86_PAE) && defined(CONFIG_X86_4G_VM_LAYOUT) +# define USER_PTRS_PER_PGD 4 +#else +# define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) + ((TASK_SIZE % PGDIR_SIZE) + PGDIR_SIZE-1)/PGDIR_SIZE) +#endif + #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) @@ -234,6 +245,7 @@ static inline void ptep_mkdirty(pte_t *p #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) +#define mk_pte_phys(physpage, pgprot) pfn_pte((physpage) >> PAGE_SHIFT, pgprot) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff -puN include/asm-i386/processor.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/processor.h --- 25/include/asm-i386/processor.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/processor.h 2003-08-07 15:24:50.000000000 -0700 @@ -280,11 +280,6 @@ extern unsigned int machine_submodel_id; extern unsigned int BIOS_revision; extern unsigned int mca_pentium_flag; -/* - * User space process size: 3GB (default). - */ -#define TASK_SIZE (PAGE_OFFSET) - /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ @@ -388,6 +383,7 @@ struct tss_struct { struct thread_struct { /* cached TLS descriptors. 
*/ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + void *stack_page0, *stack_page1; unsigned long esp0; unsigned long eip; unsigned long esp; @@ -467,6 +463,13 @@ extern void prepare_to_copy(struct task_ */ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); +#ifdef CONFIG_X86_HIGH_ENTRY +#define virtual_esp0(tsk) \ + ((unsigned long)(tsk)->thread_info->virtual_stack + ((tsk)->thread.esp0 - (unsigned long)(tsk)->thread_info->real_stack)) +#else +# define virtual_esp0(tsk) ((tsk)->thread.esp0) +#endif + extern unsigned long thread_saved_pc(struct task_struct *tsk); void show_trace(struct task_struct *task, unsigned long *stack); diff -puN include/asm-i386/string.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/string.h --- 25/include/asm-i386/string.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/string.h 2003-08-07 15:24:50.000000000 -0700 @@ -56,6 +56,29 @@ __asm__ __volatile__( return dest; } +/* + * This is a more generic variant of strncpy_count() suitable for + * implementing string-access routines with all sorts of return + * code semantics. It's used by mm/usercopy.c. 
+ */ +static inline size_t strncpy_count(char * dest,const char *src,size_t count) +{ + __asm__ __volatile__( + + "1:\tdecl %0\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "2:" + "incl %0" + : "=c" (count) + :"S" (src),"D" (dest),"0" (count) : "memory"); + + return count; +} + #define __HAVE_ARCH_STRCAT static inline char * strcat(char * dest,const char * src) { diff -puN include/asm-i386/thread_info.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/thread_info.h --- 25/include/asm-i386/thread_info.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/thread_info.h 2003-08-07 15:24:50.000000000 -0700 @@ -33,23 +33,12 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; + void *real_stack, *virtual_stack, *user_pgd; + struct restart_block restart_block; __u8 supervisor_stack[0]; }; -#else /* !__ASSEMBLY__ */ - -/* offsets into the thread_info struct for assembly code access */ -#define TI_TASK 0x00000000 -#define TI_EXEC_DOMAIN 0x00000004 -#define TI_FLAGS 0x00000008 -#define TI_STATUS 0x0000000C -#define TI_CPU 0x00000010 -#define TI_PRE_COUNT 0x00000014 -#define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C - #endif #define PREEMPT_ACTIVE 0x4000000 @@ -61,7 +50,7 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ +#define INIT_THREAD_INFO(tsk, thread_info) \ { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ @@ -72,6 +61,7 @@ struct thread_info { .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ + .real_stack = &thread_info, \ } #define init_thread_info (init_thread_union.thread_info) @@ -113,6 +103,7 @@ static inline struct thread_info *curren #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ +#define TIF_DB7 6 /* has debug registers */ #define 
TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define _TIF_SYSCALL_TRACE (1<active_mm) __flush_tlb(); +#endif } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); +#endif } static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (vma->vm_mm == current->active_mm) __flush_tlb(); +#endif } #else @@ -111,11 +117,10 @@ static inline void flush_tlb_range(struc __flush_tlb() extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_mm(struct mm_struct *); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); -#define flush_tlb() flush_tlb_current_task() +#define flush_tlb() flush_tlb_all() static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) { diff -puN include/asm-i386/uaccess.h~4g-2.6.0-test2-mm2-A5 include/asm-i386/uaccess.h --- 25/include/asm-i386/uaccess.h~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/include/asm-i386/uaccess.h 2003-08-07 15:26:00.000000000 -0700 @@ -26,7 +26,7 @@ #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFFUL) -#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) +#define USER_DS MAKE_MM_SEG(TASK_SIZE) #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) @@ -149,6 +149,45 @@ extern void __get_user_4(void); :"=a" (ret),"=d" (x) \ :"0" (ptr)) +extern int get_user_size(unsigned int size, void *val, const void *ptr); +extern int put_user_size(unsigned int size, const void *val, void *ptr); +extern int zero_user_size(unsigned int size, void *ptr); +extern int copy_str_fromuser_size(unsigned int size, void *val, const void *ptr); +extern int strlen_fromuser_size(unsigned int size, const void *ptr); + + +# define indirect_get_user(x,ptr) \ +({ int __ret_gu,__val_gu; \ 
+ __typeof__(ptr) __ptr_gu = (ptr); \ + __ret_gu = get_user_size(sizeof(*__ptr_gu), &__val_gu,__ptr_gu) ? -EFAULT : 0;\ + (x) = (__typeof__(*__ptr_gu))__val_gu; \ + __ret_gu; \ +}) +#define indirect_put_user(x,ptr) \ +({ \ + __typeof__(*(ptr)) *__ptr_pu = (ptr), __x_pu = (x); \ + put_user_size(sizeof(*__ptr_pu), &__x_pu, __ptr_pu) ? -EFAULT : 0; \ +}) +#define __indirect_put_user indirect_put_user +#define __indirect_get_user indirect_get_user + +#define indirect_copy_from_user(to,from,n) get_user_size(n,to,from) +#define indirect_copy_to_user(to,from,n) put_user_size(n,from,to) + +#define __indirect_copy_from_user indirect_copy_from_user +#define __indirect_copy_to_user indirect_copy_to_user + +#define indirect_strncpy_from_user(dst, src, count) \ + copy_str_fromuser_size(count, dst, src) + +extern int strlen_fromuser_size(unsigned int size, const void *ptr); +#define indirect_strnlen_user(str, n) strlen_fromuser_size(n, str) +#define indirect_strlen_user(str) indirect_strnlen_user(str, ~0UL >> 1) + +extern int zero_user_size(unsigned int size, void *ptr); + +#define indirect_clear_user(mem, len) zero_user_size(len, mem) +#define __indirect_clear_user clear_user /* Careful: we have to cast the result to the type of the pointer for sign reasons */ /** @@ -168,7 +207,7 @@ extern void __get_user_4(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x,ptr) \ +#define direct_get_user(x,ptr) \ ({ int __ret_gu,__val_gu; \ switch(sizeof (*(ptr))) { \ case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \ @@ -198,7 +237,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. */ -#define put_user(x,ptr) \ +#define direct_put_user(x,ptr) \ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) @@ -222,7 +261,7 @@ extern void __put_user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. 
*/ -#define __get_user(x,ptr) \ +#define __direct_get_user(x,ptr) \ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) @@ -245,7 +284,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. */ -#define __put_user(x,ptr) \ +#define __direct_put_user(x,ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) #define __put_user_nocheck(x,ptr,size) \ @@ -396,7 +435,7 @@ unsigned long __copy_from_user_ll(void * * On success, this will be zero. */ static inline unsigned long -__copy_to_user(void __user *to, const void *from, unsigned long n) +__direct_copy_to_user(void __user *to, const void *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; @@ -434,7 +473,7 @@ __copy_to_user(void __user *to, const vo * data to the requested size using zero bytes. */ static inline unsigned long -__copy_from_user(void *to, const void __user *from, unsigned long n) +__direct_copy_from_user(void *to, const void __user *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; @@ -468,11 +507,11 @@ __copy_from_user(void *to, const void __ * On success, this will be zero. */ static inline unsigned long -copy_to_user(void __user *to, const void *from, unsigned long n) +direct_copy_to_user(void __user *to, const void *from, unsigned long n) { might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + n = __direct_copy_to_user(to, from, n); return n; } @@ -493,11 +532,11 @@ copy_to_user(void __user *to, const void * data to the requested size using zero bytes. 
*/ static inline unsigned long -copy_from_user(void *to, const void __user *from, unsigned long n) +direct_copy_from_user(void *to, const void __user *from, unsigned long n) { might_sleep(); if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); + n = __direct_copy_from_user(to, from, n); else memset(to, 0, n); return n; @@ -520,10 +559,68 @@ long __strncpy_from_user(char *dst, cons * If there is a limit on the length of a valid string, you may wish to * consider using strnlen_user() instead. */ -#define strlen_user(str) strnlen_user(str, ~0UL >> 1) -long strnlen_user(const char __user *str, long n); -unsigned long clear_user(void __user *mem, unsigned long len); -unsigned long __clear_user(void __user *mem, unsigned long len); +long direct_strncpy_from_user(char *dst, const char *src, long count); +long __direct_strncpy_from_user(char *dst, const char *src, long count); +#define direct_strlen_user(str) direct_strnlen_user(str, ~0UL >> 1) +long direct_strnlen_user(const char *str, long n); +unsigned long direct_clear_user(void *mem, unsigned long len); +unsigned long __direct_clear_user(void *mem, unsigned long len); + +extern int indirect_uaccess; + +#ifdef CONFIG_X86_UACCESS_INDIRECT + +/* + * Return code and zeroing semantics: + + __clear_user 0 <-> bytes not done + clear_user 0 <-> bytes not done + __copy_to_user 0 <-> bytes not done + copy_to_user 0 <-> bytes not done + __copy_from_user 0 <-> bytes not done, zero rest + copy_from_user 0 <-> bytes not done, zero rest + __get_user 0 <-> -EFAULT + get_user 0 <-> -EFAULT + __put_user 0 <-> -EFAULT + put_user 0 <-> -EFAULT + strlen_user strlen + 1 <-> 0 + strnlen_user strlen + 1 (or n+1) <-> 0 + strncpy_from_user strlen (or n) <-> -EFAULT + + */ + +#define __clear_user(mem,len) __indirect_clear_user(mem,len) +#define clear_user(mem,len) indirect_clear_user(mem,len) +#define __copy_to_user(to,from,n) __indirect_copy_to_user(to,from,n) +#define copy_to_user(to,from,n) 
indirect_copy_to_user(to,from,n) +#define __copy_from_user(to,from,n) __indirect_copy_from_user(to,from,n) +#define copy_from_user(to,from,n) indirect_copy_from_user(to,from,n) +#define __get_user(val,ptr) __indirect_get_user(val,ptr) +#define get_user(val,ptr) indirect_get_user(val,ptr) +#define __put_user(val,ptr) __indirect_put_user(val,ptr) +#define put_user(val,ptr) indirect_put_user(val,ptr) +#define strlen_user(str) indirect_strlen_user(str) +#define strnlen_user(src,count) indirect_strnlen_user(src,count) +#define strncpy_from_user(dst,src,count) \ + indirect_strncpy_from_user(dst,src,count) + +#else + +#define __clear_user __direct_clear_user +#define clear_user direct_clear_user +#define __copy_to_user __direct_copy_to_user +#define copy_to_user direct_copy_to_user +#define __copy_from_user __direct_copy_from_user +#define copy_from_user direct_copy_from_user +#define __get_user __direct_get_user +#define get_user direct_get_user +#define __put_user __direct_put_user +#define put_user direct_put_user +#define strlen_user direct_strlen_user +#define strnlen_user direct_strnlen_user +#define strncpy_from_user direct_strncpy_from_user + +#endif /* CONFIG_X86_UACCESS_INDIRECT */ #endif /* __i386_UACCESS_H */ diff -puN kernel/futex.c~4g-2.6.0-test2-mm2-A5 kernel/futex.c --- 25/kernel/futex.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/kernel/futex.c 2003-08-07 15:24:50.000000000 -0700 @@ -34,6 +34,7 @@ #include #include #include +#include #define FUTEX_HASHBITS 8 @@ -315,6 +316,7 @@ static inline int futex_wait(unsigned lo int ret = 0, curval; struct page *page; struct futex_q q; + void *kaddr; init_waitqueue_head(&q.waiters); @@ -331,11 +333,12 @@ static inline int futex_wait(unsigned lo * Page is pinned, but may no longer be in this address space. * It cannot schedule, so we access it with the spinlock held. 
*/ - if (get_user(curval, (int *)uaddr) != 0) { - unlock_futex_mm(); - ret = -EFAULT; - goto out; - } + if (!access_ok(VERIFY_READ, uaddr, 4)) + goto out_fault; + kaddr = kmap_atomic(page, KM_USER0); + curval = *(int*)(kaddr + offset); + kunmap_atomic(kaddr, KM_USER0); + if (curval != val) { unlock_futex_mm(); ret = -EWOULDBLOCK; @@ -372,6 +375,11 @@ out: put_page(q.page); return ret; + +out_fault: + unlock_futex_mm(); + ret = -EFAULT; + goto out; } static int futex_close(struct inode *inode, struct file *filp) diff -puN kernel/ksyms.c~4g-2.6.0-test2-mm2-A5 kernel/ksyms.c --- 25/kernel/ksyms.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/kernel/ksyms.c 2003-08-07 15:24:50.000000000 -0700 @@ -131,7 +131,6 @@ EXPORT_SYMBOL(blk_congestion_wait); EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); -EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #endif #ifdef HASHED_PAGE_VIRTUAL diff -puN Makefile~4g-2.6.0-test2-mm2-A5 Makefile diff -puN mm/Makefile~4g-2.6.0-test2-mm2-A5 mm/Makefile --- 25/mm/Makefile~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/mm/Makefile 2003-08-07 15:24:50.000000000 -0700 @@ -12,3 +12,6 @@ obj-y := bootmem.o filemap.o mempool.o slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_X86_4G) += usercopy.o + diff -puN mm/memory.c~4g-2.6.0-test2-mm2-A5 mm/memory.c --- 25/mm/memory.c~4g-2.6.0-test2-mm2-A5 2003-08-07 15:24:50.000000000 -0700 +++ 25-akpm/mm/memory.c 2003-08-07 15:24:50.000000000 -0700 @@ -100,7 +100,8 @@ static inline void free_one_pmd(struct m pte_free_tlb(tlb, page); } -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) +static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir, + int pgd_idx) { int j; pmd_t * pmd; @@ -114,8 +115,11 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; 
j++) + for (j = 0; j < PTRS_PER_PMD ; j++) { + if (pgd_idx * PGDIR_SIZE + j * PMD_SIZE >= TASK_SIZE) + break; free_one_pmd(tlb, pmd+j); + } pmd_free_tlb(tlb, pmd); } @@ -128,11 +132,13 @@ static inline void free_one_pgd(struct m void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) { pgd_t * page_dir = tlb->mm->pgd; + int pgd_idx = first; page_dir += first; do { - free_one_pgd(tlb, page_dir); + free_one_pgd(tlb, page_dir, pgd_idx); page_dir++; + pgd_idx++; } while (--nr); } @@ -430,7 +436,7 @@ zap_pmd_range(struct mmu_gather *tlb, pg unsigned long address, unsigned long size) { pmd_t * pmd; - unsigned long end; + unsigned long end, pgd_boundary; if (pgd_none(*dir)) return; @@ -441,8 +447,9 @@ zap_pmd_range(struct mmu_gather *tlb, pg } pmd = pmd_offset(dir, address); end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); + pgd_boundary = ((address + PGDIR_SIZE) & PGDIR_MASK); + if (pgd_boundary && (end > pgd_boundary)) + end = pgd_boundary; do { zap_pte_range(tlb, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; diff -puN /dev/null mm/usercopy.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/mm/usercopy.c 2003-08-07 15:24:50.000000000 -0700 @@ -0,0 +1,269 @@ +/* + * linux/mm/usercopy.c + * + * (C) Copyright 2003 Ingo Molnar + * + * Generic implementation of all the user-VM access functions, without + * relying on being able to access the VM directly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Get kernel address of the user page and pin it. + */ +static inline struct page *pin_page(unsigned long addr, int write) +{ + struct mm_struct *mm = current->mm ? : &init_mm; + struct page *page = NULL; + int ret; + + spin_lock(&mm->page_table_lock); + /* + * Do a quick atomic lookup first - this is the fastpath. 
+ */ + page = follow_page(mm, addr, write); + if (likely(page != NULL)) { + if (!PageReserved(page)) + get_page(page); + spin_unlock(&mm->page_table_lock); + return page; + } + + /* + * No luck - bad address or need to fault in the page: + */ + spin_unlock(&mm->page_table_lock); + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, addr, 1, write, 0, &page, NULL); + up_read(&mm->mmap_sem); + if (ret <= 0) + return NULL; + return page; +} + +static inline void unpin_page(struct page *page) +{ + put_page(page); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +static int rw_vm(unsigned long addr, void *buf, int len, int write) +{ + if (!len) + return 0; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + struct page *page = NULL; + int bytes, offset; + void *maddr; + + page = pin_page(addr, write); + if (!page) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(maddr+offset) = *(type *)(buf); break; + + if (write) { + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(maddr + offset, buf, bytes); + } + } else { +#undef HANDLE_TYPE +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(buf) = *(type *)(maddr+offset); break; + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(buf, maddr + offset, bytes); + } +#undef HANDLE_TYPE + } + kunmap_atomic(maddr, KM_USER_COPY); + unpin_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + + return len; +} + +static int str_vm(unsigned long addr, void *buf0, int len, int copy) +{ + struct mm_struct *mm = current->mm ? 
: &init_mm; + struct page *page; + void *buf = buf0; + + if (!len) + return len; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset, left, copied; + char *maddr; + + ret = get_user_pages(current, mm, addr, 1, copy == 2, 0, &page, NULL); + if (ret <= 0) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + if (copy == 2) { + memset(maddr + offset, 0, bytes); + copied = bytes; + left = 0; + } else if (copy == 1) { + left = strncpy_count(buf, maddr + offset, bytes); + copied = bytes - left; + } else { + copied = strnlen(maddr + offset, bytes); + left = bytes - copied; + } + BUG_ON(bytes < 0 || copied < 0); + kunmap_atomic(maddr, KM_USER_COPY); + page_cache_release(page); + len -= copied; + buf += copied; + addr += copied; + if (left) + break; + } + up_read(&mm->mmap_sem); + + return len; +} + +/* + * Copies memory from userspace (ptr) into kernelspace (val). + * + * returns # of bytes not copied. + */ +int get_user_size(unsigned int size, void *val, const void *ptr) +{ + int ret; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(val, ptr, size); + return 0; + } + ret = rw_vm((unsigned long)ptr, val, size, 0); + if (ret) + /* + * Zero the rest: + */ + memset(val + size - ret, 0, ret); + return ret; +} + +/* + * Copies memory from kernelspace (val) into userspace (ptr). + * + * returns # of bytes not copied. 
+ */ +int put_user_size(unsigned int size, const void *val, void *ptr) +{ + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(ptr, val, size); + return 0; + } + return rw_vm((unsigned long)ptr, (void *)val, size, 1); +} + +int copy_str_fromuser_size(unsigned int size, void *val, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + left = strncpy_count(val, ptr, size); + copied = size - left; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, val, size, 1); + if (left < 0) + return left; + copied = size - left; + BUG_ON(copied < 0); + + return copied; +} + +int strlen_fromuser_size(unsigned int size, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + copied = strnlen(ptr, size) + 1; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, NULL, size, 0); + if (left < 0) + return 0; + copied = size - left + 1; + BUG_ON(copied < 0); + + return copied; +} + +int zero_user_size(unsigned int size, void *ptr) +{ + int left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memset(ptr, 0, size); + return 0; + } + left = str_vm((unsigned long)ptr, NULL, size, 2); + if (left < 0) + return size; + return left; +} + +EXPORT_SYMBOL(get_user_size); +EXPORT_SYMBOL(put_user_size); +EXPORT_SYMBOL(zero_user_size); +EXPORT_SYMBOL(copy_str_fromuser_size); +EXPORT_SYMBOL(strlen_fromuser_size); + _