From: "H. Peter Anvin" This patch cleans up the very early memory setup on the i386 platform. In particular, it removes the hard-coded 8 MB limit completely by dynamically creating the early-boot pagetables rather than having them hard coded. While I was at it, I changed head.S so that it always sets up a local GDT; this means among other things that SMP and VISWS are no longer special cases, and is conceptually cleaner to boot. The VISWS people have confirmed it works on VISWS. It also uses a separate entrypoint for non-boot processors since this is completely kernel-internal anyway. This eliminates the need to set %bx on boot. (If you think this is a bad idea I can eliminate this change; it just seemed cleaner to me to do it this way.) Additionally, zero bss with rep;stosl rather that rep;stosb. --- arch/i386/Kconfig | 5 arch/i386/boot/tools/build.c | 6 - arch/i386/defconfig | 1 arch/i386/kernel/Makefile | 3 arch/i386/kernel/asm-offsets.c | 4 arch/i386/kernel/head.S | 212 +++++++++++++++++++++++------------------ arch/i386/kernel/setup.c | 10 + arch/i386/kernel/trampoline.S | 35 +++--- arch/i386/kernel/vmlinux.lds.S | 5 arch/i386/mm/discontig.c | 4 include/asm-i386/pgtable.h | 4 11 files changed, 162 insertions(+), 127 deletions(-) diff -puN arch/i386/boot/tools/build.c~i386-early-memory-cleanup arch/i386/boot/tools/build.c --- 25/arch/i386/boot/tools/build.c~i386-early-memory-cleanup 2004-03-02 18:18:28.000000000 -0800 +++ 25-akpm/arch/i386/boot/tools/build.c 2004-03-02 18:18:28.000000000 -0800 @@ -150,10 +150,8 @@ int main(int argc, char ** argv) sz = sb.st_size; fprintf (stderr, "System is %d kB\n", sz/1024); sys_size = (sz + 15) / 16; - /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ - if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) - die("System is too big. Try using %smodules.", - is_big_kernel ? "" : "bzImage or "); + if (!is_big_kernel && sys_size > DEF_SYSSIZE) + die("System is too big. Try using bzImage or modules."); while (sz > 0) { int l, n; diff -puN arch/i386/defconfig~i386-early-memory-cleanup arch/i386/defconfig --- 25/arch/i386/defconfig~i386-early-memory-cleanup 2004-03-02 18:18:28.000000000 -0800 +++ 25-akpm/arch/i386/defconfig 2004-03-02 18:18:28.000000000 -0800 @@ -1212,5 +1212,4 @@ CONFIG_CRC32=y CONFIG_X86_SMP=y CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y -CONFIG_X86_TRAMPOLINE=y CONFIG_PC=y diff -puN arch/i386/Kconfig~i386-early-memory-cleanup arch/i386/Kconfig --- 25/arch/i386/Kconfig~i386-early-memory-cleanup 2004-03-02 18:18:28.000000000 -0800 +++ 25-akpm/arch/i386/Kconfig 2004-03-02 18:18:28.000000000 -0800 @@ -1535,11 +1535,6 @@ config X86_BIOS_REBOOT depends on !(X86_VISWS || X86_VOYAGER) default y -config X86_TRAMPOLINE - bool - depends on SMP || X86_VISWS - default y - config PC bool depends on X86 && !EMBEDDED diff -puN arch/i386/kernel/asm-offsets.c~i386-early-memory-cleanup arch/i386/kernel/asm-offsets.c --- 25/arch/i386/kernel/asm-offsets.c~i386-early-memory-cleanup 2004-03-02 18:18:28.000000000 -0800 +++ 25-akpm/arch/i386/kernel/asm-offsets.c 2004-03-02 18:18:28.000000000 -0800 @@ -4,9 +4,11 @@ * to extract and format the required data. 
  */
+#include 
 #include 
 #include 
 #include "sigframe.h"
+#include 
 
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -28,4 +30,6 @@ void foo(void)
 
 	DEFINE(RT_SIGFRAME_sigcontext,
 	       offsetof (struct rt_sigframe, uc.uc_mcontext));
+
+	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
 }
diff -puN arch/i386/kernel/head.S~i386-early-memory-cleanup arch/i386/kernel/head.S
--- 25/arch/i386/kernel/head.S~i386-early-memory-cleanup	2004-03-02 18:18:28.000000000 -0800
+++ 25-akpm/arch/i386/kernel/head.S	2004-03-02 18:18:28.000000000 -0800
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #define OLD_CL_MAGIC_ADDR	0x90020
 #define OLD_CL_MAGIC		0xA33F
 
@@ -40,49 +40,89 @@
 #define X86_VENDOR_ID		CPU_PARAMS+36	/* offset dependent on NCAPINTS */
 
 /*
- * Initialize page tables
+ * This is how much memory *in addition to the memory covered up to
+ * and including _end* we need mapped initially.  We need one bit for
+ * each possible page, but only in low memory, which means
+ * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ *
+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
+ * memory, which is currently unreclaimed.
+ *
+ * This should be a multiple of a page.
  */
-#define INIT_PAGE_TABLES \
-	movl $pg0 - __PAGE_OFFSET, %edi; \
-	/* "007" doesn't mean with license to kill, but PRESENT+RW+USER */ \
-	movl $007, %eax; \
-2:	stosl; \
-	add $0x1000, %eax; \
-	cmp $empty_zero_page - __PAGE_OFFSET, %edi; \
-	jne 2b;
+#define INIT_MAP_BEYOND_END	(128*1024)
+
 
 /*
- * swapper_pg_dir is the main page directory, address 0x00101000
- *
- * On entry, %esi points to the real-mode code as a 32-bit pointer.
+ * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
+ * %esi points to the real-mode code as a 32-bit pointer.
+ * CS and DS must be 4 GB flat segments, but we don't depend on
+ * any particular GDT layout, because we load our own as soon as we
+ * can.
  */
 ENTRY(startup_32)
-#ifdef CONFIG_X86_VISWS
 /*
- * On SGI Visual Workstations boot CPU starts in protected mode.
+ * Set segments to known values.
  */
-	orw %bx, %bx
-	jnz 1f
-	INIT_PAGE_TABLES
-	movl $swapper_pg_dir - __PAGE_OFFSET, %eax
-	movl %eax, %cr3
-	lgdt boot_gdt
-1:
-#endif
+	cld
+	lgdt boot_gdt_descr - __PAGE_OFFSET
+	movl $(__BOOT_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
+	movl %eax,%fs
+	movl %eax,%gs
 
 /*
- * Set segments to known values
+ * Initialize page tables.  This creates a PDE and a set of page
+ * tables, which are located immediately beyond _end.  The variable
+ * init_pg_tables_end is set up to point to the first "safe" location.
+ *
+ * Warning: don't use %esi or the stack in this code.  However, %esp
+ * can be used as a GPR if you really need it...
  */
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+	movl $(pg0 - __PAGE_OFFSET), %edi
+	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+	movl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
+10:
+	leal 0x007(%edi),%ecx			/* Create PDE entry */
+	movl %ecx,(%edx)			/* Store identity PDE entry */
+	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
+	addl $4,%edx
+	movl $1024, %ecx
+11:
+	stosl
+	addl $0x1000,%eax
+	loop 11b
+	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
+	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
+	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+	cmpl %ebp,%eax
+	jb 10b
+	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+
+#ifdef CONFIG_SMP
+	xorl %ebx,%ebx				/* This is the boot CPU (BSP) */
+	jmp 3f
+
+/*
+ * Non-boot CPU entry point; entered from trampoline.S
+ * We can't lgdt here, because lgdt itself uses a data segment, but
+ * we know the trampoline has already loaded the boot_gdt_table GDT
+ * for us.
+ */
+ENTRY(startup_32_smp)
 	cld
 	movl $(__BOOT_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
 	movl %eax,%fs
 	movl %eax,%gs
-#ifdef CONFIG_SMP
-	orw %bx,%bx
-	jz 1f
+
+	xorl %ebx,%ebx
+	incl %ebx				/* This is a secondary processor (AP) */
 
 /*
  * New page tables may be in 4Mbyte page mode and may
@@ -99,37 +139,40 @@ ENTRY(startup_32)
  * not yet offset PAGE_OFFSET..
  */
 #define cr4_bits mmu_cr4_features-__PAGE_OFFSET
-	cmpl $0,cr4_bits
-	je 3f
+	movl cr4_bits,%edx
+	andl %edx,%edx
+	jz 3f
 	movl %cr4,%eax		# Turn on paging options (PSE,PAE,..)
-	orl cr4_bits,%eax
+	orl %edx,%eax
 	movl %eax,%cr4
-	jmp 3f
-1:
-#endif
-	INIT_PAGE_TABLES
+
+3:
+#endif /* CONFIG_SMP */
+
 /*
  * Enable paging
  */
-3:
 	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
 	movl %eax,%cr3		/* set the page table pointer.. */
 	movl %cr0,%eax
 	orl $0x80000000,%eax
 	movl %eax,%cr0		/* ..and set paging (PG) bit */
-	jmp 1f			/* flush the prefetch-queue */
-1:
-	movl $1f,%eax
-	jmp *%eax		/* make sure eip is relocated */
+	ljmp $__BOOT_CS,$1f	/* Clear prefetch and normalize %eip */
 1:
 	/* Set up the stack pointer */
 	lss stack_start,%esp
-#ifdef CONFIG_SMP
-	orw %bx,%bx
-	jz 1f			/* Initial CPU cleans BSS */
+
+/*
+ * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
+ * confuse the debugger if this code is traced.
+ * XXX - best to initialize before switching to protected mode.
+ */
 	pushl $0
 	popfl
+
+#ifdef CONFIG_SMP
+	andl %ebx,%ebx
+	jz 1f			/* Initial CPU cleans BSS */
 	jmp checkCPUtype
 1:
 #endif /* CONFIG_SMP */
@@ -142,21 +185,15 @@ ENTRY(startup_32)
 	movl $__bss_start,%edi
 	movl $__bss_stop,%ecx
 	subl %edi,%ecx
-	rep
-	stosb
+	shrl $2,%ecx
+	rep ; stosl
 /*
  * start system 32-bit setup. We need to re-do some of the things done
  * in 16-bit mode for the "real" operations.
  */
 	call setup_idt
-/*
- * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
-	pushl $0
-	popfl
+
 /*
  * Copy bootup parameters out of the way. First 2kB of
  * _empty_zero_page is for boot parameters, second 2kB
@@ -273,7 +310,7 @@ is386:	movl $2,%ecx		# set MP
 	call initialize_secondary
 	jmp L6
 1:
-#endif
+#endif /* CONFIG_SMP */
 	call start_kernel
 L6:
 	jmp L6			# main should never return here, but
@@ -309,6 +346,8 @@ check_x87:
  * and the kernel moved to PAGE_OFFSET. Interrupts
  * are enabled elsewhere, when we can be relatively
  * sure everything is ok.
+ *
+ * Warning: %esi is live across this function.
  */
 setup_idt:
 	lea ignore_int,%edx
@@ -332,7 +371,7 @@ ENTRY(stack_start)
 
 /* This is the default interrupt "handler" :-) */
 int_msg:
-	.asciz "Unknown interrupt\n"
+	.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
 	ALIGN
 ignore_int:
 	cld
@@ -344,9 +383,13 @@ ignore_int:
 	movl $(__KERNEL_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
+	pushl 16(%esp)
+	pushl 24(%esp)
+	pushl 32(%esp)
+	pushl 40(%esp)
 	pushl $int_msg
 	call printk
-	popl %eax
+	addl $(5*4),%esp
 	popl %ds
 	popl %es
 	popl %edx
@@ -361,10 +404,17 @@ ignore_int:
  * segment size, and 32-bit linear address value:
  */
 
+.globl boot_gdt_descr
 .globl idt_descr
 .globl cpu_gdt_descr
 
 	ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+	.word 0				# 32 bit align gdt_desc.address
+boot_gdt_descr:
+	.word __BOOT_DS+7
+	.long boot_gdt_table - __PAGE_OFFSET
+
 	.word 0				# 32-bit align idt_desc.address
 idt_descr:
 	.word IDT_ENTRIES*8-1		# idt contains 256 entries
@@ -379,41 +429,25 @@ cpu_gdt_descr:
 	.fill NR_CPUS-1,8,0		# space for the other GDT descriptors
 
 /*
- * This is initialized to create an identity-mapping at 0-8M (for bootup
- * purposes) and another mapping of the 0-8M area at virtual address
- * PAGE_OFFSET.
+ * swapper_pg_dir is the main page directory, address 0x00101000
+ *
+ * This is initialized to create an identity-mapping at 0 (for bootup
+ * purposes) and another mapping at virtual address PAGE_OFFSET.  The
+ * values put here should be all invalid (zero); the valid
+ * entries are created dynamically at boot time.
+ *
+ * The code creates enough page tables to map 0-_end, the page tables
+ * themselves, plus INIT_MAP_BEYOND_END bytes; see comment at beginning.
  */
 .org 0x1000
 ENTRY(swapper_pg_dir)
-	.long 0x00102007
-	.long 0x00103007
-	.fill BOOT_USER_PGD_PTRS-2,4,0
-	/* default: 766 entries */
-	.long 0x00102007
-	.long 0x00103007
-	/* default: 254 entries */
-	.fill BOOT_KERNEL_PGD_PTRS-2,4,0
+	.fill 1024,4,0
 
-/*
- * The page tables are initialized to only 8MB here - the final page
- * tables are set up later depending on memory size.
- */
 .org 0x2000
-ENTRY(pg0)
-
-.org 0x3000
-ENTRY(pg1)
-
-/*
- * empty_zero_page must immediately follow the page tables ! (The
- * initialization loop counts until empty_zero_page)
- */
-
-.org 0x4000
 ENTRY(empty_zero_page)
+	.fill 4096,1,0
 
-.org 0x5000
-
+.org 0x3000
 /*
  * Real beginning of normal "text" segment
  */
@@ -428,20 +462,19 @@ ENTRY(_stext)
 .data
 
 /*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_VISWS)
-/*
  * The boot_gdt_table must mirror the equivalent in setup.S and is
- * used only by the trampoline for booting other CPUs
+ * used only for booting.
  */
 	.align L1_CACHE_BYTES
 ENTRY(boot_gdt_table)
 	.fill GDT_ENTRY_BOOT_CS,8,0
 	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
 	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
-#endif
-	.align L1_CACHE_BYTES
+
+/*
+ * The Global Descriptor Table contains 28 quadwords, per-CPU.
+ */
+	.align PAGE_SIZE_asm
 ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* NULL descriptor */
 	.quad 0x0000000000000000	/* 0x0b reserved */
@@ -488,4 +521,3 @@ ENTRY(cpu_gdt_table)
 #ifdef CONFIG_SMP
 	.fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */
 #endif
-
diff -puN arch/i386/kernel/Makefile~i386-early-memory-cleanup arch/i386/kernel/Makefile
--- 25/arch/i386/kernel/Makefile~i386-early-memory-cleanup	2004-03-02 18:18:28.000000000 -0800
+++ 25-akpm/arch/i386/kernel/Makefile	2004-03-02 18:18:28.000000000 -0800
@@ -19,8 +19,7 @@ obj-$(CONFIG_X86_MSR)	+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
-obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
-obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
+obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
diff -puN arch/i386/kernel/setup.c~i386-early-memory-cleanup arch/i386/kernel/setup.c
--- 25/arch/i386/kernel/setup.c~i386-early-memory-cleanup	2004-03-02 18:18:28.000000000 -0800
+++ 25-akpm/arch/i386/kernel/setup.c	2004-03-02 18:18:28.000000000 -0800
@@ -50,6 +50,11 @@
 #include "setup_arch_pre.h"
 #include "mach_resources.h"
 
+/* This value is set up by the early boot code to point to the value
+   immediately after the boot time page tables.  It contains a *physical*
+   address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
 int disable_pse __initdata = 0;
 
 static inline char * __init machine_specific_memory_setup(void);
@@ -115,7 +120,6 @@ extern void early_cpu_init(void);
 extern void dmi_scan_machine(void);
 extern void generic_apic_probe(char *);
 extern int root_mountflags;
-extern char _end[];
 
 unsigned long saved_videomode;
 
@@ -790,7 +794,7 @@ static unsigned long __init setup_memory
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
-	start_pfn = PFN_UP(__pa(_end));
+	start_pfn = PFN_UP(init_pg_tables_end);
 
 	find_max_pfn();
 
@@ -1102,7 +1106,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-	init_mm.brk = (unsigned long) _end;
+	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
diff -puN arch/i386/kernel/trampoline.S~i386-early-memory-cleanup arch/i386/kernel/trampoline.S
--- 25/arch/i386/kernel/trampoline.S~i386-early-memory-cleanup	2004-03-02 18:18:28.000000000 -0800
+++ 25-akpm/arch/i386/kernel/trampoline.S	2004-03-02 18:18:28.000000000 -0800
@@ -23,9 +23,13 @@
  * and IP is zero.  Thus, data addresses need to be absolute
  * (no relocation) and are taken with regard to r_base.
  *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries except for the gdt one..
+ * If you work on this file, check the object module with
+ * objdump --reloc to make sure there are no relocation
+ * entries except for:
+ *
+ * TYPE     VALUE
+ * R_386_32 startup_32_smp
+ * R_386_32 boot_gdt_table
  */
 
 #include 
@@ -42,7 +46,6 @@ r_base = .
 	mov %cs, %ax	# Code and data in the same place
 	mov %ax, %ds
 
-	mov $1, %bx	# Flag an SMP trampoline
 	cli		# We should be safe anyway
 
 	movl	$0xA5A5A5A5, trampoline_data - r_base
@@ -54,22 +57,18 @@ r_base = .
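
P.S.  For anyone who wants to sanity-check the sizing arithmetic in the new
head.S comment, below is a small stand-alone C sketch.  It is illustrative
only and not part of the patch: the file name and macro names are made up
for the example, it assumes pg0 starts right at _end, and it ignores
page-alignment slop, but it mirrors the termination condition of the boot
page-table loop and reports how many 4 KB page tables get created for a
given image size.

/*
 * ptcalc.c -- illustrative only, not part of the patch.
 *
 * Rough model of the boot page-table sizing in the new head.S: the loop
 * keeps adding 4 KB page tables (each mapping 4 MB) until the mapping
 * reaches INIT_MAP_BEYOND_END bytes past the page tables themselves,
 * which sit just after _end.
 *
 * Build:  cc -o ptcalc ptcalc.c
 * Usage:  ./ptcalc [kernel_image_bytes]
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE		4096UL
#define PTRS_PER_PTE		1024UL
#define PER_TABLE_MAP		(PAGE_SIZE * PTRS_PER_PTE)	/* 4 MB */
#define INIT_MAP_BEYOND_END	(128UL * 1024UL)		/* from the patch */

int main(int argc, char **argv)
{
	/* physical size of the image up to _end; default 6 MB */
	unsigned long end = (argc > 1) ? strtoul(argv[1], NULL, 0) : 6UL << 20;
	unsigned long tables = 0, mapped = 0;

	/* mirror the "jb 10b" end condition in head.S */
	do {
		tables++;
		mapped = tables * PER_TABLE_MAP;
	} while (mapped < end + tables * PAGE_SIZE + INIT_MAP_BEYOND_END);

	printf("image size:       %lu bytes\n", end);
	printf("boot page tables: %lu (%lu bytes of memory)\n",
	       tables, tables * PAGE_SIZE);
	printf("mapped:           %lu MB\n", mapped >> 20);
	return 0;
}

For example, a 16 MB image comes out to five page tables (20 MB mapped),
which is exactly the kind of kernel the old fixed 8 MB mapping could not
handle.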