From: Andi Kleen x86-64 merge for 2.6.0test3. Without these changes an x86-64 NUMA kernel won't boot in many configurations. Main change is the improved IOMMU code which supports merging of mappings and has various bugfixes. It seems to be stable now, so I'm merging it. These are only x86-64 specific changes. I have some other changes outside arch/x86_64 that I'm sending separately. Please consider merging. - Update defconfig - Use argument ptregs in 32bit elf_core_copy_task_fpregs - Harden aperture fixup code: read aperture from the AGP bridge if needed, better error checking. - Support nmi_watchdog=panic to panic on watchdog trigger - IOMMU: Support panic on IOMMU overflow (iommu=panic) - IOMMU: Force SAC for mappings >40bits when iommu=force is active (this can potentially give better performance) - IOMMU: Cache northbridges for faster TLB flush - IOMMU: Fix SMP race in TLB flush - IOMMU: Merge pci_alloc_consistent and pci_map_single - IOMMU: Clean up leak tracing - IOMMU: Rewrite pci_map_sg, support merging of mappings. On overflow fall back to piece-by-piece mapping. - IOMMU: Tell block layer to assume merging when iommu force is active (this gives better performance with MPT Fusion, drawback is that the overflow/fragmentation handling of the IOMMU area is still a bit dubious with that) - Fix/clean up per cpu data - Add 64bit clean time(2) - Export cpu_callout_map for IPv6 - Handle nodes with no own memory in NUMA discovery. This fixes boot on various newer Opteron motherboards where the memory is only connected to a single CPU. - Fix fallback path for failed NUMA discovery. numnodes has to be reset. - Check for enabled nodes in NUMA discovery (Eric Biederman) - Remove NUMA emunodes support. Has badly bitrotted. 
- Add __clear_bit_string for IOMMU code - Add new 32bit system calls to ia32_unistd.h - Remove duplicate default_do_nmi prototype - Make PCI_DMA_BUS_IS_PHYS dependent on no_iommu - Fix padding length of siginfo_t to match glibc - More pci direct access functions. arch/x86_64/defconfig | 133 ++++++---- arch/x86_64/ia32/ia32_binfmt.c | 3 arch/x86_64/kernel/Makefile | 5 arch/x86_64/kernel/aperture.c | 194 +++++++++++++-- arch/x86_64/kernel/nmi.c | 12 arch/x86_64/kernel/pci-dma.c | 9 arch/x86_64/kernel/pci-gart.c | 484 ++++++++++++++++++++++++++++----------- arch/x86_64/kernel/pci-nommu.c | 25 +- arch/x86_64/kernel/setup64.c | 23 - arch/x86_64/kernel/sys_x86_64.c | 14 + arch/x86_64/kernel/x8664_ksyms.c | 1 arch/x86_64/mm/k8topology.c | 43 ++- arch/x86_64/mm/numa.c | 31 -- include/asm-x86_64/bitops.h | 4 include/asm-x86_64/ia32_unistd.h | 15 + include/asm-x86_64/io.h | 6 include/asm-x86_64/nmi.h | 2 include/asm-x86_64/pci-direct.h | 21 + include/asm-x86_64/pci.h | 35 +- include/asm-x86_64/percpu.h | 56 ++-- include/asm-x86_64/proto.h | 2 include/asm-x86_64/siginfo.h | 2 include/asm-x86_64/unistd.h | 2 23 files changed, 787 insertions(+), 335 deletions(-) diff -puN arch/x86_64/defconfig~x86_64-merge-test3 arch/x86_64/defconfig --- 25/arch/x86_64/defconfig~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/defconfig 2003-08-10 14:38:32.000000000 -0700 @@ -22,7 +22,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y -CONFIG_LOG_BUF_SHIFT=16 +CONFIG_LOG_BUF_SHIFT=18 # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y CONFIG_FUTEX=y @@ -149,17 +149,12 @@ CONFIG_LBD=y # ATA/ATAPI/MFM/RLL support # CONFIG_IDE=y - -# -# IDE, ATA and ATAPI Block devices -# CONFIG_BLK_DEV_IDE=y # # Please see Documentation/ide.txt for help/info on IDE drives # # CONFIG_BLK_DEV_HD_IDE is not set -# CONFIG_BLK_DEV_HD is not set CONFIG_BLK_DEV_IDEDISK=y CONFIG_IDEDISK_MULTI_MODE=y # CONFIG_IDEDISK_STROKE is not set @@ -174,15 +169,16 @@ 
CONFIG_BLK_DEV_IDECD=y # # CONFIG_BLK_DEV_CMD640 is not set CONFIG_BLK_DEV_IDEPCI=y -# CONFIG_BLK_DEV_GENERIC is not set # CONFIG_IDEPCI_SHARE_IRQ is not set +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_GENERIC is not set +# CONFIG_BLK_DEV_OPTI621 is not set +# CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDE_TCQ is not set -# CONFIG_BLK_DEV_OFFBOARD is not set # CONFIG_BLK_DEV_IDEDMA_FORCED is not set CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set -CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_PCI_WIP is not set CONFIG_BLK_DEV_ADMA=y # CONFIG_BLK_DEV_AEC62XX is not set @@ -192,23 +188,25 @@ CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_TRIFLEX is not set # CONFIG_BLK_DEV_CY82C693 is not set # CONFIG_BLK_DEV_CS5520 is not set +# CONFIG_BLK_DEV_CS5530 is not set # CONFIG_BLK_DEV_HPT34X is not set # CONFIG_BLK_DEV_HPT366 is not set # CONFIG_BLK_DEV_SC1200 is not set # CONFIG_BLK_DEV_PIIX is not set # CONFIG_BLK_DEV_NS87415 is not set -# CONFIG_BLK_DEV_OPTI621 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set # CONFIG_BLK_DEV_PDC202XX_NEW is not set -# CONFIG_BLK_DEV_RZ1000 is not set # CONFIG_BLK_DEV_SVWKS is not set # CONFIG_BLK_DEV_SIIMAGE is not set # CONFIG_BLK_DEV_SIS5513 is not set # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set -CONFIG_IDEDMA_AUTO=y +CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_DMA_NONPCI is not set +# CONFIG_BLK_DEV_HD is not set # # SCSI device support @@ -251,7 +249,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_EATA_PIO is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set # CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_IPS is not set +CONFIG_SCSI_IPS=m # CONFIG_SCSI_INITIO is not set # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set @@ -301,7 +299,6 @@ CONFIG_NET=y CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set # CONFIG_NETLINK_DEV is not set -# CONFIG_NETFILTER is not set 
CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -317,12 +314,10 @@ CONFIG_IP_MULTICAST=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set -CONFIG_IPV6=y -CONFIG_IPV6_PRIVACY=y -# CONFIG_INET6_AH is not set -# CONFIG_INET6_ESP is not set -# CONFIG_INET6_IPCOMP is not set -# CONFIG_IPV6_TUNNEL is not set +# CONFIG_IPV6 is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_NETFILTER is not set # CONFIG_XFRM_USER is not set # @@ -333,8 +328,6 @@ CONFIG_IPV6_SCTP__=y # CONFIG_ATM is not set # CONFIG_VLAN_8021Q is not set # CONFIG_LLC is not set -# CONFIG_DECNET is not set -# CONFIG_BRIDGE is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set # CONFIG_NET_DIVERT is not set @@ -546,11 +539,7 @@ CONFIG_UNIX98_PTY_COUNT=256 # # IPMI # -CONFIG_IPMI_HANDLER=y -CONFIG_IPMI_PANIC_EVENT=y -CONFIG_IPMI_DEVICE_INTERFACE=y -CONFIG_IPMI_KCS=y -CONFIG_IPMI_WATCHDOG=y +# CONFIG_IPMI_HANDLER is not set # # Watchdog Cards @@ -570,12 +559,7 @@ CONFIG_RTC=y # CONFIG_FTAPE is not set CONFIG_AGP=y CONFIG_AGP_AMD_8151=y -CONFIG_DRM=y -# CONFIG_DRM_TDFX is not set -# CONFIG_DRM_GAMMA is not set -# CONFIG_DRM_R128 is not set -CONFIG_DRM_RADEON=y -# CONFIG_DRM_MGA is not set +# CONFIG_DRM is not set # CONFIG_MWAVE is not set CONFIG_RAW_DRIVER=y CONFIG_HANGCHECK_TIMER=y @@ -598,19 +582,25 @@ CONFIG_HANGCHECK_TIMER=y # File systems # CONFIG_EXT2_FS=y -# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +# CONFIG_EXT2_FS_SECURITY is not set CONFIG_EXT3_FS=y -# CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +# CONFIG_EXT3_FS_SECURITY is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set # CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_JFS_FS is not set -CONFIG_XFS_FS=m -# CONFIG_XFS_RT is not set -# CONFIG_XFS_QUOTA is not set -# CONFIG_XFS_POSIX_ACL is not set 
+CONFIG_JFS_FS=y +CONFIG_JFS_POSIX_ACL=y +# CONFIG_JFS_DEBUG is not set +# CONFIG_JFS_STATISTICS is not set +CONFIG_FS_POSIX_ACL=y +# CONFIG_XFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set # CONFIG_QUOTA is not set @@ -684,6 +674,49 @@ CONFIG_SUNRPC=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y +CONFIG_NLS=y + +# +# Native Language Support +# +CONFIG_NLS_DEFAULT="iso8859-1" +# CONFIG_NLS_CODEPAGE_437 is not set +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +# CONFIG_NLS_ISO8859_1 is not set +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_UTF8 is not set # # Graphics support @@ -759,8 +792,10 @@ CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_INIT_DEBUG is not set +# CONFIG_DEBUG_INFO is not set # CONFIG_FRAME_POINTER is not set 
-# CONFIG_IOMMU_DEBUG is not set +CONFIG_IOMMU_DEBUG=y +CONFIG_IOMMU_LEAK=y CONFIG_MCE_DEBUG=y # @@ -771,21 +806,7 @@ CONFIG_MCE_DEBUG=y # # Cryptographic options # -CONFIG_CRYPTO=y -# CONFIG_CRYPTO_HMAC is not set -# CONFIG_CRYPTO_NULL is not set -# CONFIG_CRYPTO_MD4 is not set -CONFIG_CRYPTO_MD5=y -# CONFIG_CRYPTO_SHA1 is not set -# CONFIG_CRYPTO_SHA256 is not set -# CONFIG_CRYPTO_SHA512 is not set -# CONFIG_CRYPTO_DES is not set -# CONFIG_CRYPTO_BLOWFISH is not set -# CONFIG_CRYPTO_TWOFISH is not set -# CONFIG_CRYPTO_SERPENT is not set -# CONFIG_CRYPTO_AES is not set -# CONFIG_CRYPTO_DEFLATE is not set -# CONFIG_CRYPTO_TEST is not set +# CONFIG_CRYPTO is not set # # Library routines diff -puN arch/x86_64/ia32/ia32_binfmt.c~x86_64-merge-test3 arch/x86_64/ia32/ia32_binfmt.c --- 25/arch/x86_64/ia32/ia32_binfmt.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/ia32/ia32_binfmt.c 2003-08-10 14:38:32.000000000 -0700 @@ -204,10 +204,9 @@ static inline int elf_core_copy_task_reg } static inline int -elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *xregs, elf_fpregset_t *fpu) +elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) { struct _fpstate_ia32 *fpstate = (void*)fpu; - struct pt_regs *regs = (struct pt_regs *)(tsk->thread.rsp0); mm_segment_t oldfs = get_fs(); if (!tsk->used_math) diff -puN arch/x86_64/kernel/aperture.c~x86_64-merge-test3 arch/x86_64/kernel/aperture.c --- 25/arch/x86_64/kernel/aperture.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/aperture.c 2003-08-10 14:38:32.000000000 -0700 @@ -1,14 +1,14 @@ /* * Firmware replacement code. * - * Work around broken BIOSes that don't set an aperture. - * The IOMMU code needs an aperture even who no AGP is present in the system. - * Map the aperture over some low memory. This is cheaper than doing bounce - * buffering. The memory is lost. 
This is done at early boot because only - * the bootmem allocator can allocate 32+MB. + * Work around broken BIOSes that don't set an aperture or only set the + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. * * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.2 2002/09/19 19:25:32 ak Exp $ + * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ */ #include #include @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -45,10 +47,10 @@ static u32 __init allocate_aperture(void aper_size = (32 * 1024 * 1024) << fallback_aper_order; /* - * Aperture has to be naturally aligned it seems. This means an - * 2GB aperture won't have much changes to succeed in the lower 4GB of - * memory. Unfortunately we cannot move it up because that would make - * the IOMMU useless. + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chances to find a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. */ p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { @@ -63,21 +65,136 @@ static u32 __init allocate_aperture(void return (u32)__pa(p); } +static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +{ + if (!aper_base) + return 0; + if (aper_size < 64*1024*1024) { + printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + return 0; + } + if (aper_base + aper_size >= 0xffffffff) { + printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + return 0; + } + if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk("Aperture from %s pointing to e820 RAM. 
Ignoring.\n",name); + return 0; + } + return 1; +} + +/* Find a PCI capability */ +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ + if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + return 0; + u8 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); + int bytes; + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + pos &= ~3; + u8 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +{ + printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + u32 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + + if (apsizereg == 0xffffffff) { + printk("APSIZE in AGP bridge unreadable\n"); + return 0; + } + + u32 apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + int nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + u32 aper_low = read_pci_config(num,slot,func, 0x10); + u32 aper_hi = read_pci_config(num,slot,func,0x14); + u64 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); + + if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + return 0; + return (u32)aper; +} + +/* Look for an AGP bridge. Windows only expects the aperture in the + AGP bridge and some BIOS forget to initialize the Northbridge too. + Work around this here. + + Do an PCI bus scan by hand because we're running before the PCI + subsystem. + + All K8 AGP bridges are AGPv3 compliant, so we can do this scan + generically. 
It's probably overkill to always scan all slots because + the AGP bridges should be always an own bus on the HT hierarchy, + but do it here for future safety. */ +static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int num, slot, func; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(num,slot,func,cap,order); + } + + /* No multi-function device? */ + u8 type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + printk("No AGP bridge found\n"); + return 0; +} + void __init iommu_hole_init(void) { int fix, num; - u32 aper_size, aper_alloc, aper_order; + u32 aper_size, aper_alloc = 0, aper_order; u64 aper_base; - - if (no_iommu) - return; - if (end_pfn < (0xffffffff>>PAGE_SHIFT) && !force_mmu) - return; + int valid_agp = 0; printk("Checking aperture...\n"); fix = 0; for (num = 24; num < 32; num++) { + char name[30]; if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) continue; @@ -86,15 +203,12 @@ void __init iommu_hole_init(void) aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; aper_base <<= 25; - printk("CPU %d: aperture @ %Lx size %u KB\n", num-24, - aper_base, aper_size>>10); - if (!aper_base || aper_base + aper_size >= 0xffffffff) { - fix = 1; - break; - } + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + aper_base, aper_size>>20); - if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. 
Ignoring.\n"); + sprintf(name, "northbridge cpu %d", num-24); + + if (!aperture_valid(name, aper_base, aper_size)) { fix = 1; break; } @@ -103,12 +217,40 @@ void __init iommu_hole_init(void) if (!fix && !fallback_aper_force) return; + if (!fallback_aper_force) + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || + force_iommu || + valid_agp || + fallback_aper_force) { + /* When there is a AGP bridge in the system assume the + user wants to use the AGP driver too and needs an + aperture. However this case (AGP but no good + aperture) should only happen with a more broken than + usual BIOS, because it would even break Windows. */ + printk("Your BIOS doesn't leave a aperture memory hole\n"); printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", 32 << fallback_aper_order); + + aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); - if (!aper_alloc) + if (!aper_alloc) { + /* Could disable AGP and IOMMU here, but it's probably + not worth it. But the later users cannot deal with + bad apertures and turning on the aperture over memory + causes very strange problems, so it's better to + panic early. */ + panic("Not enough memory for aperture"); + } + } else { return; + } + /* Fix up the north bridges */ for (num = 24; num < 32; num++) { if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) continue; @@ -116,7 +258,7 @@ void __init iommu_hole_init(void) /* Don't enable translation yet. That is done later. 
Assume this BIOS didn't initialise the GART so just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, fallback_aper_order<<1); + write_pci_config(0, num, 3, 0x90, aper_order<<1); write_pci_config(0, num, 3, 0x94, aper_alloc>>25); } } diff -puN arch/x86_64/kernel/Makefile~x86_64-merge-test3 arch/x86_64/kernel/Makefile --- 25/arch/x86_64/kernel/Makefile~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/Makefile 2003-08-10 14:38:32.000000000 -0700 @@ -6,7 +6,7 @@ extra-y := head.o head64.o init_task.o EXTRA_AFLAGS := -traditional obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_x86_64.o \ - pci-dma.o x8664_ksyms.o i387.o syscall.o vsyscall.o \ + x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bluesmoke.o bootflag.o e820.o reboot.o warmreboot.o obj-$(CONFIG_MTRR) += mtrr/ @@ -19,7 +19,8 @@ obj-$(CONFIG_X86_IO_APIC) += io_apic.o m obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o -obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o +obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o + obj-$(CONFIG_MODULES) += module.o $(obj)/bootflag.c: diff -puN arch/x86_64/kernel/nmi.c~x86_64-merge-test3 arch/x86_64/kernel/nmi.c --- 25/arch/x86_64/kernel/nmi.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/nmi.c 2003-08-10 14:38:32.000000000 -0700 @@ -40,6 +40,7 @@ * -1: the lapic NMI watchdog is disabled, but can be enabled */ static int nmi_active; +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_IO_APIC; static unsigned int nmi_hz = HZ; @@ -115,6 +116,14 @@ static int __init setup_nmi_watchdog(cha { int nmi; + if (!strncmp(str,"panic",5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + get_option(&str, &nmi); if (nmi >= NMI_INVALID) @@ -327,6 +336,8 @@ void nmi_watchdog_tick 
(struct pt_regs * bust_spinlocks(1); printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); show_registers(regs); + if (panic_on_timeout) + panic("nmi watchdog"); printk("console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); @@ -374,3 +385,4 @@ EXPORT_SYMBOL(disable_lapic_nmi_watchdog EXPORT_SYMBOL(enable_lapic_nmi_watchdog); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); +EXPORT_SYMBOL(touch_nmi_watchdog); diff -puN arch/x86_64/kernel/pci-dma.c~x86_64-merge-test3 arch/x86_64/kernel/pci-dma.c --- 25/arch/x86_64/kernel/pci-dma.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/pci-dma.c 2003-08-10 14:38:32.000000000 -0700 @@ -9,8 +9,6 @@ #include #include -dma_addr_t bad_dma_address = -1UL; - /* Map a set of buffers described by scatterlist in streaming * mode for DMA. This is the scatter-gather version of the * above pci_map_single interface. Here the scatter gather list @@ -34,16 +32,9 @@ int pci_map_sg(struct pci_dev *hwdev, st BUG_ON(direction == PCI_DMA_NONE); for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = pci_map_page(hwdev, s->page, s->offset, s->length, direction); - - if (unlikely(s->dma_address == bad_dma_address)) { - pci_unmap_sg(hwdev, sg, i, direction); - return 0; - } } return nents; } diff -puN arch/x86_64/kernel/pci-gart.c~x86_64-merge-test3 arch/x86_64/kernel/pci-gart.c --- 25/arch/x86_64/kernel/pci-gart.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/pci-gart.c 2003-08-10 14:38:32.000000000 -0700 @@ -8,20 +8,8 @@ * See Documentation/DMA-mapping.txt for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: pci-gart.c,v 1.20 2003/03/12 08:23:29 ak Exp $ */ -/* - * Notebook: - -possible future tuning: - fast path for sg streaming mappings - only take the locks once. 
- more intelligent flush strategy - flush only the NB of the CPU directly - connected to the device? - move boundary between IOMMU and AGP in GART dynamically - -*/ - #include #include #include @@ -32,6 +20,8 @@ possible future tuning: #include #include #include +#include +#include #include #include #include @@ -41,6 +31,8 @@ possible future tuning: #include #include +dma_addr_t bad_dma_address; + unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ static unsigned long iommu_pages; /* .. and in pages */ @@ -50,9 +42,13 @@ u32 *iommu_gatt_base; /* Remapping tab int no_iommu; static int no_agp; #ifdef CONFIG_IOMMU_DEBUG -int force_mmu = 1; +int panic_on_overflow = 1; +int force_iommu = 1; +int sac_force_size = 0; #else -int force_mmu = 0; +int panic_on_overflow = 1; /* for testing */ +int force_iommu = 0; +int sac_force_size = 256*1024*1024; #endif /* Allocation bitmap for the remapping area */ @@ -65,12 +61,18 @@ static unsigned long *iommu_gart_bitmap; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) +#define to_pages(addr,size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + #define for_all_nb(dev) \ - dev=NULL; \ - while ((dev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) \ - if (dev->bus->number == 0 && PCI_FUNC(dev->devfn) == 3 && \ + dev = NULL; \ + while ((dev = pci_find_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ + if (dev->bus->number == 0 && \ (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) +static struct pci_dev *northbridges[NR_CPUS + 1]; +static u32 northbridge_flush_word[NR_CPUS + 1]; + #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -85,15 +87,16 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static int need_flush; /* 
global flush state. set for each gart wrap */ -static unsigned long alloc_iommu(int size, int *flush) +static unsigned long alloc_iommu(int size) { unsigned long offset, flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); if (offset == -1) { - *flush = 1; + need_flush = 1; offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); } if (offset != -1) { @@ -101,7 +104,7 @@ static unsigned long alloc_iommu(int siz next_bit = offset+size; if (next_bit >= iommu_pages) { next_bit = 0; - *flush = 1; + need_flush = 1; } } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -110,32 +113,59 @@ static unsigned long alloc_iommu(int siz static void free_iommu(unsigned long offset, int size) { + if (size == 1) { + clear_bit(offset, iommu_gart_bitmap); + return; + } unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - clear_bit_string(iommu_gart_bitmap, offset, size); + __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } -static inline void flush_gart(void) +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void __flush_gart(struct pci_dev *dev) { - struct pci_dev *nb; - for_all_nb(nb) { - u32 flag; - pci_read_config_dword(nb, 0x9c, &flag); /* could cache this */ - /* could complain for PTE walk errors here (bit 1 of flag) */ - flag |= 1; - pci_write_config_dword(nb, 0x9c, flag); + unsigned long flags; + int bus = dev ? dev->bus->number : -1; + int flushed = 0; + int i; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + /* recheck flush count inside lock */ + if (need_flush) { + for (i = 0; northbridges[i]; i++) { + if (bus >= 0 && !(pcibus_to_cpumask(bus) & (1UL << i))) + continue; + pci_write_config_dword(northbridges[i], 0x9c, + northbridge_flush_word[i] | 1); + flushed++; + } + if (!flushed) + printk("nothing to flush? 
%d\n", bus); + need_flush = 0; } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } +static inline void flush_gart(struct pci_dev *dev) +{ + if (need_flush) + __flush_gart(dev); +} + +/* + * Allocate memory for a consistent mapping. + * All mappings are consistent here, so this is just a wrapper around + * pci_map_single. + */ void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle) { void *memory; int gfp = GFP_ATOMIC; - int i; - int flush = 0; - unsigned long iommu_page; unsigned long dma_mask; if (hwdev == NULL) { @@ -149,20 +179,14 @@ void *pci_alloc_consistent(struct pci_de if (dma_mask < 0xffffffff || no_iommu) gfp |= GFP_DMA; - /* - * First try to allocate continuous and use directly if already - * in lowmem. - */ - size = round_up(size, PAGE_SIZE); memory = (void *)__get_free_pages(gfp, get_order(size)); if (memory == NULL) { return NULL; } else { - int high = 0, mmu; - if (((unsigned long)virt_to_bus(memory) + size) > dma_mask) - high = 1; - mmu = 1; - if (force_mmu && !(gfp & GFP_DMA)) + int high, mmu; + high = ((unsigned long)virt_to_bus(memory) + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) mmu = 1; if (no_iommu) { if (high) goto error; @@ -175,27 +199,15 @@ void *pci_alloc_consistent(struct pci_de } } - size >>= PAGE_SHIFT; - - iommu_page = alloc_iommu(size, &flush); - if (iommu_page == -1) + *dma_handle = pci_map_single(hwdev, memory, size, 0); + if (*dma_handle == bad_dma_address) goto error; - /* Fill in the GATT */ - for (i = 0; i < size; i++) { - unsigned long phys_mem; - void *mem = memory + i*PAGE_SIZE; - phys_mem = virt_to_phys(mem); - BUG_ON(phys_mem & ~PHYSICAL_PAGE_MASK); - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - } - - if (flush) - flush_gart(); - *dma_handle = iommu_bus_base + (iommu_page << PAGE_SHIFT); return memory; - error: +error: + if (panic_on_overflow) + panic("pci_map_single: overflow %lu bytes\n", size); free_pages((unsigned long)memory, 
get_order(size)); return NULL; } @@ -207,25 +219,17 @@ void *pci_alloc_consistent(struct pci_de void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t bus) { - unsigned long iommu_page; - - size = round_up(size, PAGE_SIZE); - if (bus >= iommu_bus_base && bus <= iommu_bus_base + iommu_size) { - unsigned pages = size >> PAGE_SHIFT; - int i; - iommu_page = (bus - iommu_bus_base) >> PAGE_SHIFT; - vaddr = __va(GPTE_DECODE(iommu_gatt_base[iommu_page])); - for (i = 0; i < pages; i++) { - u64 pte = iommu_gatt_base[iommu_page + i]; - BUG_ON((pte & GPTE_VALID) == 0); - iommu_gatt_base[iommu_page + i] = 0; - } - free_iommu(iommu_page, pages); - } + pci_unmap_single(hwdev, bus, size, 0); free_pages((unsigned long)vaddr, get_order(size)); } #ifdef CONFIG_IOMMU_LEAK + +#define SET_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0); +#define CLEAR_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = 0; + /* Debugging aid for drivers that don't free their IOMMU tables */ static void **iommu_leak_tab; static int leak_trace; @@ -246,9 +250,12 @@ void dump_leak(void) } printk("\n"); } +#else +#define SET_LEAK(x) +#define CLEAR_LEAK(x) #endif -static void iommu_full(struct pci_dev *dev, void *addr, size_t size, int dir) +static void iommu_full(struct pci_dev *dev, size_t size, int dir) { /* * Ran out of IOMMU space for this operation. This is very bad. @@ -261,8 +268,8 @@ static void iommu_full(struct pci_dev *d */ printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %p size %lu at device %s[%s]\n", - addr,size, dev ? dev->dev.name : "?", dev ? pci_name(dev) : "?"); + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s[%s]\n", + size, dev ? dev->dev.name : "?", dev ? 
dev->slot_name : "?"); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -279,24 +286,61 @@ static void iommu_full(struct pci_dev *d static inline int need_iommu(struct pci_dev *dev, unsigned long addr, size_t size) { u64 mask = dev ? dev->dma_mask : 0xffffffff; - int high = (~mask & (unsigned long)(addr + size)) != 0; + int high = addr + size >= mask; int mmu = high; - if (force_mmu) + if (force_iommu) mmu = 1; if (no_iommu) { if (high) - panic("pci_map_single: high address but no IOMMU.\n"); + panic("PCI-DMA: high address but no IOMMU.\n"); + mmu = 0; + } + return mmu; +} + +static inline int nonforced_iommu(struct pci_dev *dev, unsigned long addr, size_t size) +{ + u64 mask = dev ? dev->dma_mask : 0xffffffff; + int high = addr + size >= mask; + int mmu = high; + if (no_iommu) { + if (high) + panic("PCI-DMA: high address but no IOMMU.\n"); mmu = 0; } return mmu; } +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. 
+ */ +static dma_addr_t pci_map_area(struct pci_dev *dev, unsigned long phys_mem, + size_t size, int dir) +{ + unsigned long npages = to_pages(phys_mem, size); + unsigned long iommu_page = alloc_iommu(npages); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("pci_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_address; + } + + int i; + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + SET_LEAK(iommu_page + i); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ dma_addr_t pci_map_single(struct pci_dev *dev, void *addr, size_t size, int dir) { - unsigned long iommu_page; unsigned long phys_mem, bus; - int i, npages; - int flush = 0; BUG_ON(dir == PCI_DMA_NONE); @@ -304,39 +348,158 @@ dma_addr_t pci_map_single(struct pci_dev if (!need_iommu(dev, phys_mem, size)) return phys_mem; - npages = round_up(size + ((u64)addr & ~PAGE_MASK), PAGE_SIZE) >> PAGE_SHIFT; + bus = pci_map_area(dev, phys_mem, size, dir); + flush_gart(dev); + return bus; +} - iommu_page = alloc_iommu(npages, &flush); - if (iommu_page == -1) { - iommu_full(dev, addr, size, dir); - return iommu_bus_base; +/* Fallback for pci_map_sg in case of overflow */ +static int pci_map_sg_nonforce(struct pci_dev *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + +#ifdef CONFIG_IOMMU_DEBUG + printk(KERN_DEBUG "pci_map_sg overflow\n"); +#endif + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + unsigned long addr = page_to_phys(s->page) + s->offset; + if (nonforced_iommu(dev, addr, s->length)) { + addr = pci_map_area(dev, addr, s->length, dir); + if (addr == bad_dma_address) { + if (i > 0) + pci_unmap_sg(dev, sg, i, dir); + nents = 0; + break; + } + } + s->dma_address = addr; + } + flush_gart(dev); + return nents; +} + +/* Map multiple 
scatterlist entries continuous into the first. */ +static int __pci_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(pages); + if (iommu_start == -1) + return -1; + + unsigned long iommu_page = iommu_start; + int i; + + for (i = start; i < stopat; i++) { + struct scatterlist *s = &sg[i]; + unsigned long start_addr = s->dma_address; + BUG_ON(i > 0 && s->offset); + if (i == start) { + *sout = *s; + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + } else { + sout->length += s->length; + } + unsigned long addr = start_addr; + while (addr < start_addr + s->length) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + SET_LEAK(iommu_page); + addr += PAGE_SIZE; + iommu_page++; } + BUG_ON(i > 0 && addr % PAGE_SIZE); + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; +} - phys_mem &= PAGE_MASK; - for (i = 0; i < npages; i++, phys_mem += PAGE_SIZE) { - BUG_ON(phys_mem & ~PHYSICAL_PAGE_MASK); +static inline int pci_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, + unsigned long pages, int need) +{ + if (!need) { + BUG_ON(stopat - start != 1); + *sout = sg[start]; + return 0; + } + return __pci_map_cont(sg, start, stopat, sout, pages); +} + +#define PCI_NO_MERGE 0 - /* - * Set coherent mapping here to avoid needing to flush - * the caches on mapping. +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. 
*/ - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); +int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + int out; + int start; + unsigned long pages = 0; + int need = 0; -#ifdef CONFIG_IOMMU_LEAK - /* XXX need eventually caller of pci_map_sg */ - if (iommu_leak_tab) - iommu_leak_tab[iommu_page + i] = __builtin_return_address(0); -#endif + unsigned long size = 0; + + BUG_ON(dir == PCI_DMA_NONE); + if (nents == 0) + return 0; + out = 0; + start = 0; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + dma_addr_t addr = page_to_phys(s->page) + s->offset; + s->dma_address = addr; + BUG_ON(s->length == 0); + + size += s->length; + + /* Handle the previous not yet processed entries */ + if (i > start) { + struct scatterlist *ps = &sg[i-1]; + /* Can only merge when the last chunk ends on a page + boundary. */ + if (PCI_NO_MERGE || !need || (i-1 > start && ps->offset) || + (ps->offset + ps->length) % PAGE_SIZE) { + if (pci_map_cont(sg, start, i, sg+out, pages, + need) < 0) + goto error; + out++; + pages = 0; + start = i; + } } - if (flush) - flush_gart(); - bus = iommu_bus_base + iommu_page*PAGE_SIZE; - return bus + ((unsigned long)addr & ~PAGE_MASK); + need = need_iommu(dev, addr, s->length); + pages += to_pages(s->offset, s->length); + } + if (pci_map_cont(sg, start, i, sg+out, pages, need) < 0) + goto error; + out++; + flush_gart(dev); + if (out < nents) + sg[out].length = 0; + return out; + +error: + flush_gart(NULL); + pci_unmap_sg(dev, sg, nents, dir); + /* When it was forced try again unforced */ + if (force_iommu) + return pci_map_sg_nonforce(dev, sg, nents, dir); + if (panic_on_overflow) + panic("pci_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); + for (i = 0; i < nents; i++) + sg[i].dma_address = bad_dma_address; + return 0; } /* - * Free a temporary PCI mapping. + * Free a PCI mapping. 
*/ void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction) @@ -347,20 +510,68 @@ void pci_unmap_single(struct pci_dev *hw dma_addr > iommu_bus_base + iommu_size) return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = round_up(size + (dma_addr & ~PAGE_MASK), PAGE_SIZE) >> PAGE_SHIFT; + npages = to_pages(dma_addr, size); int i; for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = 0; -#ifdef CONFIG_IOMMU_LEAK - if (iommu_leak_tab) - iommu_leak_tab[iommu_page + i] = 0; -#endif + CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +void pci_unmap_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, + int dir) +{ + int i; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + if (!s->length) + break; + pci_unmap_single(dev, s->dma_address, s->length, dir); + } +} + +int pci_dma_supported(struct pci_dev *dev, u64 mask) +{ + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_consistent. + The caller just has to use GFP_DMA in this case. */ + if (mask < 0x00ffffff) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. + This allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area + and return DAC as fallback address the device may not handle it correctly. + As a compromise we only do this if the IOMMU area is >= 256MB + which should make overflow unlikely enough. + + As a special case some controllers have a 39bit address mode + that is as efficient as 32bit (aic79xx). Don't force SAC for these. + Assume all masks <= 40 bits are of this type. Normally this doesn't + make any difference, but gives more gentle handling of IOMMU overflow. 
*/ + if (force_iommu && (mask > 0xffffffffffULL) && (iommu_size >= sac_force_size)){ + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->slot_name,mask); + return 0; + } + + if (no_iommu && (mask < (end_pfn << PAGE_SHIFT))) + return 0; + + return 1; +} + +EXPORT_SYMBOL(pci_unmap_sg); +EXPORT_SYMBOL(pci_map_sg); EXPORT_SYMBOL(pci_map_single); EXPORT_SYMBOL(pci_unmap_single); +EXPORT_SYMBOL(pci_dma_supported); +EXPORT_SYMBOL(no_iommu); +EXPORT_SYMBOL(force_iommu); static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) { @@ -452,13 +663,12 @@ static __init int init_k8_gatt(struct ag pci_write_config_dword(dev, 0x90, ctl); } - flush_gart(); + flush_gart(NULL); - printk("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); + printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); return 0; nommu: - /* XXX: reject 0xffffffff mask now in pci mapping functions */ printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); return -1; @@ -466,11 +676,12 @@ static __init int init_k8_gatt(struct ag extern int agp_amdk8_init(void); -int __init pci_iommu_init(void) +static int __init pci_iommu_init(void) { struct agp_kern_info info; unsigned long aper_size; unsigned long iommu_start; + struct pci_dev *dev; #ifndef CONFIG_AGP_AMD_8151 no_agp = 1; @@ -482,7 +693,7 @@ int __init pci_iommu_init(void) (agp_copy_info(&info) < 0); #endif - if (no_iommu || (!force_mmu && end_pfn < 0xffffffff>>PAGE_SHIFT)) { + if (no_iommu || (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT)) { printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); no_iommu = 1; return -1; @@ -492,7 +703,7 @@ int __init pci_iommu_init(void) int err = -1; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); no_agp = 1; - if (force_mmu || end_pfn >= 0xffffffff>>PAGE_SHIFT) + if (force_iommu || end_pfn >= 0xffffffff>>PAGE_SHIFT) err = init_k8_gatt(&info); if (err < 0) { printk(KERN_INFO "PCI-DMA: Disabling 
IOMMU.\n"); @@ -529,25 +740,38 @@ int __init pci_iommu_init(void) set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); agp_memory_reserved = iommu_size; - printk(KERN_INFO"PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + printk(KERN_INFO + "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size>>20); iommu_start = aper_size - iommu_size; iommu_bus_base = info.aper_base + iommu_start; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); bad_dma_address = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* - * Unmap the IOMMU part of the GART. The alias of the page is always mapped - * with cache enabled and there is no full cache coherency across the GART - * remapping. The unmapping avoids automatic prefetches from the CPU - * allocating cache lines in there. All CPU accesses are done via the - * direct mapping to the backing memory. The GART address is only used by PCI + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI * devices. */ clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - flush_gart(); + for_all_nb(dev) { + u32 flag; + int cpu = PCI_SLOT(dev->devfn) - 24; + if (cpu >= NR_CPUS) + continue; + northbridges[cpu] = dev; + + pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ + northbridge_flush_word[cpu] = flag; + } + + flush_gart(NULL); return 0; } @@ -561,8 +785,8 @@ fs_initcall(pci_iommu_init); off don't use the IOMMU leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Should be fastest. 
- force Force IOMMU and turn on unmap debugging. + noforce don't force IOMMU usage. Default. + force Force IOMMU. */ __init int iommu_setup(char *opt) { @@ -575,15 +799,19 @@ __init int iommu_setup(char *opt) if (!memcmp(p,"off", 3)) no_iommu = 1; if (!memcmp(p,"force", 5)) - force_mmu = 1; + force_iommu = 1; if (!memcmp(p,"noforce", 7)) - force_mmu = 0; + force_iommu = 0; if (!memcmp(p, "memaper", 7)) { fallback_aper_force = 1; p += 7; if (*p == '=' && get_option(&p, &arg)) fallback_aper_order = arg; } + if (!memcmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!memcmp(p, "nopanic", 7)) + panic_on_overflow = 0; #ifdef CONFIG_IOMMU_LEAK if (!memcmp(p,"leak", 4)) { leak_trace = 1; diff -puN arch/x86_64/kernel/pci-nommu.c~x86_64-merge-test3 arch/x86_64/kernel/pci-nommu.c --- 25/arch/x86_64/kernel/pci-nommu.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/pci-nommu.c 2003-08-10 14:38:32.000000000 -0700 @@ -33,15 +33,30 @@ void pci_free_consistent(struct pci_dev free_pages((unsigned long)vaddr, get_order(size)); } +int pci_dma_supported(struct pci_dev *hwdev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + * RED-PEN this won't work for pci_map_single. Caller has to + * use GFP_DMA in the first place. + */ + if (mask < 0x00ffffff) + return 0; -static void __init check_ram(void) + return 1; +} + +EXPORT_SYMBOL(pci_dma_supported); + +static int __init check_ram(void) { if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { printk(KERN_ERR "WARNING more than 4GB of memory but no IOMMU.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); - /* Could play with highmem_start_page here to trick some subsystems - into bounce buffers. Unfortunately that would require setting - CONFIG_HIGHMEM too. 
- */ } + return 0; } +__initcall(check_ram); + diff -puN arch/x86_64/kernel/setup64.c~x86_64-merge-test3 arch/x86_64/kernel/setup64.c --- 25/arch/x86_64/kernel/setup64.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/setup64.c 2003-08-10 14:38:32.000000000 -0700 @@ -131,14 +131,16 @@ void __init setup_per_cpu_areas(void) size = PERCPU_ENOUGH_ROOM; #endif - /* We don't support CPU hotplug, so only allocate as much as needed here */ - - int maxi = max_t(unsigned, numnodes, num_online_cpus()); - - for (i = 0; i < maxi; i++) { + for (i = 0; i < NR_CPUS; i++) { + unsigned char *ptr; /* If possible allocate on the node of the CPU. In case it doesn't exist round-robin nodes. */ - unsigned char *ptr = alloc_bootmem_node(NODE_DATA(i % numnodes), size); + if (!NODE_DATA(i % numnodes)) { + printk("cpu with no node %d, numnodes %d\n", i, numnodes); + ptr = alloc_bootmem(size); + } else { + ptr = alloc_bootmem_node(NODE_DATA(i % numnodes), size); + } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); cpu_pda[i].data_offset = ptr - __per_cpu_start; @@ -158,7 +160,6 @@ void pda_init(int cpu) pda->me = pda; pda->cpunumber = cpu; pda->irqcount = -1; - pda->data_offset = 0; pda->kernelstack = (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; pda->active_mm = &init_mm; @@ -170,14 +171,14 @@ void pda_init(int cpu) pda->irqstackptr = boot_cpu_stack; level4 = init_level4_pgt; } else { + level4 = (pml4_t *)__get_free_pages(GFP_ATOMIC, 0); + if (!level4) + panic("Cannot allocate top level page for cpu %d", cpu); pda->irqstackptr = (char *) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d\n", cpu); - level4 = (pml4_t *)__get_free_pages(GFP_ATOMIC, 0); + panic("cannot allocate irqstack for cpu %d", cpu); } - if (!level4) - panic("Cannot allocate top level page for cpu %d", cpu); pda->level4_pgt = (unsigned long *)level4; if (level4 != init_level4_pgt) diff -puN 
arch/x86_64/kernel/sys_x86_64.c~x86_64-merge-test3 arch/x86_64/kernel/sys_x86_64.c --- 25/arch/x86_64/kernel/sys_x86_64.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/sys_x86_64.c 2003-08-10 14:38:32.000000000 -0700 @@ -122,3 +122,17 @@ asmlinkage long wrap_sys_shmat(int shmid unsigned long raddr; return sys_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr; } + +asmlinkage long sys_time64(long * tloc) +{ + struct timeval now; + int i; + + do_gettimeofday(&now); + i = now.tv_sec; + if (tloc) { + if (put_user(i,tloc)) + i = -EFAULT; + } + return i; +} diff -puN arch/x86_64/kernel/x8664_ksyms.c~x86_64-merge-test3 arch/x86_64/kernel/x8664_ksyms.c --- 25/arch/x86_64/kernel/x8664_ksyms.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/kernel/x8664_ksyms.c 2003-08-10 14:38:32.000000000 -0700 @@ -121,6 +121,7 @@ EXPORT_SYMBOL_NOVERS(__read_lock_failed) EXPORT_SYMBOL(synchronize_irq); EXPORT_SYMBOL(smp_call_function); +EXPORT_SYMBOL(cpu_callout_map); #endif #ifdef CONFIG_VT diff -puN arch/x86_64/mm/k8topology.c~x86_64-merge-test3 arch/x86_64/mm/k8topology.c --- 25/arch/x86_64/mm/k8topology.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/mm/k8topology.c 2003-08-10 14:38:32.000000000 -0700 @@ -47,6 +47,7 @@ int __init k8_scan_nodes(unsigned long s struct node nodes[MAXNODE]; int nodeid, i, nb; int found = 0; + int nmax; nb = find_northbridge(); if (nb < 0) @@ -54,22 +55,28 @@ int __init k8_scan_nodes(unsigned long s printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - numnodes = (1 << ((read_pci_config(0, nb, 0, 0x60 ) >> 4) & 3)); - - printk(KERN_INFO "Assuming %d nodes\n", numnodes - 1); + nmax = (1 << ((read_pci_config(0, nb, 0, 0x60 ) >> 4) & 3)); + numnodes = nmax; memset(&nodes,0,sizeof(nodes)); prevbase = 0; - for (i = 0; i < numnodes; i++) { + for (i = 0; i < 8; i++) { unsigned long base,limit; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = 
read_pci_config(0, nb, 1, 0x44 + i*8); nodeid = limit & 3; + if ((base & 3) == 0) { + if (i < nmax) + printk("Skipping disabled node %d\n", i); + continue; + } + if (!limit) { - printk(KERN_ERR "Skipping node entry %d (base %lx)\n", i, base); - return -1; + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, + base); + continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", @@ -77,7 +84,8 @@ int __init k8_scan_nodes(unsigned long s return -1; } if ((1UL << nodeid) & nodes_present) { - printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); + printk(KERN_INFO "Node %d already present. Skipping\n", + nodeid); continue; } @@ -104,7 +112,7 @@ int __init k8_scan_nodes(unsigned long s if (limit < base) { printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", nodeid, base, limit); - return -1; + continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ @@ -135,11 +143,26 @@ int __init k8_scan_nodes(unsigned long s } printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); - for (i = 0; i < numnodes; i++) { + for (i = 0; i < MAXNODE; i++) { if (nodes[i].start != nodes[i].end) setup_node_bootmem(i, nodes[i].start, nodes[i].end); } + /* There are unfortunately some poorly designed mainboards around + that only connect memory to a single CPU. This breaks the 1:1 cpu->node + mapping. To avoid this fill in the mapping for all possible + CPUs, as the number of CPUs is not known yet. + We round robin the existing nodes. 
*/ + int rr = 0; + for (i = 0; i < MAXNODE; i++) { + if (nodes_present & (1UL<<i)) + continue; + if ((nodes_present >> rr) == 0) + rr = 0; + rr = ffz(~nodes_present >> rr); + node_data[i] = node_data[rr]; + rr++; + } + return 0; } - diff -puN arch/x86_64/mm/numa.c~x86_64-merge-test3 arch/x86_64/mm/numa.c --- 25/arch/x86_64/mm/numa.c~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/arch/x86_64/mm/numa.c 2003-08-10 14:38:32.000000000 -0700 @@ -26,8 +26,6 @@ static int numa_off __initdata; unsigned long nodes_present; -static int emunodes __initdata; - int __init compute_hash_shift(struct node *nodes) { int i; @@ -103,11 +101,8 @@ void __init setup_node_bootmem(int nodei reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); - if (nodeid + 1 > numnodes) { + if (nodeid + 1 > numnodes) numnodes = nodeid + 1; - printk(KERN_INFO - "setup_node_bootmem: enlarging numnodes to %d\n", numnodes); - } nodes_present |= (1UL << nodeid); } @@ -149,26 +144,6 @@ int __init numa_initmem_init(unsigned lo printk(KERN_INFO "%s\n", numa_off ?
"NUMA turned off" : "No NUMA configuration found"); - if (!numa_off && emunodes > 0) { - struct node nodes[MAXNODE]; - unsigned long nodesize = (end_pfn << PAGE_SHIFT) / emunodes; - int i; - if (emunodes > MAXNODE) - emunodes = MAXNODE; - memset(&nodes, 0, sizeof(nodes)); - printk(KERN_INFO "Faking %d nodes of size %ld MB\n", emunodes, nodesize>>20); - for (i = 0; i < emunodes; i++) { - unsigned long end = (i+1)*nodesize; - if (i == emunodes-1) - end = end_pfn << PAGE_SHIFT; - nodes[i].start = i * nodesize; - nodes[i].end = end; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - memnode_shift = compute_hash_shift(nodes); - return 0; - } - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); @@ -176,6 +151,7 @@ int __init numa_initmem_init(unsigned lo fake_node = 1; memnode_shift = 63; memnodemap[0] = 0; + numnodes = 1; setup_node_bootmem(0, start_pfn< than biggest syscall! */ +#define IA32_NR_syscalls 275 /* must be > than biggest syscall! */ #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ diff -puN include/asm-x86_64/io.h~x86_64-merge-test3 include/asm-x86_64/io.h --- 25/include/asm-x86_64/io.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/io.h 2003-08-10 14:38:32.000000000 -0700 @@ -301,6 +301,12 @@ out: #define flush_write_buffers() +/* Disable vmerge for now. Need to fix the block layer code + to check for non iommu addresses first. + When the IOMMU is force it is safe to enable. */ +extern int force_iommu; +#define BIO_VERMGE_BOUNDARY (force_iommu ? 
4096 : 0) + #endif /* __KERNEL__ */ #endif diff -puN include/asm-x86_64/nmi.h~x86_64-merge-test3 include/asm-x86_64/nmi.h --- 25/include/asm-x86_64/nmi.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/nmi.h 2003-08-10 14:38:32.000000000 -0700 @@ -48,6 +48,4 @@ static inline void unset_nmi_pm_callback extern void default_do_nmi(struct pt_regs *); -extern void default_do_nmi(struct pt_regs *); - #endif /* ASM_NMI_H */ diff -puN include/asm-x86_64/pci-direct.h~x86_64-merge-test3 include/asm-x86_64/pci-direct.h --- 25/include/asm-x86_64/pci-direct.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/pci-direct.h 2003-08-10 14:38:32.000000000 -0700 @@ -14,7 +14,26 @@ static inline u32 read_pci_config(u8 bus u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); - PDprintk("%x reading from %x: %x\n", slot, offset, v); + if (v != 0xffffffff) + PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); + return v; +} + +static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) +{ + u8 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inb(0xcfc + (offset&3)); + PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); + return v; +} + +static inline u8 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) +{ + u16 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inw(0xcfc + (offset&2)); + PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); return v; } diff -puN include/asm-x86_64/pci.h~x86_64-merge-test3 include/asm-x86_64/pci.h --- 25/include/asm-x86_64/pci.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/pci.h 2003-08-10 14:38:32.000000000 -0700 @@ -8,9 +8,6 @@ #include <linux/mm.h> /* for struct page */ - -extern dma_addr_t bad_dma_address; - /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used
for buggy BIOSes or architectures with incomplete PCI setup by the loader */ @@ -21,6 +18,8 @@ extern unsigned int pcibios_assign_all_b #define pcibios_assign_all_busses() 0 #endif +extern int no_iommu, force_iommu; + extern unsigned long pci_mem_start; #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM (pci_mem_start) @@ -46,6 +45,9 @@ struct pci_dev; extern int iommu_setup(char *opt); +extern dma_addr_t bad_dma_address; +#define pci_dma_error(x) ((x) == bad_dma_address) + /* Allocate and map kernel buffer using consistent mode DMA for a device. * hwdev should be valid struct pci_dev pointer for PCI devices, * NULL for PCI-like buses (ISA, EISA). @@ -119,10 +121,16 @@ static inline void pci_dma_sync_sg(struc /* The PCI address space does equal the physical memory * address space. The networking and block device layers use - * this boolean for bounce buffer decisions. + * this boolean for bounce buffer decisions + * + * On AMD64 it mostly equals, but we set it to zero to tell some subsystems + * that an IOMMU is available. */ -#define PCI_DMA_BUS_IS_PHYS (0) +#define PCI_DMA_BUS_IS_PHYS (no_iommu ? 1 : 0) +/* We lie slightly when the IOMMU is forced to get the device to + use SAC instead of DAC. */ +#define pci_dac_dma_supported(pci_dev, mask) (force_iommu ? 0 : 1) #else static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, @@ -206,6 +214,7 @@ static inline void pci_dma_sync_sg(struc #define PCI_DMA_BUS_IS_PHYS 1 +#define pci_dac_dma_supported(pci_dev, mask) 1 #endif extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, @@ -220,21 +229,7 @@ extern void pci_unmap_sg(struct pci_dev * only drive the low 24-bits during PCI bus mastering, then * you would pass 0x00ffffff as the mask to this function. */ -static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. 
- */ - if(mask < 0x00ffffff) - return 0; - - return 1; -} - -/* This is always fine. */ -#define pci_dac_dma_supported(pci_dev, mask) (1) +extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask); static __inline__ dma64_addr_t pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) diff -puN include/asm-x86_64/percpu.h~x86_64-merge-test3 include/asm-x86_64/percpu.h --- 25/include/asm-x86_64/percpu.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/percpu.h 2003-08-10 14:38:32.000000000 -0700 @@ -1,53 +1,51 @@ #ifndef _ASM_X8664_PERCPU_H_ #define _ASM_X8664_PERCPU_H_ +#include -#include +/* Same as asm-generic/percpu.h, except that we store the per cpu offset + in the PDA. Longer term the PDA and every per cpu variable + should be just put into a single section and referenced directly + from %gs */ #ifdef CONFIG_SMP -/* Same as the generic code except that we cache the per cpu offset - in the pda. This gives an 3 instruction reference for per cpu data */ - -#include #include -#define __my_cpu_offset() read_pda(data_offset) + #define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset) +#define __my_cpu_offset() read_pda(data_offset) /* Separate out the type, so (int[3], foo) works. 
*/ #define DEFINE_PER_CPU(type, name) \ - __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name /* var is in discarded region: offset to particular copy we want */ -#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset(cpu))) -#define __get_cpu_var(var) \ - (*RELOC_HIDE(&var##__per_cpu, __my_cpu_offset())) - -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) -{ - unsigned int i; - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) - memcpy(pcpudst + __per_cpu_offset(i), src, size); -} - -extern void setup_per_cpu_areas(void); +#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) +#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +/* A macro to avoid #include hell... */ +#define percpu_modcopy(pcpudst, src, size) \ +do { \ + unsigned int __i; \ + for (__i = 0; __i < NR_CPUS; __i++) \ + if (cpu_possible(__i)) \ + memcpy((pcpudst)+__per_cpu_offset(__i), \ + (src), (size)); \ +} while (0) #else /* ! 
SMP */ #define DEFINE_PER_CPU(type, name) \ - __typeof__(type) name##__per_cpu + __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu) -#define __get_cpu_var(var) var##__per_cpu +#define per_cpu(var, cpu) ((void)cpu, per_cpu__##var) +#define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ -#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name -#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu) -#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu) +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) -DECLARE_PER_CPU(struct x8664_pda, per_cpu_pda); +extern void setup_per_cpu_areas(void); -#endif +#endif /* _ASM_X8664_PERCPU_H_ */ diff -puN include/asm-x86_64/proto.h~x86_64-merge-test3 include/asm-x86_64/proto.h --- 25/include/asm-x86_64/proto.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/proto.h 2003-08-10 14:38:32.000000000 -0700 @@ -77,7 +77,7 @@ extern unsigned long end_pfn; extern unsigned long table_start, table_end; extern int exception_trace; -extern int no_iommu, force_mmu; +extern int force_iommu, no_iommu; extern int using_apic_timer; extern int disable_apic; extern unsigned cpu_khz; diff -puN include/asm-x86_64/siginfo.h~x86_64-merge-test3 include/asm-x86_64/siginfo.h --- 25/include/asm-x86_64/siginfo.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++ 25-akpm/include/asm-x86_64/siginfo.h 2003-08-10 14:38:32.000000000 -0700 @@ -1,6 +1,8 @@ #ifndef _X8664_SIGINFO_H #define _X8664_SIGINFO_H +#define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) + #include <asm-generic/siginfo.h> #endif diff -puN include/asm-x86_64/unistd.h~x86_64-merge-test3 include/asm-x86_64/unistd.h --- 25/include/asm-x86_64/unistd.h~x86_64-merge-test3 2003-08-10 14:38:32.000000000 -0700 +++
25-akpm/include/asm-x86_64/unistd.h 2003-08-10 14:38:32.000000000 -0700 @@ -461,7 +461,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremove #define __NR_tkill 200 __SYSCALL(__NR_tkill, sys_tkill) #define __NR_time 201 -__SYSCALL(__NR_time, sys_time) +__SYSCALL(__NR_time, sys_time64) #define __NR_futex 202 __SYSCALL(__NR_futex, sys_futex) #define __NR_sched_setaffinity 203 _