From: Matthew Wilcox, "Durairaj, Sundarapandian"

Here's a rewrite of Sundarapandian Durairaj's patch for accessing extended PCI configuration space. Changes of note:

 - Forward-ported to 2.6.2-rc2.
 - Renamed most occurrences of 'Express' to 'MMCONFIG', since that is what we're actually doing (and it would seem to be the same for PCI-X 2.0).
 - Separated out the mmconfig accesses into their own file rather than lumping them in with the direct access code. Inlined the bits from include/asm-i386/pci.h.
 - Request the memory region we're going to use for MMCONFIG accesses.
 - Removed the EXPERIMENTAL tag.
 - Added support in sysfs for the extended config space.
 - Use i_size in proc_bus_pci_lseek().
 - Moved cfg_size to where it will pack better in pci_dev.

ebiederm@xmission.com:

  Is it really safe to treat the base address as a u32? I know that if I were writing the BIOS and that address was tied to a 32-bit BAR, I would be extremely tempted to put those 256M of address space above 4G. Putting something like that below 4G leads to 1/2 Gig of memory missing. You can also put the memory above 4G on most Intel chipsets, but I'd rather have my memory down low where my legacy OS could get to it than have my PCI extended configuration space down low where nothing really needs it.

  The point being: I don't think it is safe to assume the BIOS always puts the extended PCI configuration space below 4G.
--- 25-akpm/arch/i386/Kconfig | 22 ++++-- 25-akpm/arch/i386/kernel/acpi/boot.c | 34 ++++++++++ 25-akpm/arch/i386/pci/Makefile | 1 25-akpm/arch/i386/pci/common.c | 9 ++ 25-akpm/arch/i386/pci/mmconfig.c | 115 +++++++++++++++++++++++++++++++++++ 25-akpm/arch/i386/pci/pci.h | 3 25-akpm/drivers/acpi/tables.c | 1 25-akpm/drivers/pci/pci-sysfs.c | 24 +++++-- 25-akpm/drivers/pci/pci.c | 2 25-akpm/drivers/pci/probe.c | 17 +++++ 25-akpm/drivers/pci/proc.c | 26 +++---- 25-akpm/include/asm-i386/fixmap.h | 3 25-akpm/include/linux/acpi.h | 12 +++ 25-akpm/include/linux/pci.h | 2 14 files changed, 246 insertions(+), 25 deletions(-) diff -puN arch/i386/Kconfig~pcix-enhanced arch/i386/Kconfig --- 25/arch/i386/Kconfig~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/arch/i386/Kconfig Wed Feb 4 13:06:37 2004 @@ -1052,12 +1052,16 @@ config PCI_GOBIOS PCI-based systems don't have any BIOS at all. Linux can also try to detect the PCI hardware directly without using the BIOS. - With this option, you can specify how Linux should detect the PCI - devices. If you choose "BIOS", the BIOS will be used, if you choose - "Direct", the BIOS won't be used, and if you choose "Any", the - kernel will try the direct access method and falls back to the BIOS - if that doesn't work. If unsure, go with the default, which is - "Any". + With this option, you can specify how Linux should detect the + PCI devices. If you choose "BIOS", the BIOS will be used, + if you choose "Direct", the BIOS won't be used, and if you + choose "MMConfig", then PCI Express MMCONFIG will be used. + If you choose "Any", the kernel will try MMCONFIG, then the + direct access method and falls back to the BIOS if that doesn't + work. If unsure, go with the default, which is "Any". 
+ +config PCI_GOMMCONFIG + bool "MMConfig" config PCI_GODIRECT bool "Direct" @@ -1077,6 +1081,12 @@ config PCI_DIRECT depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) default y +config PCI_MMCONFIG + bool + depends on PCI && (PCI_GOMMCONFIG || PCI_GOANY) + select ACPI_BOOT + default y + config PCI_USE_VECTOR bool "Vector-based interrupt indexing" depends on X86_LOCAL_APIC && X86_IO_APIC diff -puN arch/i386/kernel/acpi/boot.c~pcix-enhanced arch/i386/kernel/acpi/boot.c --- 25/arch/i386/kernel/acpi/boot.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/arch/i386/kernel/acpi/boot.c Wed Feb 4 13:06:37 2004 @@ -97,6 +97,27 @@ char *__acpi_map_table(unsigned long phy } +#ifdef CONFIG_PCI_MMCONFIG +static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_mcfg *mcfg; + + if (!phys_addr || !size) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size); + if (!mcfg) { + printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); + return -ENODEV; + } + + if (mcfg->base_address) + pci_mmcfg_base_addr = mcfg->base_address; + + return 0; +} +#endif /* CONFIG_PCI_MMCONFIG */ + #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -601,6 +622,19 @@ acpi_boot_init (void) #endif /* CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER */ +#ifdef CONFIG_PCI_MMCONFIG + result = acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + if (!result) { + printk(KERN_WARNING PREFIX "MCFG not present\n"); + return 0; + } else if (result < 0) { + printk(KERN_ERR PREFIX "Error parsing MCFG\n"); + return result; + } else if (result > 1) { + printk(KERN_WARNING PREFIX "Multiple MCFG tables exist\n"); + } +#endif /* CONFIG_PCI_MMCONFIG */ + #ifdef CONFIG_X86_LOCAL_APIC if (acpi_lapic && acpi_ioapic) { smp_found_config = 1; diff -puN arch/i386/pci/common.c~pcix-enhanced arch/i386/pci/common.c --- 25/arch/i386/pci/common.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 
25-akpm/arch/i386/pci/common.c Wed Feb 4 13:06:37 2004 @@ -20,7 +20,8 @@ extern void pcibios_sort(void); #endif -unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2; +unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | + PCI_PROBE_MMCONF; int pcibios_last_bus = -1; struct pci_bus *pci_root_bus = NULL; @@ -198,6 +199,12 @@ char * __devinit pcibios_setup(char *st return NULL; } #endif +#ifdef CONFIG_PCI_MMCONFIG + else if (!strcmp(str, "nommconf")) { + pci_probe &= ~PCI_PROBE_MMCONF; + return NULL; + } +#endif else if (!strcmp(str, "noacpi")) { acpi_noirq_set(); return NULL; diff -puN arch/i386/pci/Makefile~pcix-enhanced arch/i386/pci/Makefile --- 25/arch/i386/pci/Makefile~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/arch/i386/pci/Makefile Wed Feb 4 13:06:37 2004 @@ -1,6 +1,7 @@ obj-y := i386.o obj-$(CONFIG_PCI_BIOS) += pcbios.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o obj-$(CONFIG_PCI_DIRECT) += direct.o pci-y := fixup.o diff -puN /dev/null arch/i386/pci/mmconfig.c --- /dev/null Thu Apr 11 07:25:15 2002 +++ 25-akpm/arch/i386/pci/mmconfig.c Wed Feb 4 13:06:37 2004 @@ -0,0 +1,115 @@ +/* + * mmconfig.c - Low-level direct PCI config space access via MMCONFIG + */ + +#include +#include +#include "pci.h" + +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. 
*/ +u32 pci_mmcfg_base_addr; + +#define mmcfg_virt_addr (fix_to_virt(FIX_PCIE_MCFG)) + +/* The base address of the last MMCONFIG device accessed */ +static u32 mmcfg_last_accessed_device; + +/* + * Functions for accessing PCI configuration space with MMCONFIG accesses + */ + +static inline void pci_exp_set_dev_base(int bus, int devfn) +{ + u32 dev_base = pci_mmcfg_base_addr | (bus << 20) | (devfn << 12); + if (dev_base != mmcfg_last_accessed_device) { + mmcfg_last_accessed_device = dev_base; + set_fixmap(FIX_PCIE_MCFG, dev_base); + } +} + +static int pci_mmcfg_read(int seg, int bus, int devfn, int reg, int len, u32 *value) +{ + unsigned long flags; + + if (!value || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + pci_exp_set_dev_base(bus, devfn); + + switch (len) { + case 1: + *value = readb(mmcfg_virt_addr + reg); + break; + case 2: + *value = readw(mmcfg_virt_addr + reg); + break; + case 4: + *value = readl(mmcfg_virt_addr + reg); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static int pci_mmcfg_write(int seg, int bus, int devfn, int reg, int len, u32 value) +{ + unsigned long flags; + + if ((bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + pci_exp_set_dev_base(bus, devfn); + + switch (len) { + case 1: + writeb(value, mmcfg_virt_addr + reg); + break; + case 2: + writew(value, mmcfg_virt_addr + reg); + break; + case 4: + writel(value, mmcfg_virt_addr + reg); + break; + } + + /* Dummy read to flush PCI write */ + readl(mmcfg_virt_addr); + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static struct pci_raw_ops pci_mmcfg = { + .read = pci_mmcfg_read, + .write = pci_mmcfg_write, +}; + +static int __init pci_mmcfg_init(void) +{ + struct resource *region; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + goto out; + if (!pci_mmcfg_base_addr) + goto out; + region = 
request_mem_region(pci_mmcfg_base_addr, 256 * 1024 * 1024, + "PCI MMCONFIG"); + if (!region) + goto out; + + printk(KERN_INFO "PCI: Using MMCONFIG\n"); + raw_pci_ops = &pci_mmcfg; + pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + + out: + return 0; +} + +arch_initcall(pci_mmcfg_init); diff -puN arch/i386/pci/pci.h~pcix-enhanced arch/i386/pci/pci.h --- 25/arch/i386/pci/pci.h~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/arch/i386/pci/pci.h Wed Feb 4 13:06:37 2004 @@ -15,6 +15,9 @@ #define PCI_PROBE_BIOS 0x0001 #define PCI_PROBE_CONF1 0x0002 #define PCI_PROBE_CONF2 0x0004 +#define PCI_PROBE_MMCONF 0x0008 +#define PCI_PROBE_MASK 0x000f + #define PCI_NO_SORT 0x0100 #define PCI_BIOS_SORT 0x0200 #define PCI_NO_CHECKS 0x0400 diff -puN drivers/acpi/tables.c~pcix-enhanced drivers/acpi/tables.c --- 25/drivers/acpi/tables.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/drivers/acpi/tables.c Wed Feb 4 13:06:37 2004 @@ -58,6 +58,7 @@ static char *acpi_table_signatures[ACPI_ [ACPI_SSDT] = "SSDT", [ACPI_SPMI] = "SPMI", [ACPI_HPET] = "HPET", + [ACPI_MCFG] = "MCFG", }; static char *mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" }; diff -puN drivers/pci/pci.c~pcix-enhanced drivers/pci/pci.c --- 25/drivers/pci/pci.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/drivers/pci/pci.c Wed Feb 4 13:06:37 2004 @@ -90,6 +90,8 @@ pci_max_busnr(void) * %PCI_CAP_ID_CHSWP CompactPCI HotSwap * * %PCI_CAP_ID_PCIX PCI-X + * + * %PCI_CAP_ID_EXP PCI Express */ int pci_find_capability(struct pci_dev *dev, int cap) diff -puN drivers/pci/pci-sysfs.c~pcix-enhanced drivers/pci/pci-sysfs.c --- 25/drivers/pci/pci-sysfs.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/drivers/pci/pci-sysfs.c Wed Feb 4 13:06:37 2004 @@ -71,7 +71,7 @@ pci_read_config(struct kobject *kobj, ch /* Several chips lock up trying to read undefined config space */ if (capable(CAP_SYS_ADMIN)) { - size = 256; + size = dev->cfg_size; } else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { size = 
128; } @@ -123,10 +123,10 @@ pci_write_config(struct kobject *kobj, c unsigned int size = count; loff_t init_off = off; - if (off > 256) + if (off > dev->cfg_size) return 0; - if (off + count > 256) { - size = 256 - off; + if (off + count > dev->cfg_size) { + size = dev->cfg_size - off; count = size; } @@ -167,6 +167,16 @@ static struct bin_attribute pci_config_a .write = pci_write_config, }; +static struct bin_attribute pcie_config_attr = { + .attr = { + .name = "config", + .mode = S_IRUGO | S_IWUSR, + }, + .size = 4096, + .read = pci_read_config, + .write = pci_write_config, +}; + void pci_create_sysfs_dev_files (struct pci_dev *pdev) { struct device *dev = &pdev->dev; @@ -179,5 +189,9 @@ void pci_create_sysfs_dev_files (struct device_create_file (dev, &dev_attr_class); device_create_file (dev, &dev_attr_irq); device_create_file (dev, &dev_attr_resource); - sysfs_create_bin_file(&dev->kobj, &pci_config_attr); + if (pdev->cfg_size < 4096) { + sysfs_create_bin_file(&dev->kobj, &pci_config_attr); + } else { + sysfs_create_bin_file(&dev->kobj, &pcie_config_attr); + } } diff -puN drivers/pci/probe.c~pcix-enhanced drivers/pci/probe.c --- 25/drivers/pci/probe.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/drivers/pci/probe.c Wed Feb 4 13:06:37 2004 @@ -18,6 +18,8 @@ #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ #define CARDBUS_RESERVE_BUSNR 3 +#define PCI_CFG_SPACE_SIZE 256 +#define PCI_CFG_SPACE_EXP_SIZE 4096 /* Ugh. Need to stop exporting this to modules. 
*/ LIST_HEAD(pci_root_buses); @@ -526,6 +528,20 @@ static void pci_release_dev(struct devic kfree(pci_dev); } +/** + * pci_cfg_space_size - get the configuration space size of the PCI device + */ +static int pci_cfg_space_size(struct pci_dev *dev) +{ +#ifdef CONFIG_PCI_MMCONFIG + /* Find whether the device is PCI Express */ + int is_pci_express_dev = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (is_pci_express_dev) + return PCI_CFG_SPACE_EXP_SIZE; +#endif + return PCI_CFG_SPACE_SIZE; +} + /* * Read the config data for a PCI device, sanity-check it * and fill in the dev structure... @@ -562,6 +578,7 @@ pci_scan_device(struct pci_bus *bus, int dev->multifunction = !!(hdr_type & 0x80); dev->vendor = l & 0xffff; dev->device = (l >> 16) & 0xffff; + dev->cfg_size = pci_cfg_space_size(dev); /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) set this higher, assuming the system even supports it. */ diff -puN drivers/pci/proc.c~pcix-enhanced drivers/pci/proc.c --- 25/drivers/pci/proc.c~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/drivers/pci/proc.c Wed Feb 4 13:06:37 2004 @@ -16,16 +16,15 @@ #include #include -#define PCI_CFG_SPACE_SIZE 256 - static int proc_initialized; /* = 0 */ static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence) { loff_t new = -1; + struct inode *inode = file->f_dentry->d_inode; - down(&file->f_dentry->d_inode->i_sem); + down(&inode->i_sem); switch (whence) { case 0: new = off; @@ -34,14 +33,14 @@ proc_bus_pci_lseek(struct file *file, lo new = file->f_pos + off; break; case 2: - new = PCI_CFG_SPACE_SIZE + off; + new = inode->i_size + off; break; } - if (new < 0 || new > PCI_CFG_SPACE_SIZE) + if (new < 0 || new > inode->i_size) new = -EINVAL; else file->f_pos = new; - up(&file->f_dentry->d_inode->i_sem); + up(&inode->i_sem); return new; } @@ -61,7 +60,7 @@ proc_bus_pci_read(struct file *file, cha */ if (capable(CAP_SYS_ADMIN)) - size = PCI_CFG_SPACE_SIZE; + size = dev->cfg_size; else if (dev->hdr_type == 
PCI_HEADER_TYPE_CARDBUS) size = 128; else @@ -134,14 +133,15 @@ proc_bus_pci_write(struct file *file, co const struct proc_dir_entry *dp = PDE(ino); struct pci_dev *dev = dp->data; int pos = *ppos; + int size = dev->cfg_size; int cnt; - if (pos >= PCI_CFG_SPACE_SIZE) + if (pos >= size) return 0; - if (nbytes >= PCI_CFG_SPACE_SIZE) - nbytes = PCI_CFG_SPACE_SIZE; - if (pos + nbytes > PCI_CFG_SPACE_SIZE) - nbytes = PCI_CFG_SPACE_SIZE - pos; + if (nbytes >= size) + nbytes = size; + if (pos + nbytes > size) + nbytes = size - pos; cnt = nbytes; if (!access_ok(VERIFY_READ, buf, cnt)) @@ -403,7 +403,7 @@ int pci_proc_attach_device(struct pci_de return -ENOMEM; e->proc_fops = &proc_bus_pci_operations; e->data = dev; - e->size = PCI_CFG_SPACE_SIZE; + e->size = dev->cfg_size; return 0; } diff -puN include/asm-i386/fixmap.h~pcix-enhanced include/asm-i386/fixmap.h --- 25/include/asm-i386/fixmap.h~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/include/asm-i386/fixmap.h Wed Feb 4 13:06:37 2004 @@ -71,6 +71,9 @@ enum fixed_addresses { FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, #endif +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif __end_of_permanent_fixed_addresses, /* temporary boot-time mappings, used before ioremap() is functional */ #define NR_FIX_BTMAPS 16 diff -puN include/linux/acpi.h~pcix-enhanced include/linux/acpi.h --- 25/include/linux/acpi.h~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/include/linux/acpi.h Wed Feb 4 13:06:37 2004 @@ -317,6 +317,15 @@ struct acpi_table_ecdt { char ec_id[0]; } __attribute__ ((packed)); +/* PCI MMCONFIG */ + +struct acpi_table_mcfg { + struct acpi_table_header header; + u8 reserved[8]; + u32 base_address; + u32 base_reserved; +} __attribute__ ((packed)); + /* Table Handlers */ enum acpi_table_id { @@ -338,6 +347,7 @@ enum acpi_table_id { ACPI_SSDT, ACPI_SPMI, ACPI_HPET, + ACPI_MCFG, ACPI_TABLE_COUNT }; @@ -369,6 +379,8 @@ void acpi_numa_arch_fixup(void); extern int acpi_mp_config; +extern u32 
pci_mmcfg_base_addr; + #else /*!CONFIG_ACPI_BOOT*/ #define acpi_mp_config 0 diff -puN include/linux/pci.h~pcix-enhanced include/linux/pci.h --- 25/include/linux/pci.h~pcix-enhanced Wed Feb 4 13:06:37 2004 +++ 25-akpm/include/linux/pci.h Wed Feb 4 13:06:37 2004 @@ -409,6 +409,8 @@ struct pci_dev { unsigned short vendor_compatible[DEVICE_COUNT_COMPATIBLE]; unsigned short device_compatible[DEVICE_COUNT_COMPATIBLE]; + int cfg_size; /* Size of configuration space */ + /* * Instead of touching interrupt line and base address registers * directly, use the values stored here. They might be different! _