diff options
author | Greg Kroah-Hartman <gregkh@suse.de> | 2005-11-18 09:21:30 -0800 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@suse.de> | 2005-11-18 09:21:30 -0800 |
commit | c86829ed35162a012126e1866317d7d726a73b21 (patch) | |
tree | af937635ff9a141fbd3144306d30d18485d946a1 /pci | |
parent | 83af9d4ec9d9dd35d6f24f3f50db513a50a5c83c (diff) | |
download | patches-c86829ed35162a012126e1866317d7d726a73b21.tar.gz |
2.6.15-rc1-git6 refresh (dropped a lot of usb patches)
Diffstat (limited to 'pci')
-rw-r--r-- | pci/pci-documentation-for-pci-error-recovery.patch | 273 | ||||
-rw-r--r-- | pci/pci-pci-error-reporting-callbacks.patch | 105 |
2 files changed, 0 insertions, 378 deletions
diff --git a/pci/pci-documentation-for-pci-error-recovery.patch b/pci/pci-documentation-for-pci-error-recovery.patch deleted file mode 100644 index f49068a676846..0000000000000 --- a/pci/pci-documentation-for-pci-error-recovery.patch +++ /dev/null @@ -1,273 +0,0 @@ -From owner-linux-pci@atrey.karlin.mff.cuni.cz Thu Nov 3 16:52:36 2005 -Date: Thu, 3 Nov 2005 18:50:26 -0600 -From: Linas Vepstas <linas@linas.org> -To: <paulus@samba.org> -Cc: <johnrose@austin.ibm.com> -Subject: [PATCH 15/42]: PCI: Documentation for PCI Error Recovery -Message-ID: <20051104005026.GA26919@mail.gnucash.org> -Content-Disposition: inline - -PCI Error Recovery: documentation patch - -Various PCI bus errors can be signaled by newer PCI controllers. -Recovering from those errors requires an infrastructure to notify -affected device drivers of the error, and a way of walking through -a reset sequence. This patch adds documentation describing the -current error recovery proposal. - -Signed-off-by: Linas Vepstas <linas@austin.ibm.com> -Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> - ---- - Documentation/pci-error-recovery.txt | 246 +++++++++++++++++++++++++++++++++++ - 1 file changed, 246 insertions(+) - ---- /dev/null -+++ gregkh-2.6/Documentation/pci-error-recovery.txt -@@ -0,0 +1,246 @@ -+ -+ PCI Error Recovery -+ ------------------ -+ May 31, 2005 -+ -+ Current document maintainer: -+ Linas Vepstas <linas@austin.ibm.com> -+ -+ -+Some PCI bus controllers are able to detect certain "hard" PCI errors -+on the bus, such as parity errors on the data and address busses, as -+well as SERR and PERR errors. These chipsets are then able to disable -+I/O to/from the affected device, so that, for example, a bad DMA -+address doesn't end up corrupting system memory. These same chipsets -+are also able to reset the affected PCI device, and return it to -+working condition. This document describes a generic API form -+performing error recovery. -+ -+The core idea is that after a PCI error has been detected, there must -+be a way for the kernel to coordinate with all affected device drivers -+so that the pci card can be made operational again, possibly after -+performing a full electrical #RST of the PCI card. The API below -+provides a generic API for device drivers to be notified of PCI -+errors, and to be notified of, and respond to, a reset sequence. -+ -+Preliminary sketch of API, cut-n-pasted-n-modified email from -+Ben Herrenschmidt, circa 5 april 2005 -+ -+The error recovery API support is exposed to the driver in the form of -+a structure of function pointers pointed to by a new field in struct -+pci_driver. The absence of this pointer in pci_driver denotes an -+"non-aware" driver, behaviour on these is platform dependant. -+Platforms like ppc64 can try to simulate pci hotplug remove/add. -+ -+The definition of "pci_error_token" is not covered here. It is based on -+Seto's work on the synchronous error detection. We still need to define -+functions for extracting infos out of an opaque error token. This is -+separate from this API. -+ -+This structure has the form: -+ -+struct pci_error_handlers -+{ -+ int (*error_detected)(struct pci_dev *dev, pci_error_token error); -+ int (*mmio_enabled)(struct pci_dev *dev); -+ int (*resume)(struct pci_dev *dev); -+ int (*link_reset)(struct pci_dev *dev); -+ int (*slot_reset)(struct pci_dev *dev); -+}; -+ -+A driver doesn't have to implement all of these callbacks. The -+only mandatory one is error_detected(). If a callback is not -+implemented, the corresponding feature is considered unsupported. -+For example, if mmio_enabled() and resume() aren't there, then the -+driver is assumed as not doing any direct recovery and requires -+a reset. If link_reset() is not implemented, the card is assumed as -+not caring about link resets, in which case, if recover is supported, -+the core can try recover (but not slot_reset() unless it really did -+reset the slot). If slot_reset() is not supported, link_reset() can -+be called instead on a slot reset. -+ -+At first, the call will always be : -+ -+ 1) error_detected() -+ -+ Error detected. This is sent once after an error has been detected. At -+this point, the device might not be accessible anymore depending on the -+platform (the slot will be isolated on ppc64). The driver may already -+have "noticed" the error because of a failing IO, but this is the proper -+"synchronisation point", that is, it gives a chance to the driver to -+cleanup, waiting for pending stuff (timers, whatever, etc...) to -+complete; it can take semaphores, schedule, etc... everything but touch -+the device. Within this function and after it returns, the driver -+shouldn't do any new IOs. Called in task context. This is sort of a -+"quiesce" point. See note about interrupts at the end of this doc. -+ -+ Result codes: -+ - PCIERR_RESULT_CAN_RECOVER: -+ Driever returns this if it thinks it might be able to recover -+ the HW by just banging IOs or if it wants to be given -+ a chance to extract some diagnostic informations (see -+ below). -+ - PCIERR_RESULT_NEED_RESET: -+ Driver returns this if it thinks it can't recover unless the -+ slot is reset. -+ - PCIERR_RESULT_DISCONNECT: -+ Return this if driver thinks it won't recover at all, -+ (this will detach the driver ? or just leave it -+ dangling ? to be decided) -+ -+So at this point, we have called error_detected() for all drivers -+on the segment that had the error. On ppc64, the slot is isolated. What -+happens now typically depends on the result from the drivers. If all -+drivers on the segment/slot return PCIERR_RESULT_CAN_RECOVER, we would -+re-enable IOs on the slot (or do nothing special if the platform doesn't -+isolate slots) and call 2). If not and we can reset slots, we go to 4), -+if neither, we have a dead slot. If it's an hotplug slot, we might -+"simulate" reset by triggering HW unplug/replug though. -+ -+>>> Current ppc64 implementation assumes that a device driver will -+>>> *not* schedule or semaphore in this routine; the current ppc64 -+>>> implementation uses one kernel thread to notify all devices; -+>>> thus, of one device sleeps/schedules, all devices are affected. -+>>> Doing better requires complex multi-threaded logic in the error -+>>> recovery implementation (e.g. waiting for all notification threads -+>>> to "join" before proceeding with recovery.) This seems excessively -+>>> complex and not worth implementing. -+ -+>>> The current ppc64 implementation doesn't much care if the device -+>>> attempts i/o at this point, or not. I/O's will fail, returning -+>>> a value of 0xff on read, and writes will be dropped. If the device -+>>> driver attempts more than 10K I/O's to a frozen adapter, it will -+>>> assume that the device driver has gone into an infinite loop, and -+>>> it will panic the the kernel. -+ -+ 2) mmio_enabled() -+ -+ This is the "early recovery" call. IOs are allowed again, but DMA is -+not (hrm... to be discussed, I prefer not), with some restrictions. This -+is NOT a callback for the driver to start operations again, only to -+peek/poke at the device, extract diagnostic information, if any, and -+eventually do things like trigger a device local reset or some such, -+but not restart operations. This is sent if all drivers on a segment -+agree that they can try to recover and no automatic link reset was -+performed by the HW. If the platform can't just re-enable IOs without -+a slot reset or a link reset, it doesn't call this callback and goes -+directly to 3) or 4). All IOs should be done _synchronously_ from -+within this callback, errors triggered by them will be returned via -+the normal pci_check_whatever() api, no new error_detected() callback -+will be issued due to an error happening here. However, such an error -+might cause IOs to be re-blocked for the whole segment, and thus -+invalidate the recovery that other devices on the same segment might -+have done, forcing the whole segment into one of the next states, -+that is link reset or slot reset. -+ -+ Result codes: -+ - PCIERR_RESULT_RECOVERED -+ Driver returns this if it thinks the device is fully -+ functionnal and thinks it is ready to start -+ normal driver operations again. There is no -+ guarantee that the driver will actually be -+ allowed to proceed, as another driver on the -+ same segment might have failed and thus triggered a -+ slot reset on platforms that support it. -+ -+ - PCIERR_RESULT_NEED_RESET -+ Driver returns this if it thinks the device is not -+ recoverable in it's current state and it needs a slot -+ reset to proceed. -+ -+ - PCIERR_RESULT_DISCONNECT -+ Same as above. Total failure, no recovery even after -+ reset driver dead. (To be defined more precisely) -+ -+>>> The current ppc64 implementation does not implement this callback. -+ -+ 3) link_reset() -+ -+ This is called after the link has been reset. This is typically -+a PCI Express specific state at this point and is done whenever a -+non-fatal error has been detected that can be "solved" by resetting -+the link. This call informs the driver of the reset and the driver -+should check if the device appears to be in working condition. -+This function acts a bit like 2) mmio_enabled(), in that the driver -+is not supposed to restart normal driver I/O operations right away. -+Instead, it should just "probe" the device to check it's recoverability -+status. If all is right, then the core will call resume() once all -+drivers have ack'd link_reset(). -+ -+ Result codes: -+ (identical to mmio_enabled) -+ -+>>> The current ppc64 implementation does not implement this callback. -+ -+ 4) slot_reset() -+ -+ This is called after the slot has been soft or hard reset by the -+platform. A soft reset consists of asserting the adapter #RST line -+and then restoring the PCI BARs and PCI configuration header. If the -+platform supports PCI hotplug, then it might instead perform a hard -+reset by toggling power on the slot off/on. This call gives drivers -+the chance to re-initialize the hardware (re-download firmware, etc.), -+but drivers shouldn't restart normal I/O processing operations at -+this point. (See note about interrupts; interrupts aren't guaranteed -+to be delivered until the resume() callback has been called). If all -+device drivers report success on this callback, the patform will call -+resume() to complete the error handling and let the driver restart -+normal I/O processing. -+ -+A driver can still return a critical failure for this function if -+it can't get the device operational after reset. If the platform -+previously tried a soft reset, it migh now try a hard reset (power -+cycle) and then call slot_reset() again. It the device still can't -+be recovered, there is nothing more that can be done; the platform -+will typically report a "permanent failure" in such a case. The -+device will be considered "dead" in this case. -+ -+ Result codes: -+ - PCIERR_RESULT_DISCONNECT -+ Same as above. -+ -+>>> The current ppc64 implementation does not try a power-cycle reset -+>>> if the driver returned PCIERR_RESULT_DISCONNECT. However, it should. -+ -+ 5) resume() -+ -+ This is called if all drivers on the segment have returned -+PCIERR_RESULT_RECOVERED from one of the 3 prevous callbacks. -+That basically tells the driver to restart activity, tht everything -+is back and running. No result code is taken into account here. If -+a new error happens, it will restart a new error handling process. -+ -+That's it. I think this covers all the possibilities. The way those -+callbacks are called is platform policy. A platform with no slot reset -+capability for example may want to just "ignore" drivers that can't -+recover (disconnect them) and try to let other cards on the same segment -+recover. Keep in mind that in most real life cases, though, there will -+be only one driver per segment. -+ -+Now, there is a note about interrupts. If you get an interrupt and your -+device is dead or has been isolated, there is a problem :) -+ -+After much thinking, I decided to leave that to the platform. That is, -+the recovery API only precies that: -+ -+ - There is no guarantee that interrupt delivery can proceed from any -+device on the segment starting from the error detection and until the -+restart callback is sent, at which point interrupts are expected to be -+fully operational. -+ -+ - There is no guarantee that interrupt delivery is stopped, that is, ad -+river that gets an interrupts after detecting an error, or that detects -+and error within the interrupt handler such that it prevents proper -+ack'ing of the interrupt (and thus removal of the source) should just -+return IRQ_NOTHANDLED. It's up to the platform to deal with taht -+condition, typically by masking the irq source during the duration of -+the error handling. It is expected that the platform "knows" which -+interrupts are routed to error-management capable slots and can deal -+with temporarily disabling that irq number during error processing (this -+isn't terribly complex). That means some IRQ latency for other devices -+sharing the interrupt, but there is simply no other way. High end -+platforms aren't supposed to share interrupts between many devices -+anyway :) -+ -+ -+Revised: 31 May 2005 Linas Vepstas <linas@austin.ibm.com> diff --git a/pci/pci-pci-error-reporting-callbacks.patch b/pci/pci-pci-error-reporting-callbacks.patch deleted file mode 100644 index cab7bef2e1cae..0000000000000 --- a/pci/pci-pci-error-reporting-callbacks.patch +++ /dev/null @@ -1,105 +0,0 @@ -From owner-linux-pci@atrey.karlin.mff.cuni.cz Thu Nov 3 16:52:41 2005 -Date: Thu, 3 Nov 2005 18:50:35 -0600 -From: Linas Vepstas <linas@linas.org> -To: <paulus@samba.org> -Cc: <johnrose@austin.ibm.com -Subject: [PATCH 16/42]: PCI: PCI Error reporting callbacks -Message-ID: <20051104005035.GA26929@mail.gnucash.org> -Content-Disposition: inline - -PCI Error Recovery: header file patch - -Various PCI bus errors can be signaled by newer PCI controllers. -Recovering from those errors requires an infrastructure to notify -affected device drivers of the error, and a way of walking through a -reset sequence. This patch adds a set of callbacks to be used by error -recovery routines to notify device drivers of the various stages of -recovery. - -Signed-off-by: Linas Vepstas <linas@austin.ibm.com> -Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> - --- - include/linux/pci.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 49 insertions(+) - ---- ---- gregkh-2.6.orig/include/linux/pci.h -+++ gregkh-2.6/include/linux/pci.h -@@ -78,6 +78,16 @@ typedef int __bitwise pci_power_t; - #define PCI_UNKNOWN ((pci_power_t __force) 5) - #define PCI_POWER_ERROR ((pci_power_t __force) -1) - -+/** The pci_channel state describes connectivity between the CPU and -+ * the pci device. If some PCI bus between here and the pci device -+ * has crashed or locked up, this info is reflected here. -+ */ -+enum pci_channel_state { -+ pci_channel_io_normal = 0, /* I/O channel is in normal state */ -+ pci_channel_io_frozen = 1, /* I/O to channel is blocked */ -+ pci_channel_io_perm_failure, /* PCI card is dead */ -+}; -+ - /* - * The pci_dev structure is used to describe PCI devices. - */ -@@ -111,6 +121,7 @@ struct pci_dev { - this is D0-D3, D0 being fully functional, - and D3 being off. */ - -+ enum pci_channel_state error_state; /* current connectivity state */ - struct device dev; /* Generic device interface */ - - /* device is compatible with these IDs */ -@@ -233,6 +244,43 @@ struct pci_dynids { - unsigned int use_driver_data:1; /* pci_driver->driver_data is used */ - }; - -+/* ---------------------------------------------------------------- */ -+/** PCI error recovery infrastructure. If a PCI device driver provides -+ * a set fof callbacks in struct pci_error_handlers, then that device driver -+ * will be notified of PCI bus errors, and will be driven to recovery -+ * when an error occurs. -+ */ -+ -+enum pcierr_result { -+ PCIERR_RESULT_NONE = 0, /* no result/none/not supported in device driver */ -+ PCIERR_RESULT_CAN_RECOVER=1, /* Device driver can recover without slot reset */ -+ PCIERR_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */ -+ PCIERR_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */ -+ PCIERR_RESULT_RECOVERED, /* Device driver is fully recovered and operational */ -+}; -+ -+/* PCI bus error event callbacks */ -+struct pci_error_handlers -+{ -+ /* PCI bus error detected on this device */ -+ int (*error_detected)(struct pci_dev *dev, -+ enum pci_channel_state error); -+ -+ /* MMIO has been re-enabled, but not DMA */ -+ int (*mmio_enabled)(struct pci_dev *dev); -+ -+ /* PCI Express link has been reset */ -+ int (*link_reset)(struct pci_dev *dev); -+ -+ /* PCI slot has been reset */ -+ int (*slot_reset)(struct pci_dev *dev); -+ -+ /* Device driver may resume normal operations */ -+ void (*resume)(struct pci_dev *dev); -+}; -+ -+/* ---------------------------------------------------------------- */ -+ - struct module; - struct pci_driver { - struct list_head node; -@@ -245,6 +293,7 @@ struct pci_driver { - int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */ - void (*shutdown) (struct pci_dev *dev); - -+ struct pci_error_handlers *err_handler; - struct device_driver driver; - struct pci_dynids dynids; - }; |