aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGavin Shan <gwshan@linux.vnet.ibm.com>2014-05-01 19:49:47 +1000
committerEli Qiao <taget@linux.vnet.ibm.com>2014-05-04 10:55:21 +0800
commit544b616f43a5569e9dc3012b357c2fcd0e74f421 (patch)
tree6a437f98fd5806faac8ca7eac6808782ab67ca31
parentf28eb4181a14995cd719e30552af2e16eed03972 (diff)
downloadpowerkvm-544b616f43a5569e9dc3012b357c2fcd0e74f421.tar.gz
powerpc/eeh: Report frozen parent PE prior to child PE
When we have the corner case of frozen parent and child PE at the same time, we have to handle the frozen parent PE prior to the child. Without clearning the frozen state on parent PE, the child PE can't be recovered successfully. There're 2 ways (polling and interrupt) to have frozen PE to be reported. If we have frozen parent PE out there, we have to report and handle that firstly. It's responsing to bug#109562. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
-rw-r--r--arch/powerpc/kernel/eeh.c27
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c30
2 files changed, 52 insertions, 5 deletions
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index cc85c467151e2b..05e5fc6f5cd398 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -354,10 +354,11 @@ out:
int eeh_dev_check_failure(struct eeh_dev *edev)
{
int ret;
+ int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
unsigned long flags;
struct device_node *dn;
struct pci_dev *dev;
- struct eeh_pe *pe;
+ struct eeh_pe *pe, *parent_pe;
int rc = 0;
const char *location;
@@ -435,14 +436,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
*/
if ((ret < 0) ||
(ret == EEH_STATE_NOT_SUPPORT) ||
- (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
- (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+ ((ret & active_flags) == active_flags)) {
eeh_stats.false_positives++;
pe->false_positives++;
rc = 0;
goto dn_unlock;
}
+ /*
+ * It should be corner case that the parent PE has been
+ * put into frozen state as well. We should take care
+ * that at first.
+ */
+ parent_pe = pe->parent;
+ while (parent_pe) {
+ /* Hit the ceiling ? */
+ if (parent_pe->type & EEH_PE_PHB)
+ break;
+
+ /* Frozen parent PE ? */
+ ret = eeh_ops->get_state(parent_pe, NULL);
+ if (ret > 0 &&
+ (ret & active_flags) != active_flags)
+ pe = parent_pe;
+
+ /* Next parent level */
+ parent_pe = parent_pe->parent;
+ }
+
eeh_stats.slot_resets++;
/* Avoid repeated reports of this failure, including problems
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 7eef37838fdccc..a93c73b6d9454c 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -720,11 +720,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
{
struct pci_controller *hose;
struct pnv_phb *phb;
- struct eeh_pe *phb_pe;
+ struct eeh_pe *phb_pe, *parent_pe;
u64 frozen_pe_no;
+ int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
u16 err_type, severity;
long rc;
- int ret = EEH_NEXT_ERR_NONE;
+ int state, ret = EEH_NEXT_ERR_NONE;
/*
* While running here, it's safe to purge the event queue.
@@ -852,6 +853,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
}
/*
+ * We probably have the frozen parent PE out there and
+ * we need have to handle frozen parent PE firstly.
+ */
+ if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+ parent_pe = (*pe)->parent;
+ while (parent_pe) {
+ /* Hit the ceiling ? */
+ if (parent_pe->type & EEH_PE_PHB)
+ break;
+
+ /* Frozen parent PE ? */
+ state = ioda_eeh_get_state(parent_pe);
+ if (state > 0 &&
+ (state & active_flags) != active_flags)
+ *pe = parent_pe;
+
+ /* Next parent level */
+ parent_pe = parent_pe->parent;
+ }
+
+ /* We possibly migrate to another PE */
+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+ }
+
+ /*
* If we have no errors on the specific PHB or only
* informative error there, we continue poking it.
* Otherwise, we need actions to be taken by upper