Add initial direct EEH handling to opvxa24xx driver

This is required to work around what is presumably bad card-side firmware
that sporadically (every few hours, days, or weeks) performs bad DMA writes to the host (!)

Example PHB fence event:

[53811.103878] EEH: Recovering PHB#1-PE#fc
[53811.103904] EEH: PE location: UOPWR.D100022-Node0-SLOT1 PCIE 4.0 X16, PHB location: N/A
[53811.103917] EEH: Frozen PHB#1-PE#fc detected
[53811.103942] EEH: Call Trace:
[53811.103968] EEH: [000000009c27f02d] __eeh_send_failure_event+0x78/0x150
[53811.103997] EEH: [000000009006781a] eeh_dev_check_failure+0x380/0x670
[53811.104030] EEH: [00000000ebe9b5df] __opvx_a24xx_getcreg+0x7c/0xb0 [opvxa24xx]
[53811.104073] EEH: [000000005a792683] interrupt_onecard_handler+0x80/0xdc0 [opvxa24xx]
[53811.104115] EEH: [00000000903815aa] a24xx_interrupt+0xe8/0x120 [opvxa24xx]
[53811.104159] EEH: [0000000061324081] __handle_irq_event_percpu+0x11c/0x4b0
[53811.104203] EEH: [000000008d7d81ec] handle_irq_event_percpu+0x38/0x90
[53811.104244] EEH: [0000000078120288] handle_irq_event+0x60/0xa0
[53811.104286] EEH: [000000008b3623f1] handle_fasteoi_irq+0xd4/0x270
[53811.104318] EEH: [0000000042e6cb54] generic_handle_irq+0x54/0x80
[53811.104366] EEH: [0000000071e8585e] __do_irq+0xa0/0x380
[53811.104399] EEH: [00000000b76ddee5] __do_IRQ+0x9c/0x130
[53811.104440] EEH: [00000000aa5e1b3f] 0xc000000004373b10
[53811.104474] EEH: [000000006d701fc8] do_IRQ+0xfc/0x1d0
[53811.104524] EEH: [0000000070ff629f] replay_soft_interrupts+0x1c4/0x320
[53811.104555] EEH: [0000000095e31904] arch_local_irq_restore+0x1e4/0x250
[53811.104597] EEH: [0000000047c9e272] cpuidle_enter_state+0x158/0x720
[53811.104639] EEH: [00000000904e539f] cpuidle_enter+0x50/0x70
[53811.104680] EEH: [00000000b5de9adb] call_cpuidle+0x4c/0x90
[53811.104721] EEH: [000000008d8e2164] do_idle+0x320/0x3a0
[53811.104761] EEH: [00000000d680deba] cpu_startup_entry+0x3c/0x50
[53811.104784] EEH: [000000006146f980] start_secondary+0x27c/0x2d0
[53811.104820] EEH: [00000000cb4eb9c4] start_secondary_prolog+0x10/0x14
[53811.104849] EEH: This PCI device has failed 2 times in the last hour and will be permanently disabled after 5 failures.
[53811.104902] EEH: Notify device drivers to shutdown
[53811.104918] EEH: Beginning: 'error_detected(IO frozen)'
[53811.104931] PCI 0001:02:00.0#00fc: EEH: driver not EEH aware
[53811.104935] EEH: Finished:'error_detected(IO frozen)' with aggregate recovery state:'none'
[53811.104958] EEH: Collect temporary log
[53811.104988] EEH: of node=0001:02:00.0
[53811.104999] EEH: PCI device/vendor: 16101b74
[53811.105011] EEH: PCI cmd/status register: 04080146
[53811.105020] PHB4 PHB#1 Diag-data (Version: 1)
[53811.105030] brdgCtl:    00000002
[53811.105038] RootSts:    00060000 00402000 20110008 00100107 00000800
[53811.105049] PhbSts:     0000001c00000000 0000001c00000000
[53811.105059] Lem:        0000000100000080 0000000000000000 0000000000000080
[53811.105070] PhbErr:     0000028000000000 0000020000000000 2148000098000240 a008400000000000
[53811.105082] RxeTceErr:  6000000000000000 2000000000000000 c0000000000000fc 0000000000000000
[53811.105095] PblErr:     0000000000020000 0000000000020000 0000000000000000 0000000000000000
[53811.105107] RegbErr:    0000004000000000 0000004000000000 8800003c00000000 0000000000000000
[53811.105120] PE[0fc] A/B: 8300b03800000000 8000000000000000
[53811.105130] EEH: Reset with hotplug activity

Decoded PEST A/B:

Transaction type: DMA Read Response
Invalid MMIO Address
TCE Page Fault
TCE Access Fault
LEM Bit Number 56
Requestor 00:0.0
MSI Data 0x0000
Fault Address = 0x0000000000000000
parent be4af776
......@@ -123,6 +123,15 @@ static unsigned int irq_stub = 0;
((wc_dev->mod[(card)].fxs.vmwisetting.vmwi_type & DAHDI_VMWI_HVAC) != 0)\
)
/* PCI error-recovery callbacks; both are defined later in this file. */
static pci_ers_result_t a24xx_pci_slot_reset(struct pci_dev *pdev);
static pci_ers_result_t a24xx_pci_error_detected(struct pci_dev *pdev,
						 pci_channel_state_t state);

/*
 * Error-recovery callback table for this driver: one hook for when an
 * error is first detected, one for after the slot has been reset.
 */
static const struct pci_error_handlers a24xx_err_handler = {
	.slot_reset	= a24xx_pci_slot_reset,
	.error_detected	= a24xx_pci_error_detected,
};
static const struct dahdi_echocan_features vpm_ec_features = {
.NLP_automatic = 1,
.CED_tx_detect = 1,
......@@ -2227,6 +2236,7 @@ static int __devinit a24xx_init_one(struct pci_dev *pdev, const struct pci_devic
}
wc_dev = &wc->dev;
wc_dev->ledstate = 0;
wc_dev->device_id = *ent;
for (x=0; x < sizeof(wc->chans)/sizeof(wc->chans[0]); ++x) {
wc->chans[x] = &wc->_chans[x];
......@@ -2331,6 +2341,8 @@ static int __devinit a24xx_init_one(struct pci_dev *pdev, const struct pci_devic
/* Enable bus mastering */
pci_set_master(pdev);
pci_save_state(pdev);
/* Keep track of which device we are */
pci_set_drvdata(pdev, wc);
......@@ -2606,6 +2618,7 @@ static struct pci_driver a24xx_driver = {
.suspend = NULL,
.resume = NULL,
.id_table = a24xx_pci_tbl,
.err_handler = &a24xx_err_handler,
};
static int __init a24xx_init(void)
......@@ -2657,6 +2670,56 @@ static void __exit a24xx_cleanup(void)
pci_unregister_driver(&a24xx_driver);
}
/**
 * a24xx_pci_error_detected - PCI error notification callback.
 * @pdev: PCI device on which the error was reported (not examined here)
 * @state: channel state reported for the device
 *
 * Description: Decides how recovery should proceed based solely on
 * @state. A permanently failed channel cannot be recovered, so the
 * device is given up; every other state asks for a slot reset.
 *
 * Return value:
 * PCI_ERS_RESULT_DISCONNECT when @state is pci_channel_io_perm_failure,
 * PCI_ERS_RESULT_NEED_RESET otherwise
 */
static pci_ers_result_t a24xx_pci_error_detected(struct pci_dev *pdev,
						 pci_channel_state_t state)
{
	const int channel_is_dead = (state == pci_channel_io_perm_failure);

	return channel_is_dead ? PCI_ERS_RESULT_DISCONNECT
			       : PCI_ERS_RESULT_NEED_RESET;
}
/**
 * a24xx_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 *
 * Only the PCI configuration space is restored; no card-level or
 * driver-level state is reinitialized (see the note in the body).
 *
 * Return value:
 * PCI_ERS_RESULT_RECOVERED (unconditionally)
 */
static pci_ers_result_t a24xx_pci_slot_reset(struct pci_dev *pdev)
{
	/* Bring back the previously saved configuration space after the reset. */
	pci_restore_state(pdev);

	/* Save it again right away so a later recovery has state to restore. */
	pci_save_state(pdev);

	/*
	 * Ideally we'd completely reinitialize the card here (i.e. re-run the
	 * a24xx_init_one() probe path for this device), but that will require
	 * more invasive changes to the driver.
	 *
	 * For now, assume that bad card-side firmware has merely caused
	 * an invalid DMA operation, and that we can safely resume without
	 * re-initializing anything...
	 */
	return PCI_ERS_RESULT_RECOVERED;
}
/*
 * Integer module parameters. Mode 0600 makes each parameter's sysfs entry
 * readable and writable by its owner only.
 */
module_param(spi_cmd, int, 0600);
module_param(debug, int, 0600);
module_param(ec_debug, int, 0600);
......
......@@ -126,6 +126,7 @@ enum battery_state {
struct a24xx_dev {
struct pci_dev *dev;
struct pci_device_id device_id;
char *variety;
struct dahdi_device *ddev;
unsigned char ios;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment