Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) handling. Follow a design similar to the PCIe error driver's pcie_do_recovery(). One difference is that cxl_do_recovery() will treat all UCEs as fatal with a kernel panic. This is to prevent corruption of CXL memory. Copy the PCIe error handler's merge_result(). Introduce PCI_ERS_RESULT_PANIC and add support for it in the merge_result() routine. Copy pci_walk_bridge() to cxl_walk_bridge(). Change it to walk the first device in all cases. Copy report_error_detected() to cxl_report_error_detected(). Update this function to populate the CXL error information structure, 'struct cxl_prot_error_info', before calling the device error handler. Call panic() to halt the system in the case of uncorrectable errors (UCE) in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use if a UCE is not found. In this case the AER status must be cleared, which is done using pci_aer_clear_fatal_status(). Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> --- drivers/cxl/core/ras.c | 92 +++++++++++++++++++++++++++++++++++++++++- drivers/pci/pci.h | 2 - include/linux/pci.h | 5 +++ 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index eca8f11a05d9..1f94fc08e72b 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -141,7 +141,97 @@ int cxl_create_prot_err_info(struct pci_dev *_pdev, int severity, } EXPORT_SYMBOL_NS_GPL(cxl_create_prot_err_info, "CXL"); -static void cxl_do_recovery(struct pci_dev *pdev) { } + +static pci_ers_result_t merge_result(enum pci_ers_result orig, + enum pci_ers_result new) +{ + if (new == PCI_ERS_RESULT_PANIC) + return PCI_ERS_RESULT_PANIC; + + if (new == PCI_ERS_RESULT_NO_AER_DRIVER) + return PCI_ERS_RESULT_NO_AER_DRIVER; + + if (new == PCI_ERS_RESULT_NONE) + return orig; + + switch (orig) { + case PCI_ERS_RESULT_CAN_RECOVER: + case PCI_ERS_RESULT_RECOVERED: + orig = new; + break; + case PCI_ERS_RESULT_DISCONNECT: + if 
(new == PCI_ERS_RESULT_NEED_RESET) + orig = PCI_ERS_RESULT_NEED_RESET; + break; + default: + break; + } + + return orig; +} + +static void cxl_walk_bridge(struct pci_dev *bridge, + int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + if (cb(bridge, userdata)) + return; + + if (bridge->subordinate) + pci_walk_bus(bridge->subordinate, cb, userdata); +} + + +static int cxl_report_error_detected(struct pci_dev *pdev, void *data) +{ + struct cxl_driver *pdrv; + pci_ers_result_t vote, *result = data; + struct cxl_prot_error_info err_info = { 0 }; + const struct cxl_error_handlers *cxl_err_handler; + + if (cxl_create_prot_err_info(pdev, AER_FATAL, &err_info)) + return 0; + + struct device *dev __free(put_device) = get_device(err_info.dev); + if (!dev) + return 0; + + pdrv = to_cxl_drv(dev->driver); + if (!pdrv || !pdrv->err_handler || + !pdrv->err_handler->error_detected) + return 0; + + cxl_err_handler = pdrv->err_handler; + vote = cxl_err_handler->error_detected(dev, &err_info); + + *result = merge_result(*result, vote); + + return 0; +} + +static void cxl_do_recovery(struct pci_dev *pdev) +{ + struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus); + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + + cxl_walk_bridge(pdev, cxl_report_error_detected, &status); + if (status == PCI_ERS_RESULT_PANIC) + panic("CXL cachemem error."); + + /* + * If we have native control of AER, clear error status in the device + * that detected the error. If the platform retained control of AER, + * it is responsible for clearing this status. In that case, the + * signaling device may not even be visible to the OS. 
+ */ + if (host->native_aer) { + pcie_clear_device_status(pdev); + pci_aer_clear_nonfatal_status(pdev); + pci_aer_clear_fatal_status(pdev); + } + + pci_info(pdev, "CXL uncorrectable error.\n"); +} static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index c32eab22c0b2..1354c7cfedeb 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -886,7 +886,6 @@ void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; -void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); @@ -895,7 +894,6 @@ void pci_restore_aer_state(struct pci_dev *dev); static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } -static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline void pci_save_aer_state(struct pci_dev *dev) { } diff --git a/include/linux/pci.h b/include/linux/pci.h index 56015721be22..0aee5846b95c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -862,6 +862,9 @@ enum pci_ers_result { /* No AER capabilities registered for the driver */ PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, + + /* System is unstable, panic */ + PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7, }; /* PCI bus error event callbacks */ @@ -1864,8 +1867,10 @@ static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } #ifdef CONFIG_PCIEAER bool pci_aer_available(void); +void pci_aer_clear_fatal_status(struct pci_dev *dev); #else static inline bool pci_aer_available(void) { 
return false; } +static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } #endif bool pci_ats_disabled(void); -- 2.34.1