Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) handling. Follow a design similar to the PCIe error driver's pcie_do_recovery(). One difference is that cxl_do_recovery() treats all UCEs as fatal and panics the kernel. This is to prevent corruption of CXL memory. Export the PCI error driver's merge_result() to the CXL namespace. Introduce PCI_ERS_RESULT_PANIC and add support for it in the merge_result() routine. This will be used by CXL to panic the system in the case of uncorrectable protocol errors. PCI error handling is not currently expected to use PCI_ERS_RESULT_PANIC. Copy pci_walk_bridge() to cxl_walk_bridge(). Change it to walk the first device in all cases. Copy the PCI error driver's report_error_detected() to cxl_report_error_detected(). Note, only CXL Endpoints and RCH Downstream Ports (RCH DSP) are currently supported. Add locking of the PCI device as done in PCI's report_error_detected(). This is necessary to prevent the RAS registers from disappearing before logging is completed. Call panic() to halt the system in the case of uncorrectable errors (UCE) in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use when a UCE is not found. In that case the AER status must be cleared, which is done with pci_aer_clear_fatal_status(). 
Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> --- drivers/cxl/core/native_ras.c | 44 +++++++++++++++++++++++++++++++++++ drivers/pci/pcie/cxl_aer.c | 3 ++- drivers/pci/pcie/err.c | 8 +++++-- include/linux/aer.h | 11 +++++++++ include/linux/pci.h | 3 +++ 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/native_ras.c b/drivers/cxl/core/native_ras.c index 5bd79d5019e7..19f8f2ac8376 100644 --- a/drivers/cxl/core/native_ras.c +++ b/drivers/cxl/core/native_ras.c @@ -8,8 +8,52 @@ #include <core/core.h> #include <cxlpci.h> +static int cxl_report_error_detected(struct pci_dev *pdev, void *data) +{ + pci_ers_result_t vote, *result = data; + + if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) && + (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END)) + return 0; + + guard(device)(&pdev->dev); + + vote = cxl_error_detected(pdev, pci_channel_io_frozen); + *result = merge_result(*result, vote); + + return 0; +} + +static void cxl_walk_bridge(struct pci_dev *bridge, + int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + if (cb(bridge, userdata)) + return; + + if (bridge->subordinate) + pci_walk_bus(bridge->subordinate, cb, userdata); +} + static void cxl_do_recovery(struct pci_dev *pdev) { + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + + cxl_walk_bridge(pdev, cxl_report_error_detected, &status); + if (status == PCI_ERS_RESULT_PANIC) + panic("CXL cachemem error."); + + /* + * If we have native control of AER, clear error status in the device + * that detected the error. If the platform retained control of AER, + * it is responsible for clearing this status. In that case, the + * signaling device may not even be visible to the OS. 
+ */ + if (cxl_error_is_native(pdev)) { + pcie_clear_device_status(pdev); + pci_aer_clear_nonfatal_status(pdev); + pci_aer_clear_fatal_status(pdev); + } } static bool is_cxl_rcd(struct pci_dev *pdev) diff --git a/drivers/pci/pcie/cxl_aer.c b/drivers/pci/pcie/cxl_aer.c index 939438a7161a..b238791b7101 100644 --- a/drivers/pci/pcie/cxl_aer.c +++ b/drivers/pci/pcie/cxl_aer.c @@ -52,12 +52,13 @@ static bool is_cxl_mem_dev(struct pci_dev *dev) return true; } -static bool cxl_error_is_native(struct pci_dev *dev) +bool cxl_error_is_native(struct pci_dev *dev) { struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); return (pcie_ports_native || host->native_aer); } +EXPORT_SYMBOL_NS_GPL(cxl_error_is_native, "CXL"); static bool is_internal_error(struct aer_err_info *info) { diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index de6381c690f5..63fceb3e8613 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -21,9 +21,12 @@ #include "portdrv.h" #include "../pci.h" -static pci_ers_result_t merge_result(enum pci_ers_result orig, - enum pci_ers_result new) +pci_ers_result_t merge_result(enum pci_ers_result orig, + enum pci_ers_result new) { + if (new == PCI_ERS_RESULT_PANIC) + return PCI_ERS_RESULT_PANIC; + if (new == PCI_ERS_RESULT_NO_AER_DRIVER) return PCI_ERS_RESULT_NO_AER_DRIVER; @@ -45,6 +48,7 @@ static pci_ers_result_t merge_result(enum pci_ers_result orig, return orig; } +EXPORT_SYMBOL_NS_GPL(merge_result, "CXL"); static int report_error_detected(struct pci_dev *dev, pci_channel_state_t state, diff --git a/include/linux/aer.h b/include/linux/aer.h index 0aafcc678e45..f14db635ef90 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -10,6 +10,7 @@ #include <linux/errno.h> #include <linux/types.h> +#include <linux/pci.h> #include <linux/workqueue_types.h> #define AER_NONFATAL 0 @@ -78,6 +79,8 @@ struct cxl_proto_err_work_data { int pci_aer_clear_nonfatal_status(struct pci_dev *dev); void pci_aer_clear_fatal_status(struct pci_dev 
*dev); int pcie_aer_is_native(struct pci_dev *dev); +pci_ers_result_t merge_result(enum pci_ers_result orig, + enum pci_ers_result new); #else static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { @@ -85,16 +88,24 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } +static inline pci_ers_result_t merge_result(enum pci_ers_result orig, + enum pci_ers_result new) +{ + return PCI_ERS_RESULT_NONE; +} + #endif #if defined(CONFIG_PCIEAER_CXL) void cxl_register_proto_err_work(struct work_struct *work); void cxl_unregister_proto_err_work(void); int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd); +bool cxl_error_is_native(struct pci_dev *dev); #else static inline void cxl_register_proto_err_work(struct work_struct *work) { } static inline void cxl_unregister_proto_err_work(void) { } static inline int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd) { return 0; } +static inline bool cxl_error_is_native(struct pci_dev *dev) { return 0; } #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, diff --git a/include/linux/pci.h b/include/linux/pci.h index 79326358f641..16a8310e0373 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -868,6 +868,9 @@ enum pci_ers_result { /* No AER capabilities registered for the driver */ PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6, + + /* System is unstable, panic. Is CXL specific */ + PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7, }; /* PCI bus error event callbacks */ -- 2.34.1