On 4/23/2025 11:35 AM, Jonathan Cameron wrote: > On Wed, 26 Mar 2025 20:47:07 -0500 > Terry Bowman <terry.bowman@xxxxxxx> wrote: > >> Create cxl_do_recovery() to provide uncorrectable protocol error (UCE) >> handling. Follow similar design as found in PCIe error driver, >> pcie_do_recovery(). One difference is that cxl_do_recovery() will treat all >> UCEs as fatal with a kernel panic. This is to prevent corruption on CXL >> memory. >> >> Copy the PCIe error handlers merge_result(). Introduce PCI_ERS_RESULT_PANIC >> and add support in the merge_result() routine. >> >> Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the >> first device in all cases. >> >> Copy report_error_detected() to cxl_report_error_detected(). Update this >> function to populate the CXL error information structure, 'struct >> cxl_prot_error_info', before calling the device error handler. >> >> Call panic() to halt the system in the case of uncorrectable errors (UCE) >> in cxl_do_recovery(). Export pci_aer_clear_fatal_status() for CXL to use >> if a UCE is not found. In this case the AER status must be cleared and >> uses pci_aer_clear_fatal_status(). >> >> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> >> --- >> drivers/cxl/core/ras.c | 92 +++++++++++++++++++++++++++++++++++++++++- >> drivers/pci/pci.h | 2 - >> include/linux/pci.h | 5 +++ >> 3 files changed, 96 insertions(+), 3 deletions(-) >> >> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c >> index eca8f11a05d9..1f94fc08e72b 100644 >> --- a/drivers/cxl/core/ras.c >> +++ b/drivers/cxl/core/ras.c >> @@ -141,7 +141,97 @@ int cxl_create_prot_err_info(struct pci_dev *_pdev, int severity, >> } >> EXPORT_SYMBOL_NS_GPL(cxl_create_prot_err_info, "CXL"); >> >> -static void cxl_do_recovery(struct pci_dev *pdev) { } >> + >> +static pci_ers_result_t merge_result(enum pci_ers_result orig, > Rename perhaps to avoid confusion / grep clashes... Ok. I'll rename to cxl_merge_results(). 
>> + enum pci_ers_result new) >> +{ >> + if (new == PCI_ERS_RESULT_PANIC) >> + return PCI_ERS_RESULT_PANIC; >> + >> + if (new == PCI_ERS_RESULT_NO_AER_DRIVER) >> + return PCI_ERS_RESULT_NO_AER_DRIVER; >> + >> + if (new == PCI_ERS_RESULT_NONE) >> + return orig; >> + >> + switch (orig) { >> + case PCI_ERS_RESULT_CAN_RECOVER: >> + case PCI_ERS_RESULT_RECOVERED: >> + orig = new; >> + break; >> + case PCI_ERS_RESULT_DISCONNECT: >> + if (new == PCI_ERS_RESULT_NEED_RESET) >> + orig = PCI_ERS_RESULT_NEED_RESET; >> + break; >> + default: >> + break; >> + } >> + >> + return orig; >> +} >> + >> +static void cxl_walk_bridge(struct pci_dev *bridge, >> + int (*cb)(struct pci_dev *, void *), >> + void *userdata) >> +{ >> + if (cb(bridge, userdata)) >> + return; >> + >> + if (bridge->subordinate) >> + pci_walk_bus(bridge->subordinate, cb, userdata); >> +} >> + > Trivial but seems there are two blank lines where one will do. Ok -Terry >> + >> +static int cxl_report_error_detected(struct pci_dev *pdev, void *data) >> +{ >> + struct cxl_driver *pdrv; >> + pci_ers_result_t vote, *result = data; >> + struct cxl_prot_error_info err_info = { 0 }; >> + const struct cxl_error_handlers *cxl_err_handler; >> + >> + if (cxl_create_prot_err_info(pdev, AER_FATAL, &err_info)) >> + return 0; >> + >> + struct device *dev __free(put_device) = get_device(err_info.dev); >> + if (!dev) >> + return 0; >> + >> + pdrv = to_cxl_drv(dev->driver); >> + if (!pdrv || !pdrv->err_handler || >> + !pdrv->err_handler->error_detected) >> + return 0; >> + >> + cxl_err_handler = pdrv->err_handler; >> + vote = cxl_err_handler->error_detected(dev, &err_info); >> + >> + *result = merge_result(*result, vote); >> + >> + return 0; >> +} >> + >> +static void cxl_do_recovery(struct pci_dev *pdev) >> +{ >> + struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus); >> + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; >> + >> + cxl_walk_bridge(pdev, cxl_report_error_detected, &status); >> + if (status == 
PCI_ERS_RESULT_PANIC) >> + panic("CXL cachemem error."); >> + >> + /* >> + * If we have native control of AER, clear error status in the device >> + * that detected the error. If the platform retained control of AER, >> + * it is responsible for clearing this status. In that case, the >> + * signaling device may not even be visible to the OS. >> + */ >> + if (host->native_aer) { >> + pcie_clear_device_status(pdev); >> + pci_aer_clear_nonfatal_status(pdev); >> + pci_aer_clear_fatal_status(pdev); >> + } >> + >> + pci_info(pdev, "CXL uncorrectable error.\n"); >> +} >> >> static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data) >> { >