Populate the cxl_do_recovery() function with uncorrectable protocol error (UCE) handling. Follow similar design as found in PCIe error driver, pcie_do_recovery(). One difference is cxl_do_recovery() will treat all UCEs as fatal with a kernel panic. This is to prevent corruption on CXL memory. Introduce cxl_walk_port(). Make this analogous to pci_walk_bridge() but walking CXL ports instead. This will iterate through the CXL topology from the erroring device through the downstream CXL Ports and Endpoints. Export pci_aer_clear_fatal_status() for CXL to use if a UCE is not found. Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> --- Changes in v10->v11: - pci_ers_merge_results() - Move to earlier patch --- drivers/cxl/core/port.c | 1 + drivers/cxl/core/ras.c | 94 +++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 - include/linux/aer.h | 2 + 4 files changed, 97 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 758fb73374c1..085c8620a797 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1347,6 +1347,7 @@ struct cxl_port *find_cxl_port(struct device *dport_dev, port = __find_cxl_port(&ctx); return port; } +EXPORT_SYMBOL_NS_GPL(find_cxl_port, "CXL"); static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port, struct device *dport_dev, diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 536ca9c815ce..3da675f72616 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -6,6 +6,7 @@ #include <cxl/event.h> #include <cxlmem.h> #include <cxlpci.h> +#include <cxl.h> #include "trace.h" static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, @@ -468,8 +469,101 @@ void cxl_endpoint_port_init_ras(struct cxl_port *ep) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_port_init_ras, "CXL"); +static int cxl_report_error_detected(struct device *dev, void *data) +{ + struct pci_dev *pdev = to_pci_dev(dev); + pci_ers_result_t vote, *result = data; + + guard(device)(dev); + + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) + vote = cxl_error_detected(dev); + else + vote = cxl_port_error_detected(dev); + + vote = cxl_error_detected(dev); + *result = pci_ers_merge_result(*result, vote); + + return 0; +} + +static int match_port_by_parent_dport(struct device *dev, const void *dport_dev) +{ + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + + port = to_cxl_port(dev); + + return port->parent_dport->dport_dev == dport_dev; +} + +static void cxl_walk_port(struct device *port_dev, + int (*cb)(struct device *, void *), + void *userdata) +{ + struct cxl_dport *dport = NULL; + struct cxl_port *port; + unsigned long index; + + if (!port_dev) + return; + + port = to_cxl_port(port_dev); + if (port->uport_dev && dev_is_pci(port->uport_dev)) + cb(port->uport_dev, userdata); + + xa_for_each(&port->dports, index, dport) + { + struct device *child_port_dev __free(put_device) = + bus_find_device(&cxl_bus_type, &port->dev, dport, + match_port_by_parent_dport); + + cb(dport->dport_dev, userdata); + + cxl_walk_port(child_port_dev, cxl_report_error_detected, userdata); + } + + if (is_cxl_endpoint(port)) + cb(port->uport_dev->parent, userdata); +} + static void cxl_do_recovery(struct device *dev) { + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + struct pci_dev *pdev = to_pci_dev(dev); + struct cxl_dport *dport; + struct cxl_port *port; + + if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) || + (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) { + port = find_cxl_port(&pdev->dev, &dport); + } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) { + struct device *port_dev = bus_find_device(&cxl_bus_type, NULL, + &pdev->dev, match_uport); + port = to_cxl_port(port_dev); + } + + if (!port) + return; + + cxl_walk_port(&port->dev, cxl_report_error_detected, &status); + if (status == PCI_ERS_RESULT_PANIC) + panic("CXL cachemem error."); + + /* + * If we have native control of AER, clear error status in the device + * that detected the error. If the platform retained control of AER, + * it is responsible for clearing this status. In that case, the + * signaling device may not even be visible to the OS. + */ + if (cxl_error_is_native(pdev)) { + pcie_clear_device_status(pdev); + pci_aer_clear_nonfatal_status(pdev); + pci_aer_clear_fatal_status(pdev); + } + put_device(&port->dev); } static void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 69ff7c2d214f..0c4f73dd645f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1170,13 +1170,11 @@ static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { } #ifdef CONFIG_CXL_RAS void pci_aer_unmask_internal_errors(struct pci_dev *dev); -bool cxl_error_is_native(struct pci_dev *dev); bool is_internal_error(struct aer_err_info *info); bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info); void cxl_forward_error(struct pci_dev *pdev, struct aer_err_info *info); #else static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { } -static inline bool cxl_error_is_native(struct pci_dev *dev) { return false; } static inline bool is_internal_error(struct aer_err_info *info) { return false; } static inline bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return false; } static inline void cxl_forward_error(struct pci_dev *pdev, struct aer_err_info *info) { } diff --git a/include/linux/aer.h b/include/linux/aer.h index 1f79f0be4bf7..751a026fea73 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -81,10 +81,12 @@ static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd); void cxl_register_proto_err_work(struct work_struct *work); void cxl_unregister_proto_err_work(void); +bool cxl_error_is_native(struct pci_dev *dev); #else static inline int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd) { return 0; } static inline void cxl_register_proto_err_work(struct work_struct *work) { } static inline void cxl_unregister_proto_err_work(void) { } +static inline bool cxl_error_is_native(struct pci_dev *dev) { return false; } #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, -- 2.51.0.rc2.21.ge5ab6b3e5a