On Tue, 3 Jun 2025 12:22:35 -0500 Terry Bowman <terry.bowman@xxxxxxx> wrote: > CXL Endpoint protocol errors are currently handled using PCI error > handlers. The CXL Endpoint requires CXL specific handling in the case of > uncorrectable error handling not provided by the PCI handlers. > > Add CXL specific handlers for CXL Endpoints. Rename the existing > cxl_error_handlers to be pci_error_handlers to more correctly indicate > the error type and follow naming consistency. I wonder if a rename precursor patch would be better here than doing it all in one go? Having not read the patch description thoroughly this had me confused ;) > > Keep the existing PCI Endpoint handlers. PCI handlers can be called if the > CXL device is not trained for alternate protocol (CXL). Update the CXL > Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable > handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic(). > > Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> > > static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) > { > - So this snuck in somewhere between upstream and here. If it is in this set let's push the removal back to where it came from. > return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras); > } > > @@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } > #endif > > +pci_ers_result_t cxl_error_detected(struct device *dev) > +{ > + struct pci_dev *pdev = to_pci_dev(dev); > + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > + struct device *cxlmd_dev = &cxlds->cxlmd->dev; > + pci_ers_result_t ue; > + > + scoped_guard(device, cxlmd_dev) { > if (!dev->driver) { > dev_warn(&pdev->dev, > "%s: memdev disabled, abort error handling\n", > dev_name(dev)); > - return PCI_ERS_RESULT_DISCONNECT; > + return PCI_ERS_RESULT_PANIC; > } > > if (cxlds->rcd) > @@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > ue = cxl_handle_endpoint_ras(cxlds); > } > > - Maybe something more in the patch description on why this chunk isn't relevant? I guess we don't need the more complex handling as these are all panic :) > - switch (state) { > - case pci_channel_io_normal: > - if (ue) { > - device_release_driver(dev); > - return PCI_ERS_RESULT_NEED_RESET; > - } > - return PCI_ERS_RESULT_CAN_RECOVER; > - case pci_channel_io_frozen: > - dev_warn(&pdev->dev, > - "%s: frozen state error detected, disable CXL.mem\n", > - dev_name(dev)); > - device_release_driver(dev); > - return PCI_ERS_RESULT_NEED_RESET; > - case pci_channel_io_perm_failure: > - dev_warn(&pdev->dev, > - "failure state error detected, request disconnect\n"); > - return PCI_ERS_RESULT_DISCONNECT; > - } > - return PCI_ERS_RESULT_NEED_RESET; > + return ue; > } > EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); > static int cxl_flit_size(struct pci_dev *pdev) > { > if (cxl_pci_flit_256(pdev)) > diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c > index 0ef8c2068c0c..664f532cc838 100644 > --- a/drivers/cxl/core/ras.c > +++ b/drivers/cxl/core/ras.c > @@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info) > static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info) > { > struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info)); > + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > + struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev); > > if (!pdev) { > pr_err("Failed to find the CXL device\n"); > @@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info) > > if (err_info->severity == AER_CORRECTABLE) { > int aer = pdev->aer_cap; > - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > - struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev); This code move seems somewhat unrelated? Maybe it's in the wrong patch and becomes necessary later? > > if (aer) > pci_clear_and_set_config_dword(pdev, > aer + PCI_ERR_COR_STATUS, > 0, PCI_ERR_COR_INTERNAL); > > - cxl_cor_error_detected(pdev); > + cxl_cor_error_detected(&pdev->dev); > > pcie_clear_device_status(pdev); > } else {