Re: [PATCH v2] PCI/AER: Consolidate CXL, ACPI GHES and native AER reporting paths

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Bjorn,

On 25/03/2025 16:07, Karolina Stolarek wrote:
Currently, the CXL and GHES features use the pci_print_aer() function to
log AER errors. Its implementation is very similar to aer_print_error(),
duplicating the way native PCIe devices report errors. We shouldn't
log messages differently only because they come from a different
code path.

Make CXL devices and GHES call aer_print_error() when reporting
AER errors. Add a wrapper, aer_print_platform_error(), that translates
aer_capability_regs to aer_err_info so we can use the aer_print_error()
function.

Signed-off-by: Karolina Stolarek <karolina.stolarek@xxxxxxxxxx>
---
v2:
   - Don't expose aer_err_info to the world; as aer_recover_queue()
     is tightly connected to the ghes code, introduce a wrapper for
     aer_print_error()
   - Move aer_err_info memset to the wrapper, don't expect the
     caller to clean it for us

   I'm still working on the logs; in the meantime, I think, we can
   continue reviewing the patch.

I wasn't able to produce logs for the CXL path (that is, for a Restricted CXL Device, as CXL 1.1 devices are not supported by the driver due to missing functionality; confirmed by Terry), and I faced issues when trying to inject errors via GHES. Is the lack of logs a blocker for this patch? I tested other CXL scenarios and, as far as I know, my changes didn't cause any regressions.

All the best,
Karolina


  drivers/cxl/core/pci.c |  2 +-
  drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++----------------------
  include/linux/aer.h    |  4 +--
  3 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 013b869b66cb..9ba711365388 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -885,7 +885,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
  	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
  		return;
- pci_print_aer(pdev, severity, &aer_regs);
+	aer_print_platform_error(pdev, severity, &aer_regs);
if (severity == AER_CORRECTABLE)
  		cxl_handle_rdport_cor_ras(cxlds, dport);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a1cf8c7ef628..ec34bc9b2332 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -760,47 +760,42 @@ int cper_severity_to_aer(int cper_severity)
  EXPORT_SYMBOL_GPL(cper_severity_to_aer);
  #endif
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
-		   struct aer_capability_regs *aer)
+static void populate_aer_err_info(struct aer_err_info *info, int severity,
+				  struct aer_capability_regs *aer_regs)
  {
-	int layer, agent, tlp_header_valid = 0;
-	u32 status, mask;
-	struct aer_err_info info;
-
-	if (aer_severity == AER_CORRECTABLE) {
-		status = aer->cor_status;
-		mask = aer->cor_mask;
-	} else {
-		status = aer->uncor_status;
-		mask = aer->uncor_mask;
-		tlp_header_valid = status & AER_LOG_TLP_MASKS;
-	}
-
-	layer = AER_GET_LAYER_ERROR(aer_severity, status);
-	agent = AER_GET_AGENT(aer_severity, status);
+	int tlp_header_valid;
memset(&info, 0, sizeof(info));
-	info.severity = aer_severity;
-	info.status = status;
-	info.mask = mask;
-	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
- pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
-	__aer_print_error(dev, &info);
-	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
-		aer_error_layer[layer], aer_agent_string[agent]);
+	info->severity = severity;
+	info->first_error = PCI_ERR_CAP_FEP(aer_regs->cap_control);
- if (aer_severity != AER_CORRECTABLE)
-		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
-			aer->uncor_severity);
+	if (severity == AER_CORRECTABLE) {
+		info->id = aer_regs->cor_err_source;
+		info->status = aer_regs->cor_status;
+		info->mask = aer_regs->cor_mask;
+	} else {
+		info->id = aer_regs->uncor_err_source;
+		info->status = aer_regs->uncor_status;
+		info->mask = aer_regs->uncor_mask;
+		tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
+
+		if (tlp_header_valid) {
+			info->tlp_header_valid = tlp_header_valid;
+			info->tlp = aer_regs->header_log;
+		}
+	}
+}
- if (tlp_header_valid)
-		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
+void aer_print_platform_error(struct pci_dev *pdev, int severity,
+			      struct aer_capability_regs *aer_regs)
+{
+	struct aer_err_info info;
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
-			aer_severity, tlp_header_valid, &aer->header_log);
+	populate_aer_err_info(&info, severity, aer_regs);
+	aer_print_error(pdev, &info);
  }
-EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
+EXPORT_SYMBOL_NS_GPL(aer_print_platform_error, "CXL");
/**
   * add_error_device - list device to be handled
@@ -1146,7 +1141,8 @@ static void aer_recover_work_func(struct work_struct *work)
  			       PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
  			continue;
  		}
-		pci_print_aer(pdev, entry.severity, entry.regs);
+
+		aer_print_platform_error(pdev, entry.severity, entry.regs);
/*
  		 * Memory for aer_capability_regs(entry.regs) is being
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 02940be66324..5593352dfb51 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -64,8 +64,8 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
  #endif
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
-		    struct aer_capability_regs *aer);
+void aer_print_platform_error(struct pci_dev *pdev, int severity,
+			      struct aer_capability_regs *aer_regs);
  int cper_severity_to_aer(int cper_severity);
  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
  		       int severity, struct aer_capability_regs *aer_regs);





[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux