Re: [PATCH v4 5/7] PCI/AER: Introduce ratelimit for error logs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

On 3/20/25 1:20 AM, Jon Pan-Doh wrote:
Spammy devices can flood kernel logs with AER errors and slow/stall
execution. Add per-device ratelimits for AER correctable and uncorrectable
errors that use the kernel defaults (10 per 5s).

Should we exclude fatal errors from the rate limit? Fatal error logs would be
really useful for debug analysis, and they do not happen very frequently.


Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
true count of 11.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git

Signed-off-by: Jon Pan-Doh <pandoh@xxxxxxxxxx>
Reviewed-by: Karolina Stolarek <karolina.stolarek@xxxxxxxxxx>
---
  drivers/pci/pcie/aer.c | 74 +++++++++++++++++++++++++++++++++---------
  1 file changed, 58 insertions(+), 16 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3069376b3553..081cef5fc678 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -28,6 +28,7 @@
  #include <linux/interrupt.h>
  #include <linux/delay.h>
  #include <linux/kfifo.h>
+#include <linux/ratelimit.h>
  #include <linux/slab.h>
  #include <acpi/apei.h>
  #include <acpi/ghes.h>
@@ -88,6 +89,10 @@ struct aer_report {
  	u64 rootport_total_cor_errs;
  	u64 rootport_total_fatal_errs;
  	u64 rootport_total_nonfatal_errs;
+
+	/* Ratelimits for errors */
+	struct ratelimit_state cor_log_ratelimit;
+	struct ratelimit_state uncor_log_ratelimit;
  };
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
@@ -379,6 +384,15 @@ void pci_aer_init(struct pci_dev *dev)
  	dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL);

+	/*
+	 * Ratelimits are doubled as a given error produces 2 logs (root port
+	 * and endpoint) that should be under same ratelimit.
+	 */
+	ratelimit_state_init(&dev->aer_report->cor_log_ratelimit,
+			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
+	ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit,
+			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST * 2);
+
  	/*
  	 * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
  	 * PCI_ERR_COR_MASK, and PCI_ERR_CAP.  Root and Root Complex Event
@@ -668,6 +682,17 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
  	}
  }
+static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
+{
+	struct ratelimit_state *ratelimit;
+
+	if (severity == AER_CORRECTABLE)
+		ratelimit = &dev->aer_report->cor_log_ratelimit;
+	else
+		ratelimit = &dev->aer_report->uncor_log_ratelimit;
+	return __ratelimit(ratelimit);
+}
+
  static void __aer_print_error(struct pci_dev *dev,
  			      struct aer_err_info *info,
  			      const char *level)
@@ -698,6 +723,12 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
  	int layer, agent;
  	int id = pci_dev_id(dev);
+	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+			info->severity, info->tlp_header_valid, &info->tlp);
+
+	if (!aer_ratelimit(dev, info->severity))
+		return;
+
  	if (!info->status) {
  		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
  			aer_error_severity_string[info->severity]);
@@ -722,21 +753,28 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
  out:
  	if (info->id && info->error_dev_num > 1 && info->id == id)
  		pci_err(dev, "  Error of this Agent is reported first\n");
-
-	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
-			info->severity, info->tlp_header_valid, &info->tlp);
  }
static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
  {
  	u8 bus = info->id >> 8;
  	u8 devfn = info->id & 0xff;
+	struct pci_dev *endpoint;
+	int i;
+
+	/* extract endpoint device ratelimit */
+	for (i = 0; i < info->error_dev_num; i++) {
+		endpoint = info->dev[i];
+		if (info->id == pci_dev_id(endpoint))
+			break;
+	}
-	pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
-		 info->multi_error_valid ? "Multiple " : "",
-		 aer_error_severity_string[info->severity],
-		 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
-		 PCI_FUNC(devfn));
+	if (aer_ratelimit(endpoint, info->severity))
+		pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
+			 info->multi_error_valid ? "Multiple " : "",
+			 aer_error_severity_string[info->severity],
+			 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+			 PCI_FUNC(devfn));
  }
#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -784,6 +822,12 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
  	pci_dev_aer_stats_incr(dev, &info);

+	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+			aer_severity, tlp_header_valid, &aer->header_log);
+
+	if (!aer_ratelimit(dev, aer_severity))
+		return;
+
  	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
  	__aer_print_error(dev, &info, level);
  	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
@@ -795,9 +839,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
if (tlp_header_valid)
  		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
-
-	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
-			aer_severity, tlp_header_valid, &aer->header_log);
  }
  EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
@@ -1299,10 +1340,11 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
  			e_info.multi_error_valid = 1;
  		else
  			e_info.multi_error_valid = 0;
-		aer_print_port_info(pdev, &e_info);
-		if (find_source_device(pdev, &e_info))
+		if (find_source_device(pdev, &e_info)) {
+			aer_print_port_info(pdev, &e_info);
  			aer_process_err_devices(&e_info, KERN_WARNING);
+		}
  	}
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
@@ -1318,10 +1360,10 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
  		else
  			e_info.multi_error_valid = 0;
-		aer_print_port_info(pdev, &e_info);
-
-		if (find_source_device(pdev, &e_info))
+		if (find_source_device(pdev, &e_info)) {
+			aer_print_port_info(pdev, &e_info);
  			aer_process_err_devices(&e_info, KERN_ERR);
+		}
  	}
  }

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer





[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux