On Thu, Mar 20, 2025 at 06:58:04PM -0700, Jon Pan-Doh wrote: > Spammy devices can flood kernel logs with AER errors and slow/stall > execution. Add per-device ratelimits for AER correctable and uncorrectable > errors that use the kernel defaults (10 per 5s). > > Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged > while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show > true count of 11. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git > > Signed-off-by: Jon Pan-Doh <pandoh@xxxxxxxxxx> > Reported-by: Sargun Dhillon <sargun@xxxxxxxx> > Acked-by: Paul E. McKenney <paulmck@xxxxxxxxxx> Just taking a closer look. There are some things that I would do differently, but I don't see any show-stoppers here. I don't see this patch series in -next, which is fine: This is not so urgent that I would want to drive it into the current merge window. I am hoping that it goes into -next as soon as v6.14-rc1 comes out. > --- > drivers/pci/pci.h | 4 +- > drivers/pci/pcie/aer.c | 87 ++++++++++++++++++++++++++++++++---------- > drivers/pci/pcie/dpc.c | 2 +- > 3 files changed, 71 insertions(+), 22 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 9d63d32f041c..f709290e9e00 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) > > struct aer_err_info { > struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; > + bool ratelimited[AER_MAX_MULTI_ERR_DEVICES]; In my stop-the-bleeding patch, I pass this as an argument to the functions needing it, but this works fine too. Yes, this approach does consume a bit more storage, but I doubt that it is enough to matter. The main concern is that either all information for a given error is printed or none of it is, to avoid confusion. (There will of course be the possibility of partial drops due to buffer overruns further down the console-log pipeline, but no need for additional opportunities for confusion.) For this boolean array to provide this property, the error path for a given device must be single threaded, for example, only one interrupt handler at a time. Is this the case? > int error_dev_num; > > unsigned int id:16; > @@ -552,7 +553,8 @@ struct aer_err_info { > > int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); > void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info); > -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level); > +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > + const char *level, bool ratelimited); And here you do pass it in. > int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, > unsigned int tlp_len, bool flit, > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index f657edca8769..e0f526960134 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -28,6 +28,7 @@ > #include <linux/interrupt.h> > #include <linux/delay.h> > #include <linux/kfifo.h> > +#include <linux/ratelimit.h> > #include <linux/slab.h> > #include <acpi/apei.h> > #include <acpi/ghes.h> > @@ -88,6 +89,10 @@ struct aer_report { > u64 rootport_total_cor_errs; > u64 rootport_total_fatal_errs; > u64 rootport_total_nonfatal_errs; > + > + /* Ratelimits for errors */ > + struct ratelimit_state cor_log_ratelimit; > + struct ratelimit_state uncor_log_ratelimit; Separate per-device rate limiting for correctable and uncorrectable errors, very good! > }; > > #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ > @@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev) > > dev->aer_report = kzalloc(sizeof(*dev->aer_report), GFP_KERNEL); > > + ratelimit_state_init(&dev->aer_report->cor_log_ratelimit, > + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); > + ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit, > + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); > + > /* > * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, > * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event > @@ -668,6 +678,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, > } > } > > +static bool aer_ratelimited(struct pci_dev *dev, unsigned int severity) > +{ > + struct ratelimit_state *ratelimit; > + > + if (severity == AER_CORRECTABLE) > + ratelimit = &dev->aer_report->cor_log_ratelimit; > + else > + ratelimit = &dev->aer_report->uncor_log_ratelimit; > + > + return !__ratelimit(ratelimit); > +} Initialization and single is-it-ratelimited function, good. > + > static void __aer_print_error(struct pci_dev *dev, > struct aer_err_info *info, > const char *level) > @@ -693,11 +715,17 @@ static void __aer_print_error(struct pci_dev *dev, > } > > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > - const char *level) > + const char *level, bool ratelimited) > { > int layer, agent; > int id = pci_dev_id(dev); > > + trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), > + info->severity, info->tlp_header_valid, &info->tlp); Unconditional tracing, as before. I cannot imagine that moving this to the top of the function hurts anything. > + if (ratelimited) > + return; > + > if (!info->status) { > pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > aer_error_severity_string[info->severity]); > @@ -722,21 +750,31 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > out: > if (info->id && info->error_dev_num > 1 && info->id == id) > pci_err(dev, " Error of this Agent is reported first\n"); > - > - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), > - info->severity, info->tlp_header_valid, &info->tlp); > } > > static void aer_print_rp_info(struct pci_dev *rp, struct aer_err_info *info) > { > u8 bus = info->id >> 8; > u8 devfn = info->id & 0xff; > + struct pci_dev *dev; > + bool ratelimited = false; > + int i; > > - pci_info(rp, "%s%s error message received from %04x:%02x:%02x.%d\n", > - info->multi_error_valid ? "Multiple " : "", > - aer_error_severity_string[info->severity], > - pci_domain_nr(rp->bus), bus, PCI_SLOT(devfn), > - PCI_FUNC(devfn)); > + /* extract endpoint device ratelimit */ > + for (i = 0; i < info->error_dev_num; i++) { > + dev = info->dev[i]; > + if (info->id == pci_dev_id(dev)) { > + ratelimited = info->ratelimited[i]; > + break; > + } > + } I cannot resist noting that passing a "ratelimited" argument to this function would make it unnecessary to search this array. This would require doing rate-limiting checks in aer_isr_one_error(), which looks like it should work. Then again, I do not claim to be familiar with this AER code. The "ratelimited" arguments would need to be added to aer_print_port_info(), aer_process_err_devices(), and aer_print_error(). Maybe some that I have missed. Or is there some handoff to softirq or workqueues that I missed? > + if (!ratelimited) > + pci_info(rp, "%s%s error message received from %04x:%02x:%02x.%d\n", > + info->multi_error_valid ? "Multiple " : "", > + aer_error_severity_string[info->severity], > + pci_domain_nr(rp->bus), bus, PCI_SLOT(devfn), > + PCI_FUNC(devfn)); > } > > #ifdef CONFIG_ACPI_APEI_PCIEAER > @@ -784,6 +822,12 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > > pci_dev_aer_stats_incr(dev, &info); > > + trace_aer_event(dev_name(&dev->dev), (status & ~mask), > + aer_severity, tlp_header_valid, &aer->header_log); > + > + if (aer_ratelimited(dev, aer_severity)) > + return; PCIe suffices for our use case, but symmetry is not a bad thing. > aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); > __aer_print_error(dev, &info, level); > aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n", > @@ -795,9 +839,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > > if (tlp_header_valid) > pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); > - > - trace_aer_event(dev_name(&dev->dev), (status & ~mask), > - aer_severity, tlp_header_valid, &aer->header_log); > } > EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); > > @@ -808,8 +849,12 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); > */ > static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) > { > - if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) { > - e_info->dev[e_info->error_dev_num] = pci_dev_get(dev); > + int dev_idx = e_info->error_dev_num; > + unsigned int severity = e_info->severity; > + > + if (dev_idx < AER_MAX_MULTI_ERR_DEVICES) { > + e_info->dev[dev_idx] = pci_dev_get(dev); > + e_info->ratelimited[dev_idx] = aer_ratelimited(dev, severity); > e_info->error_dev_num++; > return 0; > } > @@ -1265,7 +1310,8 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info, > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > if (aer_get_device_error_info(e_info->dev[i], e_info)) { > pci_dev_aer_stats_incr(e_info->dev[i], e_info); > - aer_print_error(e_info->dev[i], e_info, level); > + aer_print_error(e_info->dev[i], e_info, level, > + e_info->ratelimited[i]); > } > } > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > @@ -1299,10 +1345,11 @@ static void aer_isr_one_error(struct aer_rpc *rpc, > e_info.multi_error_valid = 1; > else > e_info.multi_error_valid = 0; > - aer_print_rp_info(pdev, &e_info); > > - if (find_source_device(pdev, &e_info)) > + if (find_source_device(pdev, &e_info)) { > + aer_print_rp_info(pdev, &e_info); > aer_process_err_devices(&e_info, KERN_WARNING); > + } > } And here we are in the interrupts handler. > if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { > @@ -1318,10 +1365,10 @@ static void aer_isr_one_error(struct aer_rpc *rpc, > else > e_info.multi_error_valid = 0; > > - aer_print_rp_info(pdev, &e_info); > - > - if (find_source_device(pdev, &e_info)) > + if (find_source_device(pdev, &e_info)) { > + aer_print_rp_info(pdev, &e_info); > aer_process_err_devices(&e_info, KERN_ERR); > + } > } > } > > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index 81cd6e8ff3a4..42a36df4a651 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -290,7 +290,7 @@ void dpc_process_error(struct pci_dev *pdev) > dpc_get_aer_uncorrect_severity(pdev, &info) && > aer_get_device_error_info(pdev, &info)) { > pci_dev_aer_stats_incr(pdev, &info); > - aer_print_error(pdev, &info, KERN_ERR); > + aer_print_error(pdev, &info, KERN_ERR, false); And we don't throttle DPC errors. No opinion on this one. > pci_aer_clear_nonfatal_status(pdev); > pci_aer_clear_fatal_status(pdev); > } > -- > 2.49.0.395.g12beb8f557-goog >