On Sun, Apr 13, 2025 at 12:59:07PM +0100, Sergey Dolgov wrote: > Dear Bjorn, > > I've compiled pciutils-3.13.0 with MinGW and run it with WinDBG, and > it managed to display how Windows sets LTR1.2_Threshold. The trick is > that it does NOT! I've removed all the actual reprogramming in > aspm_calc_l12_info to see the devices' default thresholds, and they've > come out exactly the same as in Windows (81920ns for all devices > except 0 for NVIDIA, but it should have its own dynamic power > management). I've collected these logs also after booting the original > 6.14.0 kernel to make sure the devices haven't just kept the values > set by Windows - nope, the devices (or maybe BIOS/Firmware) do reset > LTR_L1.2_THRESHOLD to 81920ns every reboot. > > I tried to read latencies from PCI_EXT_CAP_ID_LTR (see the new > function pcie_log_runtime_ltr in l12debug.patch), but it bails out at > "No LTR capability found" for all devices. > > The idle power usage has increased slightly to 0.916 W due to about > 80% PC10 residency and 60-70% SYS%LPI residency (in contrast to 90%+ > PC10 and 80%+ SYS%LPI with pre 7afeb84d14ea thresholds), but that's > still much more power efficient than the mainline kernel. > > So the question is why the OS has to reprogram T_POWER_ON, CMRT and > LTR_L1.2_THRESHOLD at all, instead of trusting the device defaults? Is > there any statistics on broken devices reporting bogus values, and > frequently running out of buffer? Great question. I think you're right that if the BIOS has programmed L1.2 settings, it's very unlikely that the OS can do a better job. This is really ugly, but since I won't have more time for this until Monday, could you try the patch below? (Replace the previous debug patch with this one) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 29fcb0689a91..e6e9d6faa028 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -563,6 +563,27 @@ static void encode_l12_threshold(u32 threshold_us, u32 *scale, u32 *value) } } +static u64 decode_l12_threshold(u32 scale, u32 value) +{ + u32 mpy; + + /* + * LTR_L1.2_THRESHOLD_Value ("value") is a 10-bit field with max + * value of 0x3ff. + */ + switch (scale) { + case 0: mpy = 1; break; + case 1: mpy = 32; break; + case 2: mpy = 1024; break; + case 3: mpy = 32768; break; + case 4: mpy = 1048576; break; + case 5: mpy = 33554432; break; + default: mpy = 0; + } + + return (u64) value * mpy; +} + static void pcie_aspm_check_latency(struct pci_dev *endpoint) { u32 latency, encoding, lnkcap_up, lnkcap_dw; @@ -641,18 +662,29 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, u32 ctl1 = 0, ctl2 = 0; u32 pctl1, pctl2, cctl1, cctl2; u32 pl1_2_enables, cl1_2_enables; + u32 cmrt; + u32 p_ltr_val, p_ltr_scale; + u32 c_ltr_val, c_ltr_scale; + u64 p_ltr_threshold, c_ltr_threshold; /* Choose the greater of the two Port Common_Mode_Restore_Times */ val1 = FIELD_GET(PCI_L1SS_CAP_CM_RESTORE_TIME, parent_l1ss_cap); val2 = FIELD_GET(PCI_L1SS_CAP_CM_RESTORE_TIME, child_l1ss_cap); t_common_mode = max(val1, val2); + pci_info(child, "cap: parent CMRT %#04x child CMRT %#04x\n", val1, val2); + /* Choose the greater of the two Port T_POWER_ON times */ val1 = FIELD_GET(PCI_L1SS_CAP_P_PWR_ON_VALUE, parent_l1ss_cap); scale1 = FIELD_GET(PCI_L1SS_CAP_P_PWR_ON_SCALE, parent_l1ss_cap); val2 = FIELD_GET(PCI_L1SS_CAP_P_PWR_ON_VALUE, child_l1ss_cap); scale2 = FIELD_GET(PCI_L1SS_CAP_P_PWR_ON_SCALE, child_l1ss_cap); + pci_info(child, "cap: parent T_POWER_ON %#06x usec (val %#04x scale %x)\n", + calc_l12_pwron(parent, scale1, val1), val1, scale1); + pci_info(child, "cap: child T_POWER_ON %#06x usec (val %#04x scale %x)\n", + calc_l12_pwron(child, scale2, val2), val2, scale2); + if (calc_l12_pwron(parent, scale1, val1) > calc_l12_pwron(child, scale2, val2)) { ctl2 |= FIELD_PREP(PCI_L1SS_CTL2_T_PWR_ON_SCALE, scale1) | @@ -675,7 +707,11 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, * least 4us. */ l1_2_threshold = 2 + 4 + t_common_mode + t_power_on; + pci_info(child, "t_common_mode %#04x t_power_on %#04x l1_2_threshold %#06x\n", + t_common_mode, t_power_on, l1_2_threshold); encode_l12_threshold(l1_2_threshold, &scale, &value); + pci_info(child, "encoded LTR_L1.2_THRESHOLD value %#06x scale %x\n", + value, scale); ctl1 |= FIELD_PREP(PCI_L1SS_CTL1_CM_RESTORE_TIME, t_common_mode) | FIELD_PREP(PCI_L1SS_CTL1_LTR_L12_TH_VALUE, value) | FIELD_PREP(PCI_L1SS_CTL1_LTR_L12_TH_SCALE, scale); @@ -686,6 +722,28 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL2, &cctl2); + pci_info(child, "L1SS_CTL1 %#08x (parent %#010x child %#010x)\n", + ctl1, pctl1, cctl1); + pci_info(child, "L1SS_CTL2 %#08x (parent %#010x child %#010x)\n", + ctl2, pctl2, cctl2); + + cmrt = FIELD_GET(PCI_L1SS_CTL1_CM_RESTORE_TIME, pctl1); + p_ltr_val = FIELD_GET(PCI_L1SS_CTL1_LTR_L12_TH_VALUE, pctl1); + p_ltr_scale = FIELD_GET(PCI_L1SS_CTL1_LTR_L12_TH_SCALE, pctl1); + p_ltr_threshold = decode_l12_threshold(p_ltr_scale, p_ltr_val); + c_ltr_val = FIELD_GET(PCI_L1SS_CTL1_LTR_L12_TH_VALUE, cctl1); + c_ltr_scale = FIELD_GET(PCI_L1SS_CTL1_LTR_L12_TH_SCALE, cctl1); + c_ltr_threshold = decode_l12_threshold(c_ltr_scale, c_ltr_val); + if (cmrt || p_ltr_threshold || c_ltr_threshold) { + pci_info(child, "cur: parent CMRT %d usec\n", cmrt); + pci_info(child, "cur: parent LTR_L1.2_THRESHOLD value %#06x scale %x decoded %#012llx ns\n", + p_ltr_val, p_ltr_scale, p_ltr_threshold); + pci_info(child, "cur: child LTR_L1.2_THRESHOLD value %#06x scale %x decoded %#012llx ns\n", + c_ltr_val, c_ltr_scale, c_ltr_threshold); + pci_info(child, "skipping L1.2 config\n"); + return; + } + if (ctl1 == pctl1 && ctl1 == cctl1 && ctl2 == pctl2 && ctl2 == cctl2) return; @@ -703,14 +761,27 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, PCI_L1SS_CTL1_L1_2_MASK, 0); } + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); + pci_info(child, "L1SS_CTL1 parent %#08x child %#08x (L1.2 disabled)\n", + pctl1, cctl1); + /* Program T_POWER_ON times in both ports */ pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, &pctl2); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL2, &cctl2); + pci_info(child, "L1SS_CTL2 parent %#08x child %#08x (T_POWER_ON updated)\n", + pctl2, cctl2); + /* Program Common_Mode_Restore_Time in upstream device */ pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_info(child, "L1SS_CTL1 parent %#08x (CMRT updated)\n", pctl1); + /* Program LTR_L1.2_THRESHOLD time in both ports */ pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, PCI_L1SS_CTL1_LTR_L12_TH_VALUE | @@ -721,6 +792,11 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); + pci_info(child, "L1SS_CTL1 parent %#08x child %#08x (LTR_L1.2_THRESHOLD updated)\n", + pctl1, cctl1); + if (pl1_2_enables || cl1_2_enables) { pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, @@ -729,6 +805,11 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, child->l1ss + PCI_L1SS_CTL1, 0, cl1_2_enables); } + + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); + pci_info(child, "L1SS_CTL1 parent %#08x child %#08x (L1.2 restored)\n", + pctl1, cctl1); } static void aspm_l1ss_init(struct pcie_link_state *link)