On 7/16/2025 12:57 AM, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@xxxxxxx>
>
> After a panic, if SNP was enabled in the previous kernel, the kdump
> kernel boots with IOMMU SNP enforcement still enabled.
>
> The IOMMU device table register is locked and exclusive to the
> previous kernel. Attempts to copy the old device table from the
> previous kernel fail in the kdump kernel because the hardware ignores
> writes to the locked device table base address register, per AMD
> IOMMU spec section 2.12.2.1.
>
> This results in repeated "Completion-Wait loop timed out" errors and
> a second kernel panic: "Kernel panic - not syncing: timer doesn't
> work through Interrupt-remapped IO-APIC".
>
> Reuse the device table instead of copying it in the kdump boot case,
> and remove all of the device table copying code.
>
> Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
> ---
>  drivers/iommu/amd/init.c | 97 ++++++++++++----------------------
>  1 file changed, 28 insertions(+), 69 deletions(-)
>
> diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
> index 32295f26be1b..18bd869a82d9 100644
> --- a/drivers/iommu/amd/init.c
> +++ b/drivers/iommu/amd/init.c
> @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
>  
>  	BUG_ON(iommu->mmio_base == NULL);
>  
> +	if (is_kdump_kernel())

This is fine, but it's becoming too many places with a kdump check! I
don't know what the better way is here. Is it worth keeping it like
this, or should we add, say, IOMMU ops, so that during init we check
is_kdump_kernel() once and adjust the ops?

@Joerg, any preference?
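Roughly what I have in mind is the sketch below. Completely untested,
and none of these names exist today; iommu_set_device_table() and
friends would need to be split up accordingly:

struct dev_table_ops {
	void (*set_dev_table)(struct amd_iommu *iommu);
	void (*free_dev_table)(struct amd_iommu_pci_seg *pci_seg);
};

/*
 * kdump boot: the Device Table Base Address register is SNP-locked,
 * so programming it is a nop, and the "table" is a memunmap()able
 * remapping of the old kernel's table.
 */
static void set_dev_table_nop(struct amd_iommu *iommu)
{
}

static void free_dev_table_memunmap(struct amd_iommu_pci_seg *pci_seg)
{
	memunmap((void *)pci_seg->dev_table);
}

/* Normal boot: free the device table pages on teardown. */
static void free_dev_table_pages(struct amd_iommu_pci_seg *pci_seg)
{
	iommu_free_pages(pci_seg->dev_table);
}

static const struct dev_table_ops kdump_dev_table_ops = {
	.set_dev_table	= set_dev_table_nop,
	.free_dev_table	= free_dev_table_memunmap,
};

static const struct dev_table_ops default_dev_table_ops = {
	.set_dev_table	= iommu_set_device_table_hw, /* the MMIO write */
	.free_dev_table	= free_dev_table_pages,
};

static const struct dev_table_ops *dt_ops;

static void __init init_dev_table_ops(void)
{
	/* Decide once at init instead of sprinkling is_kdump_kernel() */
	dt_ops = is_kdump_kernel() ? &kdump_dev_table_ops
				   : &default_dev_table_ops;
}

Callers would then just do dt_ops->set_dev_table(iommu) and
dt_ops->free_dev_table(pci_seg) unconditionally.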
> +		return;
> +
>  	entry = iommu_virt_to_phys(dev_table);
>  	entry |= (dev_table_size >> 12) - 1;
>  	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
> @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
>  
>  static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
>  {
> -	iommu_free_pages(pci_seg->dev_table);
> +	if (is_kdump_kernel())
> +		memunmap((void *)pci_seg->dev_table);
> +	else
> +		iommu_free_pages(pci_seg->dev_table);
>  	pci_seg->dev_table = NULL;
>  }
>  
> @@ -1128,15 +1134,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
>  		dte->data[i] |= (1UL << _bit);
>  }
>  
> -static bool __copy_device_table(struct amd_iommu *iommu)
> +static bool __reuse_device_table(struct amd_iommu *iommu)
>  {
> -	u64 int_ctl, int_tab_len, entry = 0;
>  	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
> -	struct dev_table_entry *old_devtb = NULL;
> -	u32 lo, hi, devid, old_devtb_size;
> +	u32 lo, hi, old_devtb_size;
>  	phys_addr_t old_devtb_phys;
> -	u16 dom_id, dte_v, irq_v;
> -	u64 tmp;
> +	u64 entry;
>  
>  	/* Each IOMMU use separate device table with the same size */
>  	lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
> @@ -1161,66 +1164,22 @@ static bool __copy_device_table(struct amd_iommu *iommu)
>  		pr_err("The address of old device table is above 4G, not trustworthy!\n");
>  		return false;
>  	}
> -	old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
> -		    ? (__force void *)ioremap_encrypted(old_devtb_phys,
> -							pci_seg->dev_table_size)
> -		    : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
> -
> -	if (!old_devtb)
> -		return false;
>  
> -	pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz(
> -		GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size);
> +	/*
> +	 * IOMMU Device Table Base Address MMIO register is locked
> +	 * if SNP is enabled during kdump, reuse the previous kernel's
> +	 * device table.
> +	 */
> +	pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
>  	if (pci_seg->old_dev_tbl_cpy == NULL) {
> -		pr_err("Failed to allocate memory for copying old device table!\n");
> -		memunmap(old_devtb);
> +		pr_err("Failed to remap memory for reusing old device table!\n");
>  		return false;
>  	}
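For reference, iommu_memremap() is presumably the helper added earlier
in this series; I haven't checked its exact definition, but based on
the open-coded logic being removed above I'd expect it to be roughly:

/*
 * Sketch only, inferred from the code this patch deletes: when host
 * memory encryption (SME/SNP) is active, the previous kernel's device
 * table lives in encrypted memory and must be mapped encrypted;
 * otherwise a plain write-back mapping is enough.
 */
static void *iommu_memremap(phys_addr_t paddr, size_t size)
{
	if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
		return (__force void *)ioremap_encrypted(paddr, size);

	return memremap(paddr, size, MEMREMAP_WB);
}

The old code additionally tested is_kdump_kernel() before choosing
ioremap_encrypted(); since __reuse_device_table() now only runs on a
kdump boot, dropping that check looks fine to me.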
>  
> -	for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
> -		pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
> -		dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
> -		dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
> -
> -		if (dte_v && dom_id) {
> -			pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
> -			pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
> -			/* Reserve the Domain IDs used by previous kernel */
> -			if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) {
> -				pr_err("Failed to reserve domain ID 0x%x\n", dom_id);
> -				memunmap(old_devtb);
> -				return false;
> -			}
> -			/* If gcr3 table existed, mask it out */
> -			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
> -				tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
> -				pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
> -				tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
> -				pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
> -			}
> -		}
> -
> -		irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
> -		int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
> -		int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
> -		if (irq_v && (int_ctl || int_tab_len)) {
> -			if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
> -			    (int_tab_len != DTE_INTTABLEN_512 &&
> -			     int_tab_len != DTE_INTTABLEN_2K)) {
> -				pr_err("Wrong old irq remapping flag: %#x\n", devid);
> -				memunmap(old_devtb);
> -				return false;
> -			}
> -
> -			pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
> -		}
> -	}
> -	memunmap(old_devtb);
> -
>  	return true;
>  }
>  
> -static bool copy_device_table(void)
> +static bool reuse_device_table(void)
>  {
>  	struct amd_iommu *iommu;
>  	struct amd_iommu_pci_seg *pci_seg;
> @@ -1228,17 +1187,17 @@ static bool copy_device_table(void)
>  	if (!amd_iommu_pre_enabled)
>  		return false;
>  
> -	pr_warn("Translation is already enabled - trying to copy translation structures\n");
> +	pr_warn("Translation is already enabled - trying to reuse translation structures\n");
>  
>  	/*
>  	 * All IOMMUs within PCI segment shares common device table.
> -	 * Hence copy device table only once per PCI segment.
> +	 * Hence reuse device table only once per PCI segment.
>  	 */
>  	for_each_pci_segment(pci_seg) {
>  		for_each_iommu(iommu) {
>  			if (pci_seg->id != iommu->pci_seg->id)
>  				continue;
> -			if (!__copy_device_table(iommu))
> +			if (!__reuse_device_table(iommu))
>  				return false;
>  			break;
>  		}
>
> @@ -2917,8 +2876,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
>   * This function finally enables all IOMMUs found in the system after
>   * they have been initialized.
>   *
> - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
> - * the old content of device table entries. Not this case or copy failed,
> + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
> + * the old content of device table entries. Not this case or reuse failed,
>   * just continue as normal kernel does.
>   */
>  static void early_enable_iommus(void)
> @@ -2926,18 +2885,18 @@ static void early_enable_iommus(void)
>  	struct amd_iommu *iommu;
>  	struct amd_iommu_pci_seg *pci_seg;
>  
> -	if (!copy_device_table()) {
> +	if (!reuse_device_table()) {

Hmmm. What happens if SNP is enabled and reuse_device_table() couldn't
set up the previous DTEs? In the non-SNP case this works fine, since
we can rebuild new DTEs. But in the SNP case we should fail the kdump
boot, right?
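Something along these lines is what I mean — purely illustrative, and
whether a hard panic() or just failing IOMMU init is the right
response is up for discussion:

	if (!reuse_device_table()) {
		/*
		 * With SNP enforcement active the Device Table Base
		 * Address register is locked, so we cannot fall back
		 * to building a fresh device table. Failing loudly
		 * here beats the Completion-Wait timeout panic later.
		 */
		if (amd_iommu_snp_en)
			panic("kdump: SNP-locked device table could not be reused\n");

		/* ... existing non-SNP fallback: rebuild fresh DTEs ... */
	}

-Vasant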