Hello Vasant,

On 7/16/2025 4:42 AM, Vasant Hegde wrote:
> 
> 
> On 7/16/2025 12:57 AM, Ashish Kalra wrote:
>> From: Ashish Kalra <ashish.kalra@xxxxxxx>
>>
>> After a panic, if SNP was enabled in the previous kernel, the kdump
>> kernel boots with IOMMU SNP enforcement still enabled.
>>
>> The IOMMU device table register is locked and exclusive to the
>> previous kernel. Attempts to copy the old device table from the
>> previous kernel fail in the kdump kernel, as the hardware ignores
>> writes to the locked device table base address register per AMD
>> IOMMU spec Section 2.12.2.1.
>>
>> This results in repeated "Completion-Wait loop timed out" errors and a
>> second kernel panic: "Kernel panic - not syncing: timer doesn't work
>> through Interrupt-remapped IO-APIC".
>>
>> Reuse the device table instead of copying it in the case of kdump
>> boot, and remove all device table copying code.
>>
>> Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
>> ---
>>  drivers/iommu/amd/init.c | 97 ++++++++++++----------------------
>>  1 file changed, 28 insertions(+), 69 deletions(-)
>>
>> diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
>> index 32295f26be1b..18bd869a82d9 100644
>> --- a/drivers/iommu/amd/init.c
>> +++ b/drivers/iommu/amd/init.c
>> @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
>>  
>>  	BUG_ON(iommu->mmio_base == NULL);
>>  
>> +	if (is_kdump_kernel())
> 
> This is fine.. but it's becoming too many places with the kdump check! I
> don't know what the better way is here.
> Is it worth keeping it like this -OR- adding, say, iommu ops, so that
> during init we check is_kdump_kernel() and adjust the ops?
> 
> @Joerg, any preference?
> 
> 
>> +		return;
>> +
>>  	entry = iommu_virt_to_phys(dev_table);
>>  	entry |= (dev_table_size >> 12) - 1;
>>  	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
>> @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>  
>>  static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>  {
>> -	iommu_free_pages(pci_seg->dev_table);
>> +	if (is_kdump_kernel())
>> +		memunmap((void *)pci_seg->dev_table);
>> +	else
>> +		iommu_free_pages(pci_seg->dev_table);
>>  	pci_seg->dev_table = NULL;
>>  }
>>  
>> @@ -1128,15 +1134,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
>>  		dte->data[i] |= (1UL << _bit);
>>  }
>>  
>> -static bool __copy_device_table(struct amd_iommu *iommu)
>> +static bool __reuse_device_table(struct amd_iommu *iommu)
>>  {
>> -	u64 int_ctl, int_tab_len, entry = 0;
>>  	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
>> -	struct dev_table_entry *old_devtb = NULL;
>> -	u32 lo, hi, devid, old_devtb_size;
>> +	u32 lo, hi, old_devtb_size;
>>  	phys_addr_t old_devtb_phys;
>> -	u16 dom_id, dte_v, irq_v;
>> -	u64 tmp;
>> +	u64 entry;
>>  
>>  	/* Each IOMMU use separate device table with the same size */
>>  	lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
>> @@ -1161,66 +1164,22 @@ static bool __copy_device_table(struct amd_iommu *iommu)
>>  		pr_err("The address of old device table is above 4G, not trustworthy!\n");
>>  		return false;
>>  	}
>> -	old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
>> -		    ? (__force void *)ioremap_encrypted(old_devtb_phys,
>> -							pci_seg->dev_table_size)
>> -		    : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
>> -
>> -	if (!old_devtb)
>> -		return false;
>>  
>> -	pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz(
>> -		GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size);
>> +	/*
>> +	 * IOMMU Device Table Base Address MMIO register is locked
>> +	 * if SNP is enabled during kdump, reuse the previous kernel's
>> +	 * device table.
>> +	 */
>> +	pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
>>  	if (pci_seg->old_dev_tbl_cpy == NULL) {
>> -		pr_err("Failed to allocate memory for copying old device table!\n");
>> -		memunmap(old_devtb);
>> +		pr_err("Failed to remap memory for reusing old device table!\n");
>>  		return false;
>>  	}
>>  
>> -	for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
>> -		pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
>> -		dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
>> -		dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
>> -
>> -		if (dte_v && dom_id) {
>> -			pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
>> -			pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
>> -			/* Reserve the Domain IDs used by previous kernel */
>> -			if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) {
>> -				pr_err("Failed to reserve domain ID 0x%x\n", dom_id);
>> -				memunmap(old_devtb);
>> -				return false;
>> -			}
>> -			/* If gcr3 table existed, mask it out */
>> -			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
>> -				tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
>> -				pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
>> -				tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
>> -				pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
>> -			}
>> -		}
>> -
>> -		irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
>> -		int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
>> -		int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
>> -		if (irq_v && (int_ctl || int_tab_len)) {
>> -			if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
>> -			    (int_tab_len != DTE_INTTABLEN_512 &&
>> -			     int_tab_len != DTE_INTTABLEN_2K)) {
>> -				pr_err("Wrong old irq remapping flag: %#x\n", devid);
>> -				memunmap(old_devtb);
>> -				return false;
>> -			}
>> -
>> -			pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
>> -		}
>> -	}
>> -	memunmap(old_devtb);
>> -
>>  	return true;
>>  }
>>  
>> -static bool copy_device_table(void)
>> +static bool reuse_device_table(void)
>>  {
>>  	struct amd_iommu *iommu;
>>  	struct amd_iommu_pci_seg *pci_seg;
>> @@ -1228,17 +1187,17 @@ static bool copy_device_table(void)
>>  	if (!amd_iommu_pre_enabled)
>>  		return false;
>>  
>> -	pr_warn("Translation is already enabled - trying to copy translation structures\n");
>> +	pr_warn("Translation is already enabled - trying to reuse translation structures\n");
>>  
>>  	/*
>>  	 * All IOMMUs within PCI segment shares common device table.
>> -	 * Hence copy device table only once per PCI segment.
>> +	 * Hence reuse device table only once per PCI segment.
>>  	 */
>>  	for_each_pci_segment(pci_seg) {
>>  		for_each_iommu(iommu) {
>>  			if (pci_seg->id != iommu->pci_seg->id)
>>  				continue;
>> -			if (!__copy_device_table(iommu))
>> +			if (!__reuse_device_table(iommu))
>>  				return false;
>>  			break;
>>  		}
>> @@ -2917,8 +2876,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
>>   * This function finally enables all IOMMUs found in the system after
>>   * they have been initialized.
>>   *
>> - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
>> - * the old content of device table entries. Not this case or copy failed,
>> + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
>> + * the old content of device table entries. Not this case or reuse failed,
>>   * just continue as normal kernel does.
>>   */
>>  static void early_enable_iommus(void)
>> @@ -2926,18 +2885,18 @@ static void early_enable_iommus(void)
>>  	struct amd_iommu *iommu;
>>  	struct amd_iommu_pci_seg *pci_seg;
>>  
>> -	if (!copy_device_table()) {
>> +	if (!reuse_device_table()) {
> 
> Hmmm. What happens if SNP is enabled and reuse_device_table() couldn't
> set up the previous DTE?
> In the non-SNP case it works fine, as we can rebuild a new DTE. But in
> the SNP case we should fail the kdump, right?
> 

That will happen automatically: if we can't set up the previous DTE in
the SNP case, IOMMU commands will time out and subsequently cause a
panic, as IRQ remapping won't be set up. So this is as good as failing
the kdump; the end result is the same.

Thanks,
Ashish
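
P.S. If we ever wanted to make that failure explicit instead of relying
on the completion-wait timeout, I imagine a small addition to
early_enable_iommus() along the lines of the untested sketch below (it
assumes the existing amd_iommu_snp_en flag and the reuse_device_table()
from this patch):

	if (!reuse_device_table()) {
		/*
		 * With SNP enforcement active the device table base
		 * register stays locked, so a freshly built table can
		 * never take effect; fail the kdump boot up front
		 * rather than panicking later on command timeouts.
		 */
		if (amd_iommu_snp_en)
			panic("SNP: unable to reuse old device table in kdump kernel\n");

		/* Non-SNP: fall back to freshly built translation structures. */
		for_each_iommu(iommu)
			early_enable_iommu(iommu);
	}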
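
P.P.S. On your first question about the growing number of
is_kdump_kernel() checks: if we went the ops route, I picture something
like the sketch below (purely hypothetical names, only to make the idea
concrete):

	/* Hypothetical: per-boot-mode device table callbacks. */
	struct dev_table_ops {
		void (*set_dev_table)(struct amd_iommu *iommu);
		void (*free_dev_table)(struct amd_iommu_pci_seg *pci_seg);
	};

	static const struct dev_table_ops normal_dev_table_ops = {
		.set_dev_table	= iommu_set_device_table_hw,	/* programs the MMIO register */
		.free_dev_table	= free_dev_table_pages,		/* iommu_free_pages() */
	};

	static const struct dev_table_ops kdump_dev_table_ops = {
		.set_dev_table	= iommu_set_device_table_nop,	/* register is locked */
		.free_dev_table	= free_dev_table_unmap,		/* memunmap() */
	};

	/* Chosen once during early init; no scattered checks afterwards: */
	dt_ops = is_kdump_kernel() ? &kdump_dev_table_ops : &normal_dev_table_ops;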