On Mon, Jun 16, 2025 at 03:06:06PM -0300, Jason Gunthorpe wrote:
> The existing IOMMU page table implementations duplicate all of the working
> algorithms for each format. By using the generic page table API a single C
> version of the IOMMU algorithms can be created and re-used for all of the
> different formats used in the drivers. The implementation will provide a
> single C version of the iommu domain operations: iova_to_phys, map, unmap,
> and read_and_clear_dirty.
>
> Further, adding new algorithms and techniques becomes easy to do across
> the entire fleet of drivers and formats.
>
> The C functions are drop-in compatible with the existing iommu_domain_ops
> using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation
> compilation unit will produce exported symbols following the pattern
> pt_iommu_FMT_map_pages(), which the macro directly maps to the
> iommu_domain_ops members. This avoids the additional function pointer
> indirection that io-pgtable has.
>
> The top level struct used by the drivers is pt_iommu_table_FMT. It
> contains the other structs to allow container_of() to move between the
> driver, iommu page table, generic page table, and generic format layers:
>
>   struct pt_iommu_table_amdv1 {
>           struct pt_iommu {
>                   struct iommu_domain domain;
>           } iommu;
>           struct pt_amdv1 {
>                   struct pt_common {
>                   } common;
>           } amdpt;
>   };
>
> The driver is expected to union the pt_iommu_table_FMT with its own
> existing domain struct:
>
>   struct driver_domain {
>           union {
>                   struct iommu_domain domain;
>                   struct pt_iommu_table_amdv1 amdv1;
>           };
>   };
>   PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1, domain);
>
> This creates an alias and avoids renaming 'domain' in a lot of driver
> code.
>
> This allows all the layers to access all the necessary functions to
> implement their different roles with no change to any of the existing
> iommu core code.
>
> Implement the basic starting point: pt_iommu_init(), get_info() and
> deinit().
>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez@xxxxxxxxxx>
> Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx>

Tested the patch series with multiple command-line configurations for the
AMD IOMMU on a Zen5 machine with 12 NVMe devices.

Suggested-by: Ankit Soni <Ankit.Soni@xxxxxxx>
Tested-by: Ankit Soni <Ankit.Soni@xxxxxxx>
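For anyone else reading along: the union/container_of layering means a
driver can get from the generic layer back to its own struct with no extra
pointers stored anywhere. A minimal sketch, assuming the hypothetical
driver_domain from the commit message above:

  static struct driver_domain *to_driver_domain(struct pt_iommu *iommu_table)
  {
          /*
           * pt_iommu places the iommu_domain at offset 0, so the union in
           * driver_domain makes all the layered views alias the same memory.
           */
          return container_of(&iommu_table->domain, struct driver_domain,
                              domain);
  }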
> ---
>  drivers/iommu/generic_pt/Kconfig              |  13 +
>  drivers/iommu/generic_pt/fmt/iommu_template.h |  39 +++
>  drivers/iommu/generic_pt/iommu_pt.h           | 268 ++++++++++++++++++
>  include/linux/generic_pt/iommu.h              | 118 ++++++++
>  4 files changed, 438 insertions(+)
>  create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
>  create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
>  create mode 100644 include/linux/generic_pt/iommu.h
>
> diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
> index 775a3afb563f72..73b7a54375f9bd 100644
> --- a/drivers/iommu/generic_pt/Kconfig
> +++ b/drivers/iommu/generic_pt/Kconfig
> @@ -19,4 +19,17 @@ config DEBUG_GENERIC_PT
>            kernels.
>
>            The kunit tests require this to be enabled to get full coverage.
> +
> +config IOMMU_PT
> +        tristate "IOMMU Page Tables"
> +        select IOMMU_API
> +        depends on IOMMU_SUPPORT
> +        depends on GENERIC_PT
> +        default n
> +        help
> +          Generic library for building IOMMU page tables
> +
> +          IOMMU_PT provides an implementation of the page table operations
> +          related to struct iommu_domain, using GENERIC_PT to abstract the
> +          page table format.
>  endif
> diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
> new file mode 100644
> index 00000000000000..5b631bc07cbc16
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
> @@ -0,0 +1,39 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * Template to build the iommu module and kunit from the format and
> + * implementation headers.
> + *
> + * The format should have:
> + *  #define PT_FMT <name>
> + *  #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
> + * And optionally:
> + *  #define PT_FORCE_ENABLED_FEATURES ..
> + *  #define PT_FMT_VARIANT <suffix>
> + */
> +#include <linux/args.h>
> +#include <linux/stringify.h>
> +
> +#ifdef PT_FMT_VARIANT
> +#define PTPFX_RAW \
> +        CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
> +#else
> +#define PTPFX_RAW PT_FMT
> +#endif
> +
> +#define PTPFX CONCATENATE(PTPFX_RAW, _)
> +
> +#define _PT_FMT_H PT_FMT.h
> +#define PT_FMT_H __stringify(_PT_FMT_H)
> +
> +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
> +#define PT_DEFS_H __stringify(_PT_DEFS_H)
> +
> +#include <linux/generic_pt/common.h>
> +#include PT_DEFS_H
> +#include "../pt_defs.h"
> +#include PT_FMT_H
> +#include "../pt_common.h"
> +
> +#include "../iommu_pt.h"
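The template pattern stays compact at the point of use. A sketch of what a
per-format compilation unit might look like (illustrative only; the real
format files arrive later in the series):

  /* hypothetical drivers/iommu/generic_pt/fmt/iommu_amdv1.c */
  #define PT_FMT amdv1
  #define PT_SUPPORTED_FEATURES BIT(PT_FEAT_DYNAMIC_TOP)
  #include "iommu_template.h"

With those defines, PT_DEFS_H resolves to "defs_amdv1.h", PT_FMT_H to
"amdv1.h", and the DOMAIN_NS() names in iommu_pt.h below come out as
pt_iommu_amdv1_*.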
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> new file mode 100644
> index 00000000000000..205c232bda68b5
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -0,0 +1,268 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * "Templated C code" for implementing the iommu operations for page tables.
> + * This is compiled multiple times, over all the page table formats to pick up
> + * the per-format definitions.
> + */
> +#ifndef __GENERIC_PT_IOMMU_PT_H
> +#define __GENERIC_PT_IOMMU_PT_H
> +
> +#include "pt_iter.h"
> +
> +#include <linux/iommu.h>
> +#include "../iommu-pages.h"
> +#include <linux/export.h>
> +
> +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
> +
> +struct pt_iommu_collect_args {
> +        struct iommu_pages_list free_list;
> +        u8 ignore_mapped : 1;
> +};
> +
> +static int __collect_tables(struct pt_range *range, void *arg,
> +                            unsigned int level, struct pt_table_p *table)
> +{
> +        struct pt_state pts = pt_init(range, level, table);
> +        struct pt_iommu_collect_args *collect = arg;
> +        int ret;
> +
> +        if (collect->ignore_mapped && !pt_can_have_table(&pts))
> +                return 0;
> +
> +        for_each_pt_level_entry(&pts) {
> +                if (pts.type == PT_ENTRY_TABLE) {
> +                        iommu_pages_list_add(&collect->free_list, pts.table_lower);
> +                        ret = pt_descend(&pts, arg, __collect_tables);
> +                        if (ret)
> +                                return ret;
> +                        continue;
> +                }
> +                if (pts.type == PT_ENTRY_OA && !collect->ignore_mapped)
> +                        return -EADDRINUSE;
> +        }
> +        return 0;
> +}
> +
> +static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
> +                                                 uintptr_t top_of_table,
> +                                                 gfp_t gfp)
> +{
> +        struct pt_iommu *iommu_table = iommu_from_common(common);
> +
> +        /*
> +         * The top does not need the free list or the other iommu-pages
> +         * machinery, so it technically does not have to use iommu pages.
> +         * Use the API anyhow to keep things simple, as the top is usually
> +         * not smaller than PAGE_SIZE.
> +         */
> +        return iommu_alloc_pages_node_sz(
> +                iommu_table->nid, gfp,
> +                log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
> +}
> +
> +static void NS(get_info)(struct pt_iommu *iommu_table,
> +                         struct pt_iommu_info *info)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range range = pt_top_range(common);
> +        struct pt_state pts = pt_init_top(&range);
> +        pt_vaddr_t pgsize_bitmap = 0;
> +
> +        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
> +                for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
> +                     pts.level++) {
> +                        if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
> +                                break;
> +                        pgsize_bitmap |= pt_possible_sizes(&pts);
> +                }
> +        } else {
> +                for (pts.level = 0; pts.level <= range.top_level; pts.level++)
> +                        pgsize_bitmap |= pt_possible_sizes(&pts);
> +        }
> +
> +        /* Hide page sizes larger than the maximum OA */
> +        info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
> +}
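To make get_info() concrete, here is roughly how a caller would use it
(sizes illustrative, not from this patch; SZ_* is <linux/sizes.h>):

  struct pt_iommu_info info;

  iommu_table->ops->get_info(iommu_table, &info);
  /*
   * A format with 4K leaves plus 2M and 1G block entries would report
   * info.pgsize_bitmap == SZ_4K | SZ_2M | SZ_1G, minus any sizes that
   * exceed max_oasz_lg2.
   */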
> +static void NS(deinit)(struct pt_iommu *iommu_table)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range range = pt_all_range(common);
> +        struct pt_iommu_collect_args collect = {
> +                .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
> +                .ignore_mapped = true,
> +        };
> +
> +        iommu_pages_list_add(&collect.free_list, range.top_table);
> +        pt_walk_range(&range, __collect_tables, &collect);
> +
> +        /*
> +         * The driver has to already have fenced the HW access to the page
> +         * table and invalidated any caching referring to this memory.
> +         */
> +        iommu_put_pages_list(&collect.free_list);
> +}
> +
> +static const struct pt_iommu_ops NS(ops) = {
> +        .get_info = NS(get_info),
> +        .deinit = NS(deinit),
> +};
> +
> +static int pt_init_common(struct pt_common *common)
> +{
> +        struct pt_range top_range = pt_top_range(common);
> +
> +        if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
> +                return -EINVAL;
> +
> +        if (top_range.top_level == PT_MAX_TOP_LEVEL ||
> +            common->max_vasz_lg2 == top_range.max_vasz_lg2)
> +                common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
> +
> +        if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
> +                common->features |= BIT(PT_FEAT_FULL_VA);
> +
> +        /* Requested features must match features compiled into this format */
> +        if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
> +            (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
> +             (common->features & PT_FORCE_ENABLED_FEATURES) !=
> +                     PT_FORCE_ENABLED_FEATURES))
> +                return -EOPNOTSUPP;
> +
> +        if (common->max_oasz_lg2 == 0)
> +                common->max_oasz_lg2 = pt_max_output_address_lg2(common);
> +        else
> +                common->max_oasz_lg2 = min(common->max_oasz_lg2,
> +                                           pt_max_output_address_lg2(common));
> +        return 0;
> +}
> +
> +static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
> +                                struct iommu_domain *domain)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_iommu_info info;
> +        struct pt_range range;
> +
> +        NS(get_info)(iommu_table, &info);
> +
> +        domain->type = __IOMMU_DOMAIN_PAGING;
> +        domain->pgsize_bitmap = info.pgsize_bitmap;
> +
> +        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
> +                range = _pt_top_range(common,
> +                                      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
> +        else
> +                range = pt_top_range(common);
> +
> +        /* A 64 bit high address space table on a 32 bit system cannot work. */
> +        domain->geometry.aperture_start = (unsigned long)range.va;
> +        if ((pt_vaddr_t)domain->geometry.aperture_start != range.va ||
> +            range.va > ULONG_MAX)
> +                return -EOVERFLOW;
> +
> +        /*
> +         * The aperture is limited to what the API can do after considering all
> +         * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
> +         * to store a VA. Set the aperture to something that is valid for all
> +         * cases. Saturate instead of truncate the end if the types are smaller
> +         * than the top range. aperture_end is a last (inclusive) address.
> +         */
> +        domain->geometry.aperture_end = (unsigned long)range.last_va;
> +        if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
> +                domain->geometry.aperture_end = ULONG_MAX;
> +                domain->pgsize_bitmap &= ULONG_MAX;
> +        }
> +        domain->geometry.force_aperture = true;
> +
> +        return 0;
> +}
> +
> +static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_iommu cfg = *iommu_table;
> +
> +        static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
> +        memset_after(fmt_table, 0, iommu.domain);
> +
> +        /* The caller can initialize some of these values */
> +        iommu_table->nid = cfg.nid;
> +}
> +
> +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
> +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
> +int pt_iommu_init(struct pt_iommu_table *fmt_table,
> +                  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_table_p *table_mem;
> +        int ret;
> +
> +        if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
> +            !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
> +                return -EINVAL;
> +
> +        pt_iommu_zero(fmt_table);
> +        common->features = cfg->common.features;
> +        common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
> +        common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
> +#ifdef PT_FIXED_TOP_LEVEL
> +        pt_top_set_level(common, PT_FIXED_TOP_LEVEL);
> +#endif
> +        ret = pt_iommu_fmt_init(fmt_table, cfg);
> +        if (ret)
> +                return ret;
> +
> +        if (cfg->common.hw_max_oasz_lg2 > pt_max_output_address_lg2(common))
> +                return -EINVAL;
> +
> +        ret = pt_init_common(common);
> +        if (ret)
> +                return ret;
> +
> +        if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
> +            (pt_feature(common, PT_FEAT_FULL_VA) ||
> +             pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
> +                return -EINVAL;
> +
> +        ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
> +        if (ret)
> +                return ret;
> +
> +        table_mem = table_alloc_top(common, common->top_of_table, gfp);
> +        if (IS_ERR(table_mem))
> +                return PTR_ERR(table_mem);
> +        pt_top_set(common, table_mem, pt_top_get_level(common));
> +
> +        /* Must be last, see pt_iommu_deinit() */
> +        iommu_table->ops = &NS(ops);
> +        return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
> +
> +#ifdef pt_iommu_fmt_hw_info
> +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
> +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
> +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
> +                      struct pt_iommu_table_hw_info *info)
> +{
> +        struct pt_iommu *iommu_table = &fmt_table->iommu;
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +        struct pt_range top_range = pt_top_range(common);
> +
> +        pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
> +}
> +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
> +#endif
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("IOMMU Pagetable implementation for " __stringify(PTPFX_RAW));
> +MODULE_IMPORT_NS("GENERIC_PT");
> +
> +#endif
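The init/deinit contract is convenient on the driver side. A rough sketch
of the expected call sequence (names hypothetical; the exact cfg struct
tag comes from the pt_iommu_table_cfg macro above):

  struct pt_iommu_table_amdv1 *pt = &drvdom->amdv1;
  struct pt_iommu_table_amdv1_cfg cfg = {
          .common.features = BIT(PT_FEAT_DYNAMIC_TOP),
          .common.hw_max_vasz_lg2 = 48,
          .common.hw_max_oasz_lg2 = 52,
  };
  int ret;

  pt->iommu.nid = dev_to_node(dev);  /* optional; survives the zeroing */
  ret = pt_iommu_amdv1_init(pt, &cfg, GFP_KERNEL);
  if (ret)
          return ret;
  ...
  pt_iommu_deinit(&pt->iommu);       /* safe even if init was never reached */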
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> new file mode 100644
> index 00000000000000..9d2152bc64c0d6
> --- /dev/null
> +++ b/include/linux/generic_pt/iommu.h
> @@ -0,0 +1,118 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + */
> +#ifndef __GENERIC_PT_IOMMU_H
> +#define __GENERIC_PT_IOMMU_H
> +
> +#include <linux/generic_pt/common.h>
> +#include <linux/iommu.h>
> +#include <linux/mm_types.h>
> +
> +struct pt_iommu_ops;
> +
> +/**
> + * DOC: IOMMU Radix Page Table
> + *
> + * The iommu implementation of the Generic Page Table provides an ops struct
> + * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
> + * the generic map/unmap interface.
> + *
> + * This interface uses a caller provided locking approach. The caller must
> + * have a VA range lock concept that prevents concurrent threads from calling
> + * ops on the same VA. Generally the range lock must be at least as large as
> + * a single map call.
> + */
> +
> +/**
> + * struct pt_iommu - Base structure for iommu page tables
> + *
> + * The format specific struct will include this as the first member.
> + */
> +struct pt_iommu {
> +        /**
> +         * @domain - The core iommu domain. The driver should use a union to
> +         * overlay this memory with its previously existing domain struct to
> +         * create an alias.
> +         */
> +        struct iommu_domain domain;
> +
> +        /**
> +         * @ops - Function pointers to access the API
> +         */
> +        const struct pt_iommu_ops *ops;
> +
> +        /**
> +         * @nid - Node ID to use for table memory allocations. The iommu
> +         * driver may want to set the NID to the device's NID, if there are
> +         * multiple table walkers.
> +         */
> +        int nid;
> +};
> +
> +/**
> + * struct pt_iommu_info - Details about the iommu page table
> + *
> + * Returned from pt_iommu_ops->get_info()
> + */
> +struct pt_iommu_info {
> +        /**
> +         * @pgsize_bitmap - A bitmask where each set bit indicates
> +         * a page size that can be natively stored in the page table.
> +         */
> +        u64 pgsize_bitmap;
> +};
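The caller-provided locking model in the DOC comment above is worth a
concrete illustration. Something like this is what a driver must
guarantee (driver_range_lock()/unlock() are hypothetical stand-ins for
whatever VA-range exclusion the driver already has; the map operation
itself only appears later in the series):

  driver_range_lock(drvdom, iova, size);
  ret = pt_iommu_amdv1_map_pages(...);   /* exported per-format symbol */
  driver_range_unlock(drvdom, iova, size);

Two threads may run this concurrently only when their [iova, iova + size)
ranges do not overlap.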
> +struct pt_iommu_ops {
> +        /**
> +         * get_info() - Return the pt_iommu_info structure
> +         * @iommu_table: Table to query
> +         *
> +         * Return some basic static information about the page table.
> +         */
> +        void (*get_info)(struct pt_iommu *iommu_table,
> +                         struct pt_iommu_info *info);
> +
> +        /**
> +         * deinit() - Undo a format specific init operation
> +         * @iommu_table: Table to destroy
> +         *
> +         * Release all of the memory. The caller must have already removed
> +         * the table from all HW access and all caches.
> +         */
> +        void (*deinit)(struct pt_iommu *iommu_table);
> +};
> +
> +static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
> +{
> +        /*
> +         * It is safe to call pt_iommu_deinit() before an init, or if init
> +         * fails. The ops pointer will only become non-NULL if deinit needs
> +         * to be run.
> +         */
> +        if (iommu_table->ops)
> +                iommu_table->ops->deinit(iommu_table);
> +}
> +
> +/**
> + * struct pt_iommu_cfg - Common configuration values for all formats
> + */
> +struct pt_iommu_cfg {
> +        /**
> +         * @features - Features required. Only these features will be turned
> +         * on. The feature list should reflect what the IOMMU HW is capable
> +         * of.
> +         */
> +        unsigned int features;
> +        /**
> +         * @hw_max_vasz_lg2 - Maximum VA the IOMMU HW can support. This
> +         * implies the top level of the table.
> +         */
> +        u8 hw_max_vasz_lg2;
> +        /**
> +         * @hw_max_oasz_lg2 - Maximum OA the IOMMU HW can support. The format
> +         * might select a lower maximum OA.
> +         */
> +        u8 hw_max_oasz_lg2;
> +};
> +
> +#endif
> --
> 2.43.0
>