On 8/26/25 10:18 AM, Jason Gunthorpe wrote: > Tested-by: Alejandro Jimenez <alejandro.j.jimenez@xxxxxxxxxx> > Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx> > --- > .clang-format | 1 + > drivers/iommu/Kconfig | 2 + > drivers/iommu/generic_pt/Kconfig | 20 + > drivers/iommu/generic_pt/pt_common.h | 354 ++++++++++++ > drivers/iommu/generic_pt/pt_defs.h | 323 +++++++++++ > drivers/iommu/generic_pt/pt_fmt_defaults.h | 193 +++++++ > drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++++++++++++++ > drivers/iommu/generic_pt/pt_log2.h | 130 +++++ > include/linux/generic_pt/common.h | 134 +++++ > 9 files changed, 1793 insertions(+) > create mode 100644 drivers/iommu/generic_pt/Kconfig > create mode 100644 drivers/iommu/generic_pt/pt_common.h > create mode 100644 drivers/iommu/generic_pt/pt_defs.h > create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h > create mode 100644 drivers/iommu/generic_pt/pt_iter.h > create mode 100644 drivers/iommu/generic_pt/pt_log2.h > create mode 100644 include/linux/generic_pt/common.h > > diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h > new file mode 100644 > index 00000000000000..5ed06104d38b45 > --- /dev/null > +++ b/drivers/iommu/generic_pt/pt_common.h > @@ -0,0 +1,354 @@ [snip] > +/** > + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry > + * @pts: Entry to query > + * > + * Returns the number of contiguous items this leaf entry spans. If the entry is * Returns: > + * single item it returns ilog2(1). > + */ > +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); > + > +/** > + * pt_entry_oa() - Output Address for this leaf entry > + * @pts: Entry to query > + * > + * Return the output address for the start of the entry. If the entry * Return: or * Returns: (usually last in the kernel-doc comment block) > + * is contigous this returns the same value for each sub-item. 
Ie:: contiguous I.e.:: > + * > + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 > + * > + * See pt_item_oa(). The format should implement one of these two functions > + * depending on how it stores the OA's in the table. or OAs > + */ > +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); > + > +/** > + * pt_entry_oa_lg2sz() - Return the size of a OA entry > + * @pts: Entry to query > + * > + * If the entry is not contigous this returns pt_table_item_lg2sz(), otherwise contiguous > + * it returns the total VA/OA size of the entire contiguous entry. Modify to use * Returns: ? > + */ > +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) > +{ > + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); > +} > + > +/** > + * pt_entry_oa_full() - Return the full OA for an entry > + * @pts: Entry to query > + * > + * During iteration the first entry could have a VA with an offset from the > + * natural start of the entry. Return the true full OA considering the pts's VA > + * offset. * * Returns: the true full OA considering the pts's VA offset > + */ > +static inline pt_oaddr_t pt_entry_oa_full(const struct pt_state *pts) > +{ > + return _pt_entry_oa_fast(pts) | > + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); > +} > + > +/** > + * pt_entry_set_write_clean() - Make the entry write clean > + * @pts: Table index to change > + * > + * Modify the entry so that pt_entry_write_is_dirty() == false. The HW will > + * eventually be notified of this change via a TLB flush, which is the point > + * that the HW must become synchronized. Any "write dirty" prior to the TLB > + * flush can be lost, but once the TLB flush completes all writes must make > + * their entries write dirty. > + * > + * The format should alter the entry in a way that is compatible with any > + * concurrent update from HW. The entire contiguous entry is changed. 
> + */ > +static inline void pt_entry_set_write_clean(struct pt_state *pts); > + > +/** > + * pt_entry_write_is_dirty() - True if the entry has been written to > + * @pts: Entry to query > + * > + * "write dirty" means that the HW has written to the OA translated > + * by this entry. If the entry is contiguous then the consolidated > + * "write dirty" for all the items must be returned. > + */ > +static inline bool pt_entry_write_is_dirty(const struct pt_state *pts); > + > +/** > + * pt_full_va_prefix() - The top bits of the VA > + * @common: Page table to query > + * > + * This is usually 0, but some formats have their VA space going downward from > + * PT_VADDR_MAX, and will return that instead. This value must always be > + * adjusted by struct pt_common max_vasz_lg2. > + */ > +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); > + > +/** > + * pt_has_system_page() - True if level 0 can install a PAGE_SHIFT entry > + * @common: Page table to query > + * > + * If true the caller use at level 0 pt_install_leaf_entry(PAGE_SHIFT). This is uses ? although it might just be missing a word or two? I can't tell. > + * useful to create optimized paths for common cases of PAGE_SIZE mappings. > + */ > +static inline bool pt_has_system_page(const struct pt_common *common); > + > +/** > + * pt_install_leaf_entry() - Write a leaf entry to the table > + * @pts: Table index to change > + * @oa: Output Address for this leaf > + * @oasz_lg2: Size in VA for this leaf > + * @attrs: Attributes to modify the entry > + * > + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates > + * the VA indicated by pts to the given OA. > + * > + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). > + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. > + * > + * This must not be called if pt_can_have_leaf() == false. Contigous sizes Contiguous > + * not indicated by pt_possible_sizes() must not be specified. 
> + */ > +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, > + unsigned int oasz_lg2, > + const struct pt_write_attrs *attrs); > + > +/** > + * pt_install_table() - Write a table entry to the table > + * @pts: Table index to change > + * @table_pa: CPU physical address of the lower table's memory > + * @attrs: Attributes to modify the table index > + * > + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa > + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the > + * current entry loaded. The pts is updated with the installed entry. > + * > + * This must not be called if pt_can_have_table() == false. > + * > + * Returns true if the table was installed successfully. * Returns: > + */ > +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, > + const struct pt_write_attrs *attrs); > + > +/** > + * pt_item_oa() - Output Address for this leaf item > + * @pts: Item to query > + * > + * Return the output address for this item. If the item is part of a contiguous * Return: > + * entry it returns the value of the OA for this individual sub item. > + * > + * See pt_entry_oa(). The format should implement one of these two functions > + * depending on how it stores the OA's in the table. > + */ > +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); > + > +/** > + * pt_load_entry_raw() - Read from the location pts points at into the pts > + * @pts: Table index to load > + * > + * Return the type of entry that was loaded. pts->entry will be filled in with * Return: > + * the entry's content. See pt_load_entry() > + */ > +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); > + > +/** > + * pt_max_output_address_lg2() - Return the maximum OA the table format can hold > + * @common: Page table to query > + * > + * The value oalog2_to_max_int(pt_max_output_address_lg2()) is the MAX for the > + * OA. 
This is the absolute maximum address the table can hold. struct pt_common > + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. > + */ > +static inline unsigned int > +pt_max_output_address_lg2(const struct pt_common *common); > + > +/** > + * pt_num_items_lg2() - Return the number of items in this table level > + * @pts: The current level > + * > + * The number of items in a table level defines the number of bits this level > + * decodes from the VA. This function is not called for the top level, > + * so it does not need to compute a special value for the top case. The > + * result for the top is based on pt_common max_vasz_lg2. > + * > + * The value is used as part if determining the table indexes via the part of ? > + * equation:: > + * > + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) > + */ > +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); > + > +/** > + * pt_pgsz_lg2_to_level - Return the level that maps the page size > + * @common: Page table to query > + * @pgsize_lg2: Log2 page size > + * > + * Returns the table level that will map the given page size. The page * Returns: > + * size must be part of the pt_possible_sizes() for some level. > + */ > +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, > + unsigned int pgsize_lg2); > + > +/** > + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level > + * @pts: The current level > + * > + * Each level has a list of possible output sizes that can be installed as > + * leaf entries. If pt_can_have_leaf() is false returns zero. > + * > + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating > + * that a non-contigous singe item leaf entry is supported. The following non-contiguous Also, is that single ? or is "singe" a real word here? (IDK.) > + * pt_num_items_lg2() number of bits can be set indicating contiguous entries > + * are supported. 
Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be > + * set, contiguous entries cannot span the entire table. > + * > + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all > + * supported sizes in the entire table. > + */ > +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); > + > +/** > + * pt_table_item_lg2sz() - Size of a single item entry in this table level > + * @pts: The current level > + * > + * The size of the item specifies how much VA and OA a single item occupies. > + * > + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous > + * entries. > + */ > +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); > + > +/** > + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table > + * @pts: The current level > + * > + * Return the size of VA decoded by the entire table level. * Return: > + */ > +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) > +{ > + if (pts->range->top_level == pts->level) > + return pts->range->max_vasz_lg2; > + return min_t(unsigned int, pts->range->common->max_vasz_lg2, > + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); > +} > + > +/** > + * pt_table_pa() - Return the CPU physical address of the table entry > + * @pts: Entry to query > + * > + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same > + * value passed to pt_install_table(). > + */ > +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); > + > +/** > + * pt_table_ptr() - Return a CPU pointer for a table item > + * @pts: Entry to query > + * > + * Same as pt_table_pa() but returns a CPU pointer. > + */ > +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) > +{ > + return __va(pt_table_pa(pts)); > +} > + > +/** > + * pt_load_entry() - Read from the location pts points at into the pts > + * @pts: Table index to load > + * > + * Set the type of entry that was loaded. 
pts->entry and pts->table_lower > + * will be filled in with the entry's content. > + */ > +static inline void pt_load_entry(struct pt_state *pts) > +{ > + pts->type = pt_load_entry_raw(pts); > + if (pts->type == PT_ENTRY_TABLE) > + pts->table_lower = pt_table_ptr(pts); > +} > +#endif > diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h > new file mode 100644 > index 00000000000000..3673566708495d > --- /dev/null > +++ b/drivers/iommu/generic_pt/pt_defs.h > @@ -0,0 +1,323 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + * > + * This header is included before the format. It contains definitions > + * that are required to compile the format. The header order is: > + * pt_defs.h > + * fmt_XX.h > + * pt_common.h > + */ > +#ifndef __GENERIC_PT_DEFS_H > +#define __GENERIC_PT_DEFS_H > + > +#include <linux/generic_pt/common.h> > + > +#include <linux/types.h> > +#include <linux/atomic.h> > +#include <linux/bits.h> > +#include <linux/limits.h> > +#include <linux/bug.h> > +#include <linux/kconfig.h> > +#include "pt_log2.h" > + > +/* Header self-compile default defines */ > +#ifndef pt_write_attrs > +typedef u64 pt_vaddr_t; > +typedef u64 pt_oaddr_t; > +#endif > + > +struct pt_table_p; > + > +enum { > + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, > + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, > + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, > + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, > +}; Hm, duplicated enum entry values? Interesting. > + > +/* > + * The format instantiation can have features wired off or on to optimize the > + * code gen. Supported features are just a reflection of what the current set of > + * kernel users want to use. > + */ > +#ifndef PT_SUPPORTED_FEATURES > +#define PT_SUPPORTED_FEATURES 0 > +#endif > + > +/* > + * When in debug mode we compile all formats with all features. 
This allows the > + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or > + * FULL_VA. > + */ > +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) > +enum { > + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, > + PT_DEBUG_SUPPORTED_FEATURES = > + UINT_MAX & > + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? > + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : > + BIT(PT_FEAT_SIGN_EXTEND)), > +}; > +#undef PT_SUPPORTED_FEATURES > +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES > +#endif > + > +#ifndef PT_FORCE_ENABLED_FEATURES > +#define PT_FORCE_ENABLED_FEATURES 0 > +#endif > + > +/** > + * DOC: Generic Page Table Language > + * > + * Language used in Generic Page Table > + * va > + * The input address to the page table, often the virtual address. > + * oa > + * The output address from the page table, often the physical address. > + * leaf > + * An entry that results in an output address. Ie a physical memory addr I.e., > + * start/end > + * An open range, eg [0,0) refers to no VA. e.g., and is a half-open (or right-open) range or interval, not open. Open would be (0, 0). Closed would be [0, 0]. I used to think that was "clopen" but now I read that clopen refers to sets and not intervals. > + * start/last > + * An inclusive closed range, eg [0,0] refers to the VA 0 e.g., > + * common > + * The generic page table container struct pt_common > + * level > + * The number of table hops from the lowest leaf. Level 0 > + * is always a table of only leaves of the least significant VA bits. The > + * labels used by HW descriptions are never used. > + * top_level > + * The inclusive highest level of the table. A two level table two-level > + * has a top level of 1. > + * table > + * A linear array of entries representing the translation items for that > + * level. 
> + * index > + * The position in a table of an element: item = table[index] > + * item > + * A single position in a table > + * entry > + * A single logical element in a table. If contiguous pages are not > + * supported then item and entry are the same thing, otherwise entry refers > + * to the all the items that comprise a single contiguous translation. > + * item/entry_size > + * The number of bytes of VA the table translates for. > + * If the item is a table entry then the next table covers > + * this size. If the entry is an output address then the > + * full OA is: OA | (VA % entry_size) > + * contig_count > + * The number of consecutive items fused into a single entry. > + * item_size * contig_count is the size of that entry's translation. > + * lg2 > + * Indicates the value is encoded as log2, ie 1<<x is the actual value. i.e., > + * Normally the compiler is fine to optimize divide and mod with log2 values > + * automatically when inlining, however if the values are not constant > + * expressions it can't. So we do it by hand, we want to avoid 64 bit hand; 64-bit > + * divmod. > + */ > + > +/* Returned by pt_load_entry() and for_each_pt_level_entry() */ > +enum pt_entry_type { > + PT_ENTRY_EMPTY, > + PT_ENTRY_TABLE, > + /* Entry is valid and returns an output address */ > + PT_ENTRY_OA, > +}; > + > +struct pt_range { > + struct pt_common *common; > + struct pt_table_p *top_table; > + pt_vaddr_t va; > + pt_vaddr_t last_va; > + u8 top_level; > + u8 max_vasz_lg2; > +}; > + > +/* > + * Similar to xa_state, this records information about an in progress parse at a in-progress > + * single level. > + */ > +struct pt_state { > + struct pt_range *range; > + struct pt_table_p *table; > + struct pt_table_p *table_lower; > + u64 entry; > + enum pt_entry_type type; > + unsigned short index; > + unsigned short end_index; > + u8 level; > +}; > + > +#define pt_cur_table(pts, type) ((type *)((pts)->table)) > + > +/* > + * Try to install a new table pointer. 
The locking methodology requires this to > + * be atomic, multiple threads can race to install a pointer, the losing threads atomic; pointer; > + * will fail the atomic and return false. They should free any memory and > + * reparse the table level again. > + */ [snip] > +/* > + * The full va (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and > + * generate a useful defined result. The non fva versions will malfunction at non-fva > + * this extreme. > + */ > +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) > +{ > + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) > + return 0; > + return log2_div_t(pt_vaddr_t, a, b_lg2); > +} > + > +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) > +{ > + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) > + return a; > + return log2_mod_t(pt_vaddr_t, a, b_lg2); > +} > + > +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, > + unsigned int c_lg2) > +{ > + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) > + return true; > + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); > +} > + > +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, > + unsigned int b_lg2) > +{ > + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) > + return val; > + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); > +} > + > +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) > +{ > + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) > + return PT_VADDR_MAX; > + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); > +} > + [snip]
Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + * > + * Default definitions for formats that don't define these functions. > + */ > +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H > +#define __GENERIC_PT_PT_FMT_DEFAULTS_H > + > +#include "pt_defs.h" > +#include <linux/log2.h> > + > +/* Header self-compile default defines */ > +#ifndef pt_load_entry_raw > +#include "fmt/amdv1.h" > +#endif > + > +/* > + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and > + * PT_ITEM_WORD_SIZE. The must be the same at every level excluding the top. They > + */ [snip] > diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h > new file mode 100644 > index 00000000000000..abbd243f10d879 > --- /dev/null > +++ b/drivers/iommu/generic_pt/pt_iter.h > @@ -0,0 +1,636 @@ [snip] > +/* > + * Add index_count_lg2 number of entries to pts's VA and index. The va will be s/VA/va/ for consistency? since it ("va") is defined in Generic Page Table Language. > + * adjusted to the end of the contiguous block if it is currently in the middle. > + */ > +static inline void _pt_advance(struct pt_state *pts, > + unsigned int index_count_lg2) > +{ > + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, > + index_count_lg2); > +} > + > +/** > + * pt_item_fully_covered() - Check if the item or entry is entirely contained > + * within pts->range > + * @pts: Iteration State > + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or > + * pt_entry_oa_lg2sz() > + * > + * True if the item is fully enclosed by the pts->range. * Return: true if the item ... 
> + */ > +static inline bool pt_item_fully_covered(const struct pt_state *pts, > + unsigned int oasz_lg2) > +{ > + struct pt_range *range = pts->range; > + > + /* Range begins at the start of the entry */ > + if (log2_mod(pts->range->va, oasz_lg2)) > + return false; > + > + /* Range ends past the end of the entry */ > + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) > + return true; > + > + /* Range ends at the end of the entry */ > + return log2_mod_eq_max(range->last_va, oasz_lg2); > +} > + > +/** > + * pt_range_to_index() - Starting index for an iteration > + * @pts: Iteration State > + * > + * Return the starting index for the iteration in pts. * Return: > + */ > +static inline unsigned int pt_range_to_index(const struct pt_state *pts) > +{ > + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); > + > + PT_WARN_ON(pts->level > pts->range->top_level); > + if (pts->range->top_level == pts->level) > + return log2_div(fvalog2_mod(pts->range->va, > + pts->range->max_vasz_lg2), > + isz_lg2); > + return log2_mod(log2_div(pts->range->va, isz_lg2), > + pt_num_items_lg2(pts)); > +} > + > +/** > + * pt_range_to_end_index() - Ending index iteration > + * @pts: Iteration State > + * > + * Return the last index for the iteration in pts. 
* Return: > + */ > +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) > +{ > + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); > + struct pt_range *range = pts->range; > + unsigned int num_entries_lg2; > + > + if (range->va == range->last_va) > + return pts->index + 1; > + > + if (pts->range->top_level == pts->level) > + return log2_div(fvalog2_mod(pts->range->last_va, > + pts->range->max_vasz_lg2), > + isz_lg2) + > + 1; > + > + num_entries_lg2 = pt_num_items_lg2(pts); > + > + /* last_va falls within this table */ > + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) > + return log2_mod(log2_div(pts->range->last_va, isz_lg2), > + num_entries_lg2) + > + 1; > + > + return log2_to_int(num_entries_lg2); > +} > + > +static inline void _pt_iter_first(struct pt_state *pts) > +{ > + pts->index = pt_range_to_index(pts); > + pts->end_index = pt_range_to_end_index(pts); > + PT_WARN_ON(pts->index > pts->end_index); > +} > + > +static inline bool _pt_iter_load(struct pt_state *pts) > +{ > + if (pts->index >= pts->end_index) > + return false; > + pt_load_entry(pts); > + return true; > +} > + > +/** > + * pt_next_entry() - Advance pts to the next entry > + * @pts: Iteration State > + * > + * Update pts to go to the next index at this level. If pts is pointing at a > + * contiguous entry then the index may advance my more than one. by > + */ > +static inline void pt_next_entry(struct pt_state *pts) > +{ > + if (pts->type == PT_ENTRY_OA && > + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) > + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); > + else > + pts->index++; > + pt_index_to_va(pts); > +} > + > +/** > + * for_each_pt_level_entry() - For loop wrapper over entries in the range > + * @pts: Iteration State > + * > + * This is the basic iteration primitive, it iterates over all the entries in primitive. It > + * pts->range that fall within the pts's current table level. Each step does > + * pt_load_entry(pts). 
> + */ > +#define for_each_pt_level_entry(pts) \ > + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) > + [snip] > +/* > + * pt_walk_descend_all() - Recursively invoke the walker for a table item > + * @pts: Iteration State * @parent_pts: > + * @fn: Walker function to call > + * @arg: Value to pass to the function > + * > + * With pts pointing at a table item this will descend and over the entire lower > + * table. This creates a new walk and does not alter pts or pts->range. > + */ > +static __always_inline int > +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, > + void *arg) > +{ [snip] > +/** > + * PT_MAKE_LEVELS() - Build an unwound walker > + * @fn: Name of the walker function > + * @do_fn: Function to call at each level > + * > + * This builds a function call tree that can be fully inlined, inlined. > + * The caller must provide a function body in an __always_inline function:: > + * [snip] > diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h > new file mode 100644 > index 00000000000000..91869fad33fbdf > --- /dev/null > +++ b/include/linux/generic_pt/common.h > @@ -0,0 +1,134 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + */ > +#ifndef __GENERIC_PT_COMMON_H > +#define __GENERIC_PT_COMMON_H > + > +#include <linux/types.h> > +#include <linux/build_bug.h> > +#include <linux/bits.h> > + > +/** > + * DOC: Generic Radix Page Table > + * > + * Generic Radix Page Table is a set of functions and helpers to efficiently > + * parse radix style page tables typically seen in HW implementations. The > + * interface is built to deliver similar code generation as the mm's pte/pmd/etc > + * system by fully inlining the exact code required to handle each table level. 
> + * > + * Like the MM each format contributes its parsing implementation under common maybe MM system, > + * names and the common code implements the required algorithms. > + * > + * The system is divided into three logical levels: > + * > + * - The page table format and its manipulation functions > + * - Generic helpers to give a consistent API regardless of underlying format > + * - An algorithm implementation (eg IOMMU/DRM/KVM/MM) (e.g., > + * > + * Multiple implementations are supported, the intention is to have the generic supported. The > + * format code be re-usable for whatever specalized implementation is required. > + * The generic code is solely about the format of the radix tree, it does not tree; > + * include memory allocation or higher level decisions that are left for the > + * implementation. > + * > + * The generic framework supports a superset of functions across many HW > + * implementations: > + * > + * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes > + * - Multi-level tables, up to 6 levels. Runtime selected top level > + * - Runtime variable table level size (ARM's concatenated tables) > + * - Expandable top level allowing dynamic sizing of table levels > + * - Optional leaf entries at any level > + * - 32 bit/64 bit virtual and output addresses, using every address bit 32-bit/64-bit > + * - Dirty tracking > + */ > + > +/** > + * struct pt_common * struct pt_common - <some short struct description> > + */ > +struct pt_common { > + /** > + * @top_of_table: Encodes the table top pointer and the top level in a > + * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower > + * bits of the aligned table pointer are used for the level. > + */ > + uintptr_t top_of_table; > + /** > + * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits > + * must be zero. This may be less than what the page table format > + * supports, but must not be more. 
> + */ > + u8 max_oasz_lg2; > + /** > + * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits > + * are 0 or 1 depending on pt_full_va_prefix(). This may be less than > + * what the page table format supports, but must not be more. When > + * PT_FEAT_DYNAMIC_TOP this reflects the maximum VA capability. PT_FEAT_DYNAMIC_TOP is set, this reflects ... ? > + */ > + u8 max_vasz_lg2; > + /** > + * @features: Bitmap of `enum pt_features` > + */ > + unsigned int features; > +}; > + > +/* Encoding parameters for top_of_table */ > +enum { > + PT_TOP_LEVEL_BITS = 3, > + PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0), > +}; > + > +/** > + * enum pt_features - Features turned on in the table. Each symbol is a bit > + * position. > + */ > +enum pt_features { > + /** > + * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to > + * PT_VADDR_MAX. > + */ > + PT_FEAT_FULL_VA, > + /** > + * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased > + * dynamically during map. This requires HW support for atomically > + * setting both the table top pointer and the starting table level. > + */ > + PT_FEAT_DYNAMIC_TOP, > + /** > + * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign > + * extends up to the full pt_vaddr_t. This divides the page table into > + * three VA ranges:: > + * > + * 0 -> 2^N - 1 Lower > + * 2^N -> (MAX - 2^N - 1) Non-Canonical > + * MAX - 2^N -> MAX Upper > + * > + * In this mode pt_common::max_vasz_lg2 includes the sign bit and the > + * upper bits that don't fall within the translation are just validated. > + * > + * If not set there is no sign extension and valid VA goes from 0 to 2^N > + * - 1. > + */ > + PT_FEAT_SIGN_EXTEND, > + /** > + * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA > + * ranges which will clean out any walk cache or any IOPTE fully > + * contained by the range. 
The optimization objective is to minimize the > + * number of flushes even if ranges include IOVA gaps that do not need > + * to be flushed. > + */ > + PT_FEAT_FLUSH_RANGE, > + /** > + * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that > + * the optimization objective is to only flush IOVA that has been > + * changed. This mode is suitable for cases like hypervisor shadowing > + * where flushing unchanged ranges may cause the hypervisor to reparse > + * significant amount of page table. > + */ > + PT_FEAT_FLUSH_RANGE_NO_GAPS, > + /* private: */ > + PT_FEAT_FMT_START, > +}; > + > +#endif -- ~Randy