On Wed, Sep 03, 2025 at 02:46:28PM -0300, Jason Gunthorpe wrote: > +/** > + * pt_entry_oa_lg2sz() - Return the size of a OA entry an OA > + * @pts: Entry to query > + * > + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise > + * it returns the total VA/OA size of the entire contiguous entry. > + */ > +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) > +{ > + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); > +} ------ > + * level > + * The number of table hops from the lowest leaf. Level 0 > + * is always a table of only leaves of the least significant VA bits. The Hmm, I am a bit confused here. I thought "leaf" was meant to be a "leaf" table? But here "a table of only leaves" makes it sound like a "leaf" table entry? Also, isn't "the least significant VA bits" the page offset? > + * table > + * A linear array of entries representing the translation items for that > + * level. > + * index > + * The position in a table of an element: item = table[index] > + * item > + * A single position in a table > + * entry > + * A single logical element in a table. If contiguous pages are not > + * supported then item and entry are the same thing, otherwise entry refers > + * to the all the items that comprise a single contiguous translation. So, an "entry" is a group of "items" if contiguous pages (huge pages?) are supported. Then, the "entry" sounds like a physical (vs. "logical") table entry, i.e. what we usually call a PTE? > +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) > +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) > +{ > + u64 *entryp = pt_cur_table(pts, u64) + pts->index; > + u64 old_entry = pts->entry; > + bool ret; > + > + /* > + * Ensure the zero'd table content itself is visible before its PTE can > + * be. release is a NOP on !SMP, but the HW is still doing an acquire. > + */ > + if (!IS_ENABLED(CONFIG_SMP)) > + dma_wmb(); Mind elaborating why the SMP case doesn't need this explicit barrier? A comment explaining how the release semantics cover the DMA observer on SMP would help.
> +/* > + * PT_WARN_ON is used for invariants that the kunit should be checking can't > + * happen. > + */ > +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) > +#define PT_WARN_ON WARN_ON > +#else > +static inline bool PT_WARN_ON(bool condition) > +{ > + return false; Should this be "return condition"? Otherwise these validations would be compiled out whenever CONFIG_DEBUG_GENERIC_PT is disabled, and the error paths below could never trigger: drivers/iommu/generic_pt/pt_iter.h:388: if (PT_WARN_ON(!pts->table_lower)) drivers/iommu/generic_pt/pt_iter.h-389- return -EINVAL; -- drivers/iommu/generic_pt/pt_iter.h-429- drivers/iommu/generic_pt/pt_iter.h:430: if (PT_WARN_ON(!pt_can_have_table(pts)) || drivers/iommu/generic_pt/pt_iter.h:431: PT_WARN_ON(!pts->table_lower)) drivers/iommu/generic_pt/pt_iter.h-432- return -EINVAL; > +/** > + * pt_load_entry() - Read from the location pts points at into the pts > + * @pts: Table index to load > + * > + * Set the type of entry that was loaded. pts->entry and pts->table_lower > + * will be filled in with the entry's content. > + */ > +static inline void pt_load_entry(struct pt_state *pts) > +{ > + pts->type = pt_load_entry_raw(pts); > + if (pts->type == PT_ENTRY_TABLE) > + pts->table_lower = pt_table_ptr(pts); > +} I see a couple of callers checking pts->type right after calling this. Maybe it could return pts->type, matching pt_load_entry_raw()? > diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h > new file mode 100644 > index 00000000000000..19e8f820c1dccf > --- /dev/null > +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h > @@ -0,0 +1,193 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + * > + * Default definitions for formats that don't define these functions.
> + */ > +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H > +#define __GENERIC_PT_PT_FMT_DEFAULTS_H > + > +#include "pt_defs.h" > +#include <linux/log2.h> Nit: the <> system include should come before the "" local include. > +#ifndef pt_pgsz_lg2_to_level > +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, > + unsigned int pgsize_lg2) > +{ > + return (pgsize_lg2 - PT_GRANULE_LG2SZ) / > + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); > + return 0; The unreachable "return 0" should likely be dropped. > +/* > + * Format supplies either: > + * pt_entry_oa - OA is at the start of a contiguous entry > + * or > + * pt_item_oa - OA is correct for every item in a contiguous entry What does "correct" mean here? Perhaps clarify how pt_item_oa's per-item OA differs from pt_entry_oa's start-of-entry OA. > +/** > + * pt_range_to_end_index() - Ending index iteration > + * @pts: Iteration State > + * > + * Return: the last index for the iteration in pts. > + */ > +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) > +{ > + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); > + struct pt_range *range = pts->range; > + unsigned int num_entries_lg2; > + > + if (range->va == range->last_va) > + return pts->index + 1; > + > + if (pts->range->top_level == pts->level) > + return log2_div(fvalog2_mod(pts->range->last_va, > + pts->range->max_vasz_lg2), > + isz_lg2) + > + 1; How about writing this as: return 1 + log2_div(...); for readability? > +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, > + uintptr_t top_of_table) > +{ > + struct pt_range range = { > + .common = common, > + .top_table = > + (struct pt_table_p *)(top_of_table & > + ~(uintptr_t)PT_TOP_LEVEL_MASK), > + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), Since top_level is unsigned, would a bitwise op be simpler and match the .top_table line above: .top_level = top_of_table & PT_TOP_LEVEL_MASK, ?
> +/* /** > + * pt_walk_descend_all() - Recursively invoke the walker for a table item > + * @parent_pts: Iteration State > + * @fn: Walker function to call > + * @arg: Value to pass to the function > + * > + * With pts pointing at a table item this will descend and over the entire lower > + * table. This creates a new walk and does not alter pts or pts->range. > + */ > +static __always_inline int > +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, > + void *arg) ------- > +/** > + * pt_compute_best_pgsize() - Determine the best page size for leaf entries > + * @pgsz_bitmap: Permitted page sizes > + * @va: Starting virtual address for the leaf entry > + * @last_va: Last virtual address for the leaf entry, sets the max page size > + * @oa: Starting output address for the leaf entry > + * > + * Compute the largest page size for va, last_va, and oa together and return it > + * in lg2. The largest page size depends on the format's supported page sizes at > + * this level, and the relative alignment of the VA and OA addresses. 0 means > + * the OA cannot be stored with the provided pgsz_bitmap. > + */ > +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, > + pt_vaddr_t va, > + pt_vaddr_t last_va, > + pt_oaddr_t oa) > +{ > + unsigned int best_pgsz_lg2; > + unsigned int pgsz_lg2; > + pt_vaddr_t len = last_va - va + 1; > + pt_vaddr_t mask; > + > + if (PT_WARN_ON(va >= last_va)) > + return 0; > + > + /* > + * Given a VA/OA pair the best page size is the largest page side > + * where: > + * > + * 1) VA and OA start at the page. Bitwise this is the count of least > + * significant 0 bits. > + * This also implies that last_va/oa has the same prefix as va/oa. > + */ > + mask = va | oa; > + > + /* > + * 2) The page size is not larger than the last_va (length). Since page > + * sizes are always power of two this can't be larger than the > + * largest power of two factor of the length. 
> + */ > + mask |= log2_to_int(log2_fls(len) - 1); > + > + best_pgsz_lg2 = log2_ffs(mask); > + > + /* Choose the higest bit <= best_pgsz_lg2 */ Typo: "higest" -> "highest". > +/* Compute a */ > +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) > +static_assert(log2_to_int_t(unsigned int, 0) == 1); > + > +/* Compute a - 1 (aka all low bits set) */ > +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) > + > +/* Compute a / b */ > +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) > +static_assert(log2_div_t(unsigned int, 4, 2) == 1); I skimmed through these macros and their callers. They are mostly dealing with VA, OA, and mask, which feels like straightforward bit-masking/shifting operations, e.g.: log2_to_int_t = 64bit ? BIT_ULL(lg2) : BIT(lg2); log2_to_max_int_t = 64bit ? GENMASK_ULL(lg2 - 1, 0) : GENMASK(lg2 - 1, 0); What's the benefit of phrasing them as arithmetic operations instead of using the existing BIT/GENMASK helpers? > diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h > new file mode 100644 > index 00000000000000..a29bdd7b244de6 > --- /dev/null > +++ b/include/linux/generic_pt/common.h > @@ -0,0 +1,134 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + */ > +#ifndef __GENERIC_PT_COMMON_H > +#define __GENERIC_PT_COMMON_H > + > +#include <linux/types.h> > +#include <linux/build_bug.h> > +#include <linux/bits.h> Nit: these includes could be sorted in alphabetical order. Thanks Nicolin