Re: [PATCH v5 01/15] genpt: Generic Page Table base API

On Wed, Sep 03, 2025 at 02:46:28PM -0300, Jason Gunthorpe wrote:
> +/**
> + * pt_entry_oa_lg2sz() - Return the size of a OA entry

an OA

> + * @pts: Entry to query
> + *
> + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
> + * it returns the total VA/OA size of the entire contiguous entry.
> + */
> +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
> +{
> +	return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
> +}
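
(Just to confirm the math: a 16-way contiguous set of 4KiB items
would return 4 + 12 = 16, i.e. one 64KiB entry, which makes sense.)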
------

> + *  level
> + *     The number of table hops from the lowest leaf. Level 0
> + *     is always a table of only leaves of the least significant VA bits. The

Hmm, I am a bit confused here. I thought a "leaf" referred to a
"leaf" table? But "a table of only leaves" here reads as if a "leaf"
is a table entry?

Also, aren't "the least significant VA bits" the page offset?
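
For a 4KiB-granule 3-level format (numbers purely for illustration),
I would read the definition as:

	level 2: items cover VA bits [38:30]
	level 1: items cover VA bits [29:21]
	level 0: leaf items cover VA bits [20:12], with bits [11:0]
	         being the in-page offset

i.e. level 0 translates the least significant VA bits *above* the
page offset. Is that the intent?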

> + *  table
> + *     A linear array of entries representing the translation items for that
> + *     level.
> + *  index
> + *     The position in a table of an element: item = table[index]
> + *  item
> + *     A single position in a table
> + *  entry
> + *     A single logical element in a table. If contiguous pages are not
> + *     supported then item and entry are the same thing, otherwise entry refers
> + *     to the all the items that comprise a single contiguous translation.

So, an "entry" is a group of "items" if contiguous pages (huge
page?) are supported. Then, the "entry" sounds like a physical
(v.s. "logical") table entry, e.g. a PTE that we usually say?

> +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
> +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
> +{
> +	u64 *entryp = pt_cur_table(pts, u64) + pts->index;
> +	u64 old_entry = pts->entry;
> +	bool ret;
> +
> +	/*
> +	 * Ensure the zero'd table content itself is visible before its PTE can
> +	 * be. release is a NOP on !SMP, but the HW is still doing an acquire.
> +	 */
> +	if (!IS_ENABLED(CONFIG_SMP))
> +		dma_wmb();

Mind elaborating why SMP doesn't need this? My guess is that on SMP
the release semantics of the installing cmpxchg below already order
the zeroed table contents before the PTE store, while on !SMP that
release degrades to a plain store yet the HW walker still reads
concurrently. If so, maybe fold that into the comment?

> +/*
> + * PT_WARN_ON is used for invariants that the kunit should be checking can't
> + * happen.
> + */
> +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
> +#define PT_WARN_ON WARN_ON
> +#else
> +static inline bool PT_WARN_ON(bool condition)
> +{
> +	return false;

Should it "return condition"?

Otherwise, these validations wouldn't be effective?

drivers/iommu/generic_pt/pt_iter.h:388: if (PT_WARN_ON(!pts->table_lower))
drivers/iommu/generic_pt/pt_iter.h-389-         return -EINVAL;
--
drivers/iommu/generic_pt/pt_iter.h-429-
drivers/iommu/generic_pt/pt_iter.h:430: if (PT_WARN_ON(!pt_can_have_table(pts)) ||
drivers/iommu/generic_pt/pt_iter.h:431:     PT_WARN_ON(!pts->table_lower))
drivers/iommu/generic_pt/pt_iter.h-432-         return -EINVAL;
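
i.e. something like:

	static inline bool PT_WARN_ON(bool condition)
	{
		return condition;
	}

so that the !CONFIG_DEBUG_GENERIC_PT build still takes the error
paths (just without the splat).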

> +/**
> + * pt_load_entry() - Read from the location pts points at into the pts
> + * @pts: Table index to load
> + *
> + * Set the type of entry that was loaded. pts->entry and pts->table_lower
> + * will be filled in with the entry's content.
> + */
> +static inline void pt_load_entry(struct pt_state *pts)
> +{
> +	pts->type = pt_load_entry_raw(pts);
> +	if (pts->type == PT_ENTRY_TABLE)
> +		pts->table_lower = pt_table_ptr(pts);
> +}

I see a couple of callers check pts->type. Maybe it could return
pts->type, to match pt_load_entry_raw()?
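
E.g. (assuming the enum is named pt_entry_type, going by
PT_ENTRY_TABLE):

	static inline enum pt_entry_type pt_load_entry(struct pt_state *pts)
	{
		pts->type = pt_load_entry_raw(pts);
		if (pts->type == PT_ENTRY_TABLE)
			pts->table_lower = pt_table_ptr(pts);
		return pts->type;
	}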

> diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> new file mode 100644
> index 00000000000000..19e8f820c1dccf
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> @@ -0,0 +1,193 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * Default definitions for formats that don't define these functions.
> + */
> +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
> +#define __GENERIC_PT_PT_FMT_DEFAULTS_H
> +
> +#include "pt_defs.h"
> +#include <linux/log2.h>

<> precedes ""

> +#ifndef pt_pgsz_lg2_to_level
> +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
> +						unsigned int pgsize_lg2)
> +{
> +	return (pgsize_lg2 - PT_GRANULE_LG2SZ) /
> +	       (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
> +	return 0;

"return 0" should likely be dropped.

> +/*
> + * Format supplies either:
> + *   pt_entry_oa - OA is at the start of a contiguous entry
> + * or
> + *   pt_item_oa  - OA is correct for every item in a contiguous entry

What does "correct" mean here? My reading is that with pt_item_oa
each item in a contiguous entry carries its own (already advanced)
OA, whereas with pt_entry_oa only the first item's OA is meaningful.
If so, spelling that out would help.

> +/**
> + * pt_range_to_end_index() - Ending index iteration
> + * @pts: Iteration State
> + *
> + * Return: the last index for the iteration in pts.
> + */
> +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
> +{
> +	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
> +	struct pt_range *range = pts->range;
> +	unsigned int num_entries_lg2;
> +
> +	if (range->va == range->last_va)
> +		return pts->index + 1;
> +
> +	if (pts->range->top_level == pts->level)
> +		return log2_div(fvalog2_mod(pts->range->last_va,
> +					    pts->range->max_vasz_lg2),
> +				isz_lg2) +
> +		       1;

How about:
		return 1 + log2_div(...);
?

> +static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
> +						     uintptr_t top_of_table)
> +{
> +	struct pt_range range = {
> +		.common = common,
> +		.top_table =
> +			(struct pt_table_p *)(top_of_table &
> +					      ~(uintptr_t)PT_TOP_LEVEL_MASK),
> +		.top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),

Assuming PT_TOP_LEVEL_MASK == (1 << PT_TOP_LEVEL_BITS) - 1 (it is
used as the inverse mask for top_table just above), would the bitwise
form be simpler and consistent with that line:
		.top_level = top_of_table & PT_TOP_LEVEL_MASK,
?

> +/*

/**

> + * pt_walk_descend_all() - Recursively invoke the walker for a table item
> + * @parent_pts: Iteration State
> + * @fn: Walker function to call
> + * @arg: Value to pass to the function
> + *
> + * With pts pointing at a table item this will descend and over the entire lower
> + * table. This creates a new walk and does not alter pts or pts->range.
> + */
> +static __always_inline int
> +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
> +		    void *arg)
-------

> +/**
> + * pt_compute_best_pgsize() - Determine the best page size for leaf entries
> + * @pgsz_bitmap: Permitted page sizes
> + * @va: Starting virtual address for the leaf entry
> + * @last_va: Last virtual address for the leaf entry, sets the max page size
> + * @oa: Starting output address for the leaf entry
> + *
> + * Compute the largest page size for va, last_va, and oa together and return it
> + * in lg2. The largest page size depends on the format's supported page sizes at
> + * this level, and the relative alignment of the VA and OA addresses. 0 means
> + * the OA cannot be stored with the provided pgsz_bitmap.
> + */
> +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
> +						  pt_vaddr_t va,
> +						  pt_vaddr_t last_va,
> +						  pt_oaddr_t oa)
> +{
> +	unsigned int best_pgsz_lg2;
> +	unsigned int pgsz_lg2;
> +	pt_vaddr_t len = last_va - va + 1;
> +	pt_vaddr_t mask;
> +
> +	if (PT_WARN_ON(va >= last_va))
> +		return 0;
> +
> +	/*
> +	 * Given a VA/OA pair the best page size is the largest page side
> +	 * where:
> +	 *
> +	 * 1) VA and OA start at the page. Bitwise this is the count of least
> +	 *    significant 0 bits.
> +	 *    This also implies that last_va/oa has the same prefix as va/oa.
> +	 */
> +	mask = va | oa;
> +
> +	/*
> +	 * 2) The page size is not larger than the last_va (length). Since page
> +	 *    sizes are always power of two this can't be larger than the
> +	 *    largest power of two factor of the length.
> +	 */
> +	mask |= log2_to_int(log2_fls(len) - 1);
> +
> +	best_pgsz_lg2 = log2_ffs(mask);
> +
> +	/* Choose the higest bit <= best_pgsz_lg2 */

highest
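
FWIW I sanity-checked the trailing-zero math with a quick userspace
hack (same math, not the real helpers):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long va = 0x40200000ULL;  /* 2MiB aligned */
		unsigned long long oa = 0x80200000ULL;  /* 2MiB aligned */
		unsigned long long len = 0x200000ULL;   /* 2MiB long */
		unsigned long long mask = va | oa;

		/* 2) largest power-of-two factor of the length */
		mask |= 1ULL << (63 - __builtin_clzll(len));

		/* 1) count of least significant zero bits */
		printf("%d\n", __builtin_ctzll(mask)); /* 21 -> 2MiB */
		return 0;
	}

which prints 21 (2MiB) for a 2MiB-aligned, 2MiB-long mapping, as
expected.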

> +/* Compute a */
> +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
> +static_assert(log2_to_int_t(unsigned int, 0) == 1);
> +
> +/* Compute a - 1 (aka all low bits set) */
> +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
> +
> +/* Compute a / b */
> +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
> +static_assert(log2_div_t(unsigned int, 4, 2) == 1);

I skimmed through these macros and their callers. They mostly deal
with VA, OA, and masks, which all feel like straightforward
bit-masking/shifting operations.

E.g.
log2_to_int_t = 64bit ? BIT_ULL(lg2) : BIT(lg2);
log2_to_max_int_t = 64bit ? GENMASK_ULL(lg2 - 1, 0) : GENMASK(lg2 - 1, 0);

What's the benefit of expressing them as arithmetic ones?
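
E.g. (untested):

	#define log2_to_int_t(type, a_lg2) \
		((type)(sizeof(type) <= sizeof(u32) ? BIT(a_lg2) : BIT_ULL(a_lg2)))

Though I suppose one thing the arithmetic form gets for free is that
log2_to_max_int_t(type, 0) == 0 just works, while
GENMASK((a_lg2) - 1, 0) would be invalid for a_lg2 == 0.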

> diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
> new file mode 100644
> index 00000000000000..a29bdd7b244de6
> --- /dev/null
> +++ b/include/linux/generic_pt/common.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + */
> +#ifndef __GENERIC_PT_COMMON_H
> +#define __GENERIC_PT_COMMON_H
> +
> +#include <linux/types.h>
> +#include <linux/build_bug.h>
> +#include <linux/bits.h>

In alphabetical order.

Thanks
Nicolin



