On Tue, May 20, 2025 at 3:58 PM Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote: > > On Thu, May 15, 2025 at 2:16 PM Amery Hung <ameryhung@xxxxxxxxx> wrote: > > > > Task local data defines an abstract storage type for storing task- > > specific data (TLD). This patch provides user space and bpf > > implementation as header-only libraries for accessing task local data. > > > > Task local data is a bpf task local storage map with two UPTRs: > > 1) u_tld_metadata, shared by all tasks of the same process, consists of > > the total count of TLDs and an array of metadata of TLDs. A metadata of > > a TLD comprises the size and the name. The name is used to identify a > > specific TLD in bpf 2) data is memory for storing TLDs specific to the > > task. > > > > The following are the basic task local data API: > > > > User space BPF > > Create key tld_create_key() - > > Fetch key - tld_fetch_key() > > Get data tld_get_data() tld_get_data() > > > > A TLD is first created by the user space with tld_create_key(). First, > > it goes through the metadata array to check if the TLD can be added. > > The total TLD size needs to fit into a page (limited by UPTR), and no > > two TLDs can have the same name. It also calculates the offset, the next > > available space in u_tld_data, by summing sizes of TLDs. If the TLD > > can be added, it increases the count using cmpxchg as there may be other > > concurrent tld_create_key(). After a successful cmpxchg, the last > > metadata slot now belongs to the calling thread and will be updated. > > tld_create_key() returns the offset encapsulated as a opaque object key > > to prevent user misuse. > > > > Then user space can pass the key to tld_get_data() to get a pointer > > to the TLD. The pointer will remain valid for the lifetime of the > > thread. > > > > BPF programs also locate TLDs with the keys. This is done by calling > > tld_fetch_key() with the name of the TLD. Similar to tld_create_key(), > > it scans through metadata array, compare the name of TLDs and compute > > the offset. Once found, the offset is also returned as a key, which > > can be passed to the bpf version of tld_get_data() to retrieve a > > pointer to the TLD. > > > > User space task local data library uses a light way approach to ensure > > thread safety (i.e., atomic operation + compiler and memory barriers). > > While a metadata is being updated, other threads may also try to read it. > > To prevent them from seeing incomplete data, metadata::size is used to > > signal the completion of the update, where 0 means the update is still > > ongoing. Threads will wait until seeing a non-zero size to read a > > metadata. Acquire/release order is required for metadata::size to > > prevent hardware reordering. For example, moving store to metadata::name > > after store to metadata::size or moving load from metadata::name before > > load from metadata::size. > > > > Signed-off-by: Amery Hung <ameryhung@xxxxxxxxx> > > --- > > .../bpf/prog_tests/task_local_data.h | 263 ++++++++++++++++++ > > .../selftests/bpf/progs/task_local_data.bpf.h | 220 +++++++++++++++ > > 2 files changed, 483 insertions(+) > > create mode 100644 tools/testing/selftests/bpf/prog_tests/task_local_data.h > > create mode 100644 tools/testing/selftests/bpf/progs/task_local_data.bpf.h > > > > diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h > > new file mode 100644 > > index 000000000000..ec43ea59267c > > --- /dev/null > > +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h > > @@ -0,0 +1,263 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +#ifndef __TASK_LOCAL_DATA_H > > +#define __TASK_LOCAL_DATA_H > > + > > +#include <fcntl.h> > > +#include <errno.h> > > +#include <sched.h> > > +#include <stddef.h> > > +#include <stdlib.h> > > +#include <string.h> > > +#include <unistd.h> > > +#include <sys/syscall.h> > > +#include <sys/types.h> > > + > > +#include <bpf/bpf.h> > > + > > +#ifndef PIDFD_THREAD > > +#define PIDFD_THREAD O_EXCL > > +#endif > > + > > +#define PAGE_SIZE 4096 > > + > > +#ifndef __round_mask > > +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) > > +#endif > > +#ifndef round_up > > +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) > > +#endif > > + > > +#ifndef READ_ONCE > > +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) > > +#endif > > + > > +#ifndef WRITE_ONCE > > +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) > > +#endif > > this is a lot of pollution of user applications with generic names... > consider TLD_ prefixing all of them? > I will make the name more specific by adding TLD_ prefix. > > + > > +#define TLD_DATA_SIZE PAGE_SIZE > > +#define TLD_DATA_CNT 63 > > +#define TLD_NAME_LEN 62 > > + > > +typedef struct { > > + __s16 off; > > +} tld_key_t; > > + > > +struct tld_metadata { > > + char name[TLD_NAME_LEN]; > > + __u16 size; > > +}; > > + > > +struct u_tld_metadata { > > + __u8 cnt; > > + __u8 padding[63]; > > + struct tld_metadata metadata[TLD_DATA_CNT]; > > +}; > > + > > +struct u_tld_data { > > + char data[TLD_DATA_SIZE]; > > +}; > > + > > +struct tld_map_value { > > + struct u_tld_data *data; > > + struct u_tld_metadata *metadata; > > +}; > > + > > +struct u_tld_metadata *tld_metadata_p __attribute__((weak)); > > +__thread struct u_tld_data *tld_data_p __attribute__((weak)); > > + > > +static int __tld_init_metadata(int map_fd) > > +{ > > + struct u_tld_metadata *new_metadata; > > + struct tld_map_value map_val; > > + int task_fd = 0, err; > > + > > [...] > > > + > > + map_val.data = new_data; > > + map_val.metadata = READ_ONCE(tld_metadata_p); > > + > > + err = bpf_map_update_elem(map_fd, &task_fd, &map_val, 0); > > + if (err) { > > + free(new_data); > > + goto out; > > + } > > + > > + tld_data_p = new_data; > > +out: > > + if (task_fd > 0) > > task_fd can be zero, so >= 0 and init to -1; same in init_metadata Yeah. I will fix this in the next respin. > > > + close(task_fd); > > + return err; > > +} > > + > > +/** > > + * tld_create_key() - Create a key associated with a TLD. > > + * > > + * @map_fd: A file descriptor of the underlying task local storage map, > > + * tld_data_map > > + * @name: The name the TLD will be associated with > > + * @size: Size of the TLD > > + * > > + * Returns an opaque object key. Use tld_key_is_err() or tld_key_err_or_zero() to > > + * check if the key creation succeed. Pass to tld_get_data() to get a pointer to > > typo: succeeded Will fix the typo. Thanks > > > + * the TLD. bpf programs can also fetch the same key by name. > > + */ > > +__attribute__((unused)) > > +static tld_key_t tld_create_key(int map_fd, const char *name, size_t size) > > +{ > > + int err, i, cnt, sz, off = 0; > > + > > + if (!READ_ONCE(tld_metadata_p)) { > > + err = __tld_init_metadata(map_fd); > > + if (err) > > + return (tld_key_t) {.off = err}; > > + } > > + > > + if (!tld_data_p) { > > + err = __tld_init_data(map_fd); > > + if (err) > > + return (tld_key_t) {.off = err}; > > + } > > + > > + size = round_up(size, 8); > > I'd record actual requested size, but internally round up to 8 where > necessary (see below) > > > + > > + for (i = 0; i < TLD_DATA_CNT; i++) { > > +retry: > > + cnt = __atomic_load_n(&tld_metadata_p->cnt, __ATOMIC_RELAXED); > > + if (i < cnt) { > > + /* > > + * Pending tld_create_key() uses size to signal if the metadata has > > + * been fully updated. > > + */ > > + while (!(sz = __atomic_load_n(&tld_metadata_p->metadata[i].size, > > + __ATOMIC_ACQUIRE))) > > + sched_yield(); > > + > > + if (!strncmp(tld_metadata_p->metadata[i].name, name, TLD_NAME_LEN)) > > + return (tld_key_t) {.off = -EEXIST}; > > do you check mismatched size for the same key? if not, should it be checked? > > but if name and size matches, shouldn't this be a success instead of > -EEXIST error?... > I think users should only call tld_create_key() once for a TLD. Returning an -EEXIST gives us a way to detect conflict. If users knows that they will be calling tld_create_key() for a TLD in multiple places, they could still do it by treating -EEXIST as success. Therefore, no checking mismatching size here. > > > + > > + off += sz; > > you should probably specify alignment guarantees explicitly and round > that up somewhere here, so that if you allocate bool and then u64, u64 > is properly 8 byte aligned and internally you know that the size was 1 > and 8? With BPF ringbuf we guarantee 8 byte alignment, and so far it > worked out great, so I'd just document 8 and go with that. > Thanks for the suggestion. I will document the alignment guarantee. > > + continue; > > + } > > + > > + if (off + size > TLD_DATA_SIZE) > > + return (tld_key_t) {.off = -E2BIG}; > > + > > + /* > > + * Only one tld_create_key() can increase the current cnt by one and > > + * takes the latest available slot. Other threads will check again if a new > > + * TLD can still be added, and then compete for the new slot after the > > + * succeeding thread update the size. > > + */ > > + if (!__atomic_compare_exchange_n(&tld_metadata_p->cnt, &cnt, cnt + 1, true, > > + __ATOMIC_RELAXED, __ATOMIC_RELAXED)) > > + goto retry; > > + > > + strncpy(tld_metadata_p->metadata[i].name, name, TLD_NAME_LEN); > > + __atomic_store_n(&tld_metadata_p->metadata[i].size, size, __ATOMIC_RELEASE); > > + return (tld_key_t) {.off = off}; > > + } > > + > > + return (tld_key_t) {.off = -ENOSPC}; > > [...] > > > + * USAGE > > + * > > + * Similar to user space, bpf programs locate a TLD using the same key. > > + * tld_fetch_key() allows bpf programs to retrieve a key created in the user > > + * space by name, which is specified in the second argument of tld_create_key(). > > + * tld_fetch_key() additionally will cache the key in a task local storage map, > > + * tld_key_map, to avoid performing costly string comparisons every time when > > + * accessing a TLD. This requires the developer to define the map value type of > > + * tld_key_map, struct tld_keys. It only needs to contain keys used by bpf > > + * programs in the compilation unit. > > + * > > + * struct tld_keys { > > + * tld_key_t prio; > > + * tld_key_t in_cs; > > + * }; > > + * > > + * Then, for every new task, a bpf program will fetch and cache keys once and > > + * for all. This should be done ideally in a non-critical path (e.g., in > > + * sched_ext_ops::init_task). > > + * > > + * struct tld_object tld_obj; > > + * > > + * err = tld_object_init(task, &tld); > > tld_obj? Will fix the typo > > > + * if (err) > > + * return 0; > > + * > > + * tld_fetch_key(&tld_obj, "priority", prio); > > + * tld_fetch_key(&tld_obj, "in_critical_section", in_cs); > > + * > > [...]