From: Mykyta Yatsenko <yatsenko@xxxxxxxx>
Add the necessary plumbing in the verifier, syscall and map code to
support the new bpf_task_work_schedule kfuncs and the bpf_task_work
kernel structure. The approach is similar to how bpf_wq and bpf_timer
are already handled.
The verifier changes validate calls to the bpf_task_work_schedule
kfuncs to make sure they are safe and the expected invariants hold.
The BTF changes are required to detect the bpf_task_work structure
inside a map value and record its offset, which the next patch uses to
calculate the key and value addresses.
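
For illustration only, a map value embedding the new struct could look
like this on the BPF side (hypothetical names; btf_parse_fields()
records the offset of the embedded field as task_work_off):

  /* BPF program side (hypothetical example); relies on vmlinux.h and
   * bpf/bpf_helpers.h from libbpf.
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  struct elem {
          int counter;
          struct bpf_task_work tw;  /* detected by btf_parse_fields() */
  };

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 128);
          __type(key, int);
          __type(value, struct elem);
  } tw_map SEC(".maps");

  char LICENSE[] SEC("license") = "GPL";
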
The arraymap and hashtab changes handle freeing of bpf_task_work: they
run the code needed to deinitialize it, for example cancelling the
task_work callback if possible.
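
A minimal sketch of what that freeing hook could look like, mirroring
the existing bpf_obj_free_timer()/bpf_obj_free_workqueue() pattern (the
actual cancel-and-free logic is only introduced in the next patch):

  /* Sketch only: follows the bpf_obj_free_workqueue() pattern and
   * defers the real cancel-and-free work to the next patch.
   */
  void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
  {
          if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
                  return;
          bpf_task_work_cancel_and_free(obj + rec->task_work_off);
  }
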
The actual use of bpf_task_work and the proper implementation of the
kfuncs are introduced in the next patch; here the kfuncs are added as
no-op stubs.
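
As a rough illustration of the intended BPF-side usage (hypothetical
program, building on the tw_map/elem sketch above; the kfunc
declaration, callback arguments and attach point are assumptions, since
the real semantics land in the next patch):

  int bpf_task_work_schedule_signal(struct task_struct *task,
                                    struct bpf_task_work *tw, void *map,
                                    void (*callback)(void *map, void *key,
                                                     void *value),
                                    void *aux) __ksym;

  static void tw_callback(void *map, void *key, void *value)
  {
          /* runs later in the context of the target task */
  }

  SEC("tp_btf/sched_switch")  /* arbitrary example attach point */
  int schedule_tw(void *ctx)
  {
          struct task_struct *task = bpf_get_current_task_btf();
          int key = 0;
          struct elem *e = bpf_map_lookup_elem(&tw_map, &key);

          if (!e)
                  return 0;
          /* Schedule tw_callback to run in the context of @task */
          bpf_task_work_schedule_signal(task, &e->tw, &tw_map,
                                        tw_callback, NULL);
          return 0;
  }
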
Signed-off-by: Mykyta Yatsenko <yatsenko@xxxxxxxx>
---
include/linux/bpf.h | 11 +++
include/uapi/linux/bpf.h | 4 +
kernel/bpf/arraymap.c | 8 +-
kernel/bpf/btf.c | 15 ++++
kernel/bpf/hashtab.c | 22 ++++--
kernel/bpf/helpers.c | 45 +++++++++++
kernel/bpf/syscall.c | 23 +++++-
kernel/bpf/verifier.c | 131 ++++++++++++++++++++++++++++++++-
tools/include/uapi/linux/bpf.h | 4 +
9 files changed, 247 insertions(+), 16 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9cd2164ed23..cb83ba0eaed5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -206,6 +206,7 @@ enum btf_field_type {
BPF_WORKQUEUE = (1 << 10),
BPF_UPTR = (1 << 11),
BPF_RES_SPIN_LOCK = (1 << 12),
+ BPF_TASK_WORK = (1 << 13),
};
typedef void (*btf_dtor_kfunc_t)(void *);
@@ -245,6 +246,7 @@ struct btf_record {
int timer_off;
int wq_off;
int refcount_off;
+ int task_work_off;
struct btf_field fields[];
};
@@ -340,6 +342,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
return "bpf_rb_node";
case BPF_REFCOUNT:
return "bpf_refcount";
+ case BPF_TASK_WORK:
+ return "bpf_task_work";
default:
WARN_ON_ONCE(1);
return "unknown";
@@ -378,6 +382,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
return sizeof(struct bpf_rb_node);
case BPF_REFCOUNT:
return sizeof(struct bpf_refcount);
+ case BPF_TASK_WORK:
+ return sizeof(struct bpf_task_work);
default:
WARN_ON_ONCE(1);
return 0;
@@ -410,6 +416,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
return __alignof__(struct bpf_rb_node);
case BPF_REFCOUNT:
return __alignof__(struct bpf_refcount);
+ case BPF_TASK_WORK:
+ return __alignof__(struct bpf_task_work);
default:
WARN_ON_ONCE(1);
return 0;
@@ -441,6 +449,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
+ case BPF_TASK_WORK:
break;
default:
WARN_ON_ONCE(1);
@@ -577,6 +586,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
bool lock_src);
void bpf_timer_cancel_and_free(void *timer);
void bpf_wq_cancel_and_free(void *timer);
+void bpf_task_work_cancel_and_free(void *timer);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
@@ -2391,6 +2401,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 233de8677382..e444d9f67829 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7418,6 +7418,10 @@ struct bpf_timer {
__u64 __opaque[2];
} __attribute__((aligned(8)));
+struct bpf_task_work {
+ __u64 __opaque[16];
+} __attribute__((aligned(8)));
+
struct bpf_wq {
__u64 __opaque[2];
} __attribute__((aligned(8)));
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d080916faf9..4130d8e76dff 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -431,7 +431,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers_wq(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -439,12 +439,14 @@ static void array_map_free_timers_wq(struct bpf_map *map)
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
- if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
for (i = 0; i < array->map.max_entries; i++) {
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
}
}
}
@@ -783,7 +785,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers_wq,
+ .map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0aff814cb53a..c66f9c6dfc48 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3527,6 +3527,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
goto end;
}
}
+ if (field_mask & BPF_TASK_WORK) {
+ if (!strcmp(name, "bpf_task_work")) {
+ if (*seen_mask & BPF_TASK_WORK)
+ return -E2BIG;
+ *seen_mask |= BPF_TASK_WORK;
+ type = BPF_TASK_WORK;
+ goto end;
+ }
+ }
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@@ -3693,6 +3702,7 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
+ case BPF_TASK_WORK:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
info_cnt ? &info[0] : &tmp);
if (ret < 0)
@@ -3985,6 +3995,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
+ rec->task_work_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
if (info_arr[i].off + field_type_size > value_size) {
@@ -4050,6 +4061,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
case BPF_LIST_NODE:
case BPF_RB_NODE:
break;
+ case BPF_TASK_WORK:
+ WARN_ON_ONCE(rec->task_work_off >= 0);
+ rec->task_work_off = rec->fields[i].offset;
+ break;
default:
ret = -EFAULT;
goto end;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 71f9931ac64c..207ad4823b5b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,7 +215,7 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
@@ -233,6 +233,9 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -1490,7 +1493,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
-static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
int i;
@@ -1508,22 +1511,25 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(l, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(htab->map.record,
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers_and_wq(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer and workqueue on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
if (!htab_is_prealloc(htab))
- htab_free_malloced_timers_and_wq(htab);
+ htab_free_malloced_internal_structs(htab);
else
- htab_free_prealloced_timers_and_wq(htab);
+ htab_free_prealloced_internal_structs(htab);
}
}
@@ -2255,7 +2261,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2282,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..322ffcaedc38 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3703,8 +3703,53 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
}
+typedef void (*bpf_task_work_callback_t)(struct bpf_map *, void *, void *);
+
+/**
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to the bpf_task_work struct, used internally by the kernel for bookkeeping
+ * @map__map: bpf_map that contains the bpf_task_work in one of its values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task,
+ struct bpf_task_work *tw,
+ struct bpf_map *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return 0;
+}
+
+/**
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode,
+ * or TWA_NMI_CURRENT when scheduling for the current task from NMI context
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to the bpf_task_work struct, used internally by the kernel for bookkeeping
+ * @map__map: bpf_map that contains the bpf_task_work in one of its values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task,
+ struct bpf_task_work *tw,
+ struct bpf_map *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return 0;
+}