To support the epoll family of syscalls, RPAL adds RPAL-service handling to
the existing epoll code so that user mode can drive RPAL services through the
same interfaces. When a receiver thread calls epoll_wait(), it can set
RPAL_EP_POLL_MAGIC to tell the kernel to take the RPAL-specific path. The
kernel then sets the receiver's state to RPAL_RECEIVER_STATE_READY and
transitions it to RPAL_RECEIVER_STATE_WAIT once the receiver is actually
removed from the runqueue, at which point a sender may perform RPAL calls on
the receiver thread.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
 arch/x86/rpal/core.c |   4 +
 fs/eventpoll.c       | 200 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |  21 +++++
 kernel/sched/core.c  |  17 ++++
 4 files changed, 242 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
 #include <linux/rpal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/pkeys.h>
+#include <linux/file.h>
 #include <asm/fsgsbase.h>
 
 #include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)
 
 	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
 		rcc = current->rpal_rd->rcc;
+		regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
 		atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
 	} else {
 		tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 	struct task_struct *prev = current;
 
 	if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+		rpal_resume_ep(next);
 		current->rpal_sd->receiver = next;
 		rpal_lock_cpu(current);
 		rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 		 */
 		rebuild_sender_stack(current->rpal_sd, regs);
 		rpal_schedule(next);
+		fdput(next->rpal_rd->f);
 	} else {
 		update_dst_stack(next, regs);
 		/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <linux/capability.h>
+#include <linux/rpal.h>
 #include <net/busy_poll.h>
 
 /*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 }
 
+#ifdef CONFIG_RPAL
+
+void rpal_resume_ep(struct task_struct *tsk)
+{
+	struct rpal_receiver_data *rrd = tsk->rpal_rd;
+	struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+	if (rcc->timeout > 0) {
+		hrtimer_cancel(&rrd->ep_sleeper.timer);
+		destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+	}
+	if (!list_empty_careful(&rrd->ep_wait.entry)) {
+		write_lock(&ep->lock);
+		__remove_wait_queue(&ep->wq, &rrd->ep_wait);
+		write_unlock(&ep->lock);
+	}
+}
+
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+	int eavail;
+	int res = 0;
+
+	res = ep_send_events(ep, rcc->events, rcc->maxevents);
+	if (res > 0)
+		ep_suspend_napi_irqs(ep);
+
+	eavail = ep_events_available(ep);
+	if (!eavail) {
+		atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+		/* check again to avoid data race on RPAL_KERNEL_PENDING */
+		eavail = ep_events_available(ep);
+		if (eavail)
+			atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+	}
+	return res;
+}
+
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+					       const enum hrtimer_mode mode,
+					       clockid_t clock_id)
+{
+	struct hrtimer_sleeper *t = &current->rpal_rd->ep_sleeper;
+
+	/*
+	 * Optimize when a zero timeout value is given. It does not
+	 * matter whether this is an absolute or a relative time.
+	 */
+	if (expires && *expires == 0) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	/*
+	 * A NULL parameter means "infinite"
+	 */
+	if (!expires) {
+		schedule();
+		return -EINTR;
+	}
+
+	hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+	hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+	hrtimer_sleeper_start_expires(t, mode);
+
+	if (likely(t->task))
+		schedule();
+
+	hrtimer_cancel(&t->timer);
+	destroy_hrtimer_on_stack(&t->timer);
+
+	__set_current_state(TASK_RUNNING);
+
+	return !t->task ? 0 : -EINTR;
+}
+
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+			int maxevents, struct timespec64 *timeout)
+{
+	int res = 0, eavail, timed_out = 0;
+	u64 slack = 0;
+	struct rpal_receiver_data *rrd = current->rpal_rd;
+	wait_queue_entry_t *wait = &rrd->ep_wait;
+	ktime_t expires, *to = NULL;
+
+	rrd->ep = ep;
+
+	lockdep_assert_irqs_enabled();
+
+	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+		slack = select_estimate_accuracy(timeout);
+		to = &expires;
+		*to = timespec64_to_ktime(*timeout);
+	} else if (timeout) {
+		timed_out = 1;
+	}
+
+	eavail = ep_events_available(ep);
+
+	while (1) {
+		if (eavail) {
+			res = rpal_try_send_events(ep, rrd->rcc);
+			if (res) {
+				atomic_xchg(&rrd->rcc->receiver_state,
+					    RPAL_RECEIVER_STATE_RUNNING);
+				return res;
+			}
+		}
+
+		if (timed_out) {
+			atomic_xchg(&rrd->rcc->receiver_state,
+				    RPAL_RECEIVER_STATE_RUNNING);
+			return 0;
+		}
+
+		eavail = ep_busy_loop(ep);
+		if (eavail)
+			continue;
+
+		if (signal_pending(current)) {
+			atomic_xchg(&rrd->rcc->receiver_state,
+				    RPAL_RECEIVER_STATE_RUNNING);
+			return -EINTR;
+		}
+
+		init_wait(wait);
+		wait->func = rpal_ep_autoremove_wake_function;
+		wait->private = rrd;
+		write_lock_irq(&ep->lock);
+
+		atomic_xchg(&rrd->rcc->receiver_state,
+			    RPAL_RECEIVER_STATE_READY);
+		__set_current_state(TASK_INTERRUPTIBLE);
+
+		eavail = ep_events_available(ep);
+		if (!eavail)
+			__add_wait_queue_exclusive(&ep->wq, wait);
+
+		write_unlock_irq(&ep->lock);
+
+		if (!eavail && ep_schedule_timeout(to)) {
+			if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+				timed_out = 1;
+			} else {
+				timed_out =
+					!rpal_schedule_hrtimeout_range_clock(
+						to, slack, HRTIMER_MODE_ABS,
+						CLOCK_MONOTONIC);
+			}
+		}
+		atomic_cmpxchg(&rrd->rcc->receiver_state,
+			       RPAL_RECEIVER_STATE_READY,
+			       RPAL_RECEIVER_STATE_RUNNING);
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * We were woken up, thus go and try to harvest some events.
+		 * If timed out and still on the wait queue, recheck eavail
+		 * carefully under lock, below.
+		 */
+		eavail = 1;
+
+		if (!list_empty_careful(&wait->entry)) {
+			write_lock_irq(&ep->lock);
+			/*
+			 * If the thread timed out and is not on the wait queue,
+			 * it means that the thread was woken up after its
+			 * timeout expired before it could reacquire the lock.
+			 * Thus, when wait.entry is empty, it needs to harvest
+			 * events.
+			 */
+			if (timed_out)
+				eavail = list_empty(&wait->entry);
+			__remove_wait_queue(&ep->wq, wait);
+			write_unlock_irq(&ep->lock);
+		}
+	}
+}
+#endif
+
 /**
  * ep_loop_check_proc - verify that adding an epoll file inside another
  *                      epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 	ep = fd_file(f)->private_data;
 
 	/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+	/*
+	 * For an RPAL task that is a receiver and has set the MAGIC value in
+	 * shared memory, we consider it prepared for RPAL calls and therefore
+	 * handle it differently.
+	 *
+	 * In all other cases, an RPAL task behaves like a normal task.
+	 */
+	if (rpal_current_service() &&
+	    rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+	    current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+		current->rpal_rd->f = f;
+		return rpal_ep_poll(ep, events, maxevents, to);
+	} else {
+		return ep_poll(ep, events, maxevents, to);
+	}
+#else
 	return ep_poll(ep, events, maxevents, to);
+#endif
 }
 
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
 #include <linux/hashtable.h>
 #include <linux/atomic.h>
 #include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
 
 #define RPAL_ERROR_MSG "rpal error: "
 #define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
 };
 
 #define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
 
 #define RPAL_SID_SHIFT 24
 #define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
 #define RPAL_PKRU_UNION 1
 #define RPAL_PKRU_INTERSECT 2
 
+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
 extern unsigned long rpal_cap;
 
 enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
 	int receiver_id;
 	atomic_t receiver_state;
 	atomic_t sender_state;
+	atomic_t ep_pending;
+	int rpal_ep_poll_magic;
+	int epfd;
+	void __user *events;
+	int maxevents;
+	int timeout;
 };
 
 /* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
 	struct rpal_shared_page *rsp;
 	struct rpal_receiver_call_context *rcc;
 	struct task_struct *sender;
+	void *ep;
+	struct fd f;
+	struct hrtimer_sleeper ep_sleeper;
+	wait_queue_entry_t ep_wait;
 };
 
 struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
 asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
 int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
 void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+				     unsigned int mode, int wake_flags,
+				     void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #define SM_RTLOCK_WAIT		2
 
 #ifdef CONFIG_RPAL
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+				     unsigned int mode, int wake_flags,
+				     void *key)
+{
+	struct rpal_receiver_data *rrd = curr->private;
+	struct task_struct *tsk = rrd->rcd.bp_task;
+	int ret;
+
+	ret = try_to_wake_up(tsk, mode, wake_flags);
+
+	list_del_init_careful(&curr->entry);
+	if (!ret)
+		atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+	return 1;
+}
+
 static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
 {
 	if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
-- 
2.20.1
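
Illustrative note (not part of the patch): the sketch below shows how a
receiver thread might opt in to the RPAL epoll path from user space. It
assumes a hypothetical helper rpal_get_call_context() that returns the
thread's mapping of its rpal_receiver_call_context in the RPAL shared page,
and the struct here only mirrors the fields this patch adds, not the real
shared-page layout. Whether user space or the kernel fills in epfd, events,
maxevents and timeout is not visible in this patch, so that mirroring is an
assumption as well.

/* Hypothetical user-space sketch; struct layout and helper are illustrative. */
#include <stdatomic.h>
#include <sys/epoll.h>

#define RPAL_EP_POLL_MAGIC 0xCC98CC98

struct rpal_receiver_call_context {
	/* ... other fields elided ... */
	atomic_int ep_pending;
	int rpal_ep_poll_magic;
	int epfd;
	struct epoll_event *events;
	int maxevents;
	int timeout;
};

/* Hypothetical: returns this thread's call context in the RPAL shared page. */
struct rpal_receiver_call_context *rpal_get_call_context(void);

static int rpal_receiver_epoll_wait(int epfd, struct epoll_event *events,
				    int maxevents, int timeout_ms)
{
	struct rpal_receiver_call_context *rcc = rpal_get_call_context();

	/*
	 * Mirror the arguments into the shared call context so that
	 * rpal_try_send_events() can deliver events into the same buffer.
	 */
	rcc->epfd = epfd;
	rcc->events = events;
	rcc->maxevents = maxevents;
	rcc->timeout = timeout_ms;

	/* The magic value makes do_epoll_wait() take rpal_ep_poll(). */
	rcc->rpal_ep_poll_magic = RPAL_EP_POLL_MAGIC;

	return epoll_wait(epfd, events, maxevents, timeout_ms);
}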