The implementation of lazy switch differs from a regular schedule() in
three key aspects:

1. It occurs at kernel entry with IRQs disabled.
2. The next task is explicitly pre-determined rather than selected by
   the scheduler.
3. The user-space context (excluding general-purpose registers) remains
   unchanged across the switch.

This patch introduces the rpal_schedule() interface to address these
requirements. First, rpal_schedule() skips IRQ enabling in
finish_lock_switch(), preserving the IRQ-disabled state required at
kernel entry. Second, the rpal_pick_next_task() interface is used to
explicitly specify the target task, bypassing the scheduler's default
decision-making. Third, non-general-purpose registers (e.g., FPU and
vector registers) are not restored during the switch, so the user-space
context remains intact. General-purpose registers are handled by RPAL
before rpal_schedule() is invoked; this is addressed in a subsequent
patch.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
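Note (not part of the commit): a minimal sketch of the intended caller on
the lazy-switch path, to illustrate the three aspects above. It assumes a
hypothetical helper rpal_lazy_switch_target() that resolves the
pre-determined receiver task; the actual lookup and the general-purpose
register handling are outside this patch.

	/*
	 * Sketch only: runs at kernel entry while IRQs are still disabled.
	 * rpal_lazy_switch_target() is a placeholder, not an interface
	 * added by this patch.
	 */
	static void rpal_lazy_switch_example(void)
	{
		struct task_struct *next = rpal_lazy_switch_target(current);

		lockdep_assert_irqs_disabled();	/* aspect 1: IRQs stay off */

		/*
		 * General-purpose registers were already saved by RPAL in
		 * user space; FPU/vector state is deliberately left
		 * untouched (aspect 3), and "next" is fixed in advance
		 * (aspect 2).
		 */
		rpal_schedule(next);
	}
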
 arch/x86/kernel/process_64.c |  75 +++++++++++++++++++++
 include/linux/rpal.h         |   3 +
 kernel/sched/core.c          | 126 +++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4830e9215de7..efc3f238c486 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -753,6 +753,81 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	return prev_p;
 }
 
+#ifdef CONFIG_RPAL
+__no_kmsan_checks
+__visible __notrace_funcgraph struct task_struct *
+__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+	struct thread_struct *prev = &prev_p->thread;
+	struct thread_struct *next = &next_p->thread;
+	int cpu = smp_processor_id();
+
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+		     this_cpu_read(hardirq_stack_inuse));
+
+	/* no need to switch fpu */
+	/* __fpu_invalidate_fpregs_state() */
+	x86_task_fpu(prev_p)->last_cpu = -1;
+	/* fpregs_activate() */
+	__this_cpu_write(fpu_fpregs_owner_ctx, x86_task_fpu(next_p));
+	trace_x86_fpu_regs_activated(x86_task_fpu(next_p));
+	x86_task_fpu(next_p)->last_cpu = cpu;
+	set_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD);
+	clear_tsk_thread_flag(next_p, TIF_NEED_FPU_LOAD);
+
+	/* no need to save fs */
+	savesegment(gs, prev_p->thread.gsindex);
+	if (static_cpu_has(X86_FEATURE_FSGSBASE))
+		prev_p->thread.gsbase = __rdgsbase_inactive();
+	else
+		save_base_legacy(prev_p, prev_p->thread.gsindex, GS);
+
+	load_TLS(next, cpu);
+
+	arch_end_context_switch(next_p);
+
+	savesegment(es, prev->es);
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es);
+
+	savesegment(ds, prev->ds);
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
+	/* no need to load fs */
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		if (unlikely(prev->gsindex || next->gsindex))
+			loadseg(GS, next->gsindex);
+
+		__wrgsbase_inactive(next->gsbase);
+	} else {
+		load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex,
+				next->gsbase, GS);
+	}
+
+	/* skip pkru load as we will use pkru in RPAL */
+
+	this_cpu_write(current_task, next_p);
+	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+
+	/* no need to load fpu */
+
+	update_task_stack(next_p);
+	switch_to_extra(prev_p, next_p);
+
+	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+		unsigned short ss_sel;
+
+		savesegment(ss, ss_sel);
+		if (ss_sel != __KERNEL_DS)
+			loadsegment(ss, __KERNEL_DS);
+	}
+	resctrl_sched_in(next_p);
+
+	return prev_p;
+}
+#endif
+
 void set_personality_64bit(void)
 {
 	/* inherit personality from parent */
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 45137770fac6..0813db4552c0 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -487,4 +487,7 @@ int rpal_try_to_wake_up(struct task_struct *p);
 int rpal_init_thread_pending(struct rpal_common_data *rcd);
 void rpal_free_thread_pending(struct rpal_common_data *rcd);
 int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
+void rpal_schedule(struct task_struct *next);
+asmlinkage struct task_struct *
+__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e76376c5172..760d88458b39 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6827,6 +6827,12 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
 	if (unlikely(is_special_task_state(task_state)))
 		flags |= DEQUEUE_SPECIAL;
 
+#ifdef CONFIG_RPAL
+	/* DELAY_DEQUEUE will cause CPU stalls after lazy switch, skip it */
+	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT))
+		flags |= DEQUEUE_SPECIAL;
+#endif
+
 	/*
 	 * __schedule()			ttwu()
 	 *   prev_state = prev->state;    if (p->on_rq && ...)
@@ -11005,6 +11011,62 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
 #ifdef CONFIG_RPAL
+static struct rq *rpal_finish_task_switch(struct task_struct *prev)
+	__releases(rq->lock)
+{
+	struct rq *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
+
+	rq->prev_mm = NULL;
+	vtime_task_switch(prev);
+	perf_event_task_sched_in(prev, current);
+	finish_task(prev);
+	tick_nohz_task_switch();
+
+	/* finish_lock_switch, not enable irq */
+	spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+	__balance_callbacks(rq);
+	raw_spin_rq_unlock(rq);
+
+	finish_arch_post_lock_switch();
+	kcov_finish_switch(current);
+	kmap_local_sched_in();
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm) {
+		membarrier_mm_sync_core_before_usermode(mm);
+		mmdrop(mm);
+	}
+
+	return rq;
+}
+
+static __always_inline struct rq *rpal_context_switch(struct rq *rq,
+						      struct task_struct *prev,
+						      struct task_struct *next,
+						      struct rq_flags *rf)
+{
+	/* irq is off */
+	prepare_task_switch(rq, prev, next);
+	arch_start_context_switch(prev);
+
+	membarrier_switch_mm(rq, prev->active_mm, next->mm);
+	switch_mm_irqs_off(prev->active_mm, next->mm, next);
+	lru_gen_use_mm(next->mm);
+
+	switch_mm_cid(rq, prev, next);
+
+	prepare_lock_switch(rq, next, rf);
+	__rpal_switch_to(prev, next);
+	barrier();
+	return rpal_finish_task_switch(prev);
+}
+
 #ifdef CONFIG_SCHED_CORE
 static inline struct task_struct *
 __rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
@@ -11214,4 +11276,68 @@ rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
 	BUG();
 }
 #endif
+
+/* enter and exit with irqs disabled */
+void __sched notrace rpal_schedule(struct task_struct *next)
+{
+	struct task_struct *prev, *picked;
+	bool preempt = false;
+	unsigned long *switch_count;
+	unsigned long prev_state;
+	struct rq_flags rf;
+	struct rq *rq;
+	int cpu;
+
+	/* sched_mode = SM_NONE */
+
+	preempt_disable();
+
+	trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
+	cpu = smp_processor_id();
+	rq = cpu_rq(cpu);
+	prev = rq->curr;
+
+	schedule_debug(prev, preempt);
+
+	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
+		hrtick_clear(rq);
+
+	rcu_note_context_switch(preempt);
+	rq_lock(rq, &rf);
+	smp_mb__after_spinlock();
+
+	rq->clock_update_flags <<= 1;
+	update_rq_clock(rq);
+	rq->clock_update_flags = RQCF_UPDATED;
+
+	switch_count = &prev->nivcsw;
+
+	prev_state = READ_ONCE(prev->__state);
+	if (prev_state) {
+		try_to_block_task(rq, prev, &prev_state);
+		switch_count = &prev->nvcsw;
+	}
+
+	picked = rpal_pick_next_task(rq, prev, next, &rf);
+	rq_set_donor(rq, next);
+	if (unlikely(next != picked))
+		panic("rpal error: next != picked\n");
+
+	clear_tsk_need_resched(prev);
+	clear_preempt_need_resched();
+	rq->last_seen_need_resched_ns = 0;
+
+	rq->nr_switches++;
+	RCU_INIT_POINTER(rq->curr, next);
+	++*switch_count;
+	migrate_disable_switch(rq, prev);
+	psi_account_irqtime(rq, prev, next);
+	psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+				     prev->se.sched_delayed);
+	trace_sched_switch(preempt, prev, next, prev_state);
+	rq = rpal_context_switch(rq, prev, next, &rf);
+	trace_sched_exit_tp(true, CALLER_ADDR0);
+	preempt_enable_no_resched();
+}
 #endif
-- 
2.20.1