| From 777e0b509cab5f1c145071916b8172261e2ef034 Mon Sep 17 00:00:00 2001 |
| From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org> |
| Date: Wed, 4 Oct 2023 19:43:51 -0400 |
| Subject: [PATCH] CHROMIUM: sched/core: boost/unboost in guest scheduler |
| |
| RT or higher priority tasks in the guest are considered critical workloads, |
| and the guest scheduler can request a boost/unboost on a task switch and/or |
| a task wakeup. Also share the preempt status of the guest vcpu with the |
| host so that the host can make boost/unboost decisions. |
| |
| We use the function versions of preempt_count_{add,sub} (the ones defined |
| when CONFIG_TRACE_PREEMPT_TOGGLE is enabled) to track the preemption state. |
| Another option would be to update the preempt_count_{add,sub} macros, but |
| that would mean more code churn and complexity. |
| |
| Boost requests are lazy, but unboost requests are synchronous. |
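| |
| To illustrate the distinction, here is a simplified sketch of the update |
| helper added to kernel/sched/core.c below (the real helper also skips |
| updates that do not change the host-visible attributes): |
| |
|   static void __pv_sched_vcpu_attr_update(union vcpu_sched_attr attr, bool lazy) |
|   { |
|           /* Record the request in the per-cpu area shared with the host. */ |
|           this_cpu_write(pv_sched.attr[PV_SCHEDATTR_GUEST].pad, attr.pad); |
| |
|           /* A synchronous request forces a VMEXIT via MSR_KVM_PV_SCHED. */ |
|           if (!lazy) |
|                   kvm_pv_sched_notify_host(); |
|   } |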
| |
| Detect the feature in the guest from CPUID flags and use the MSR to pass |
| the GPA of the memory location used for sharing scheduling information. |
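| |
| For reference, the layout of the shared per-vcpu area is assumed to be |
| roughly the following; the field names are inferred from the accessors in |
| this patch, while the actual definitions (together with MSR_KVM_PV_SCHED, |
| KVM_FEATURE_PV_SCHED and the PVSCHED_KERNCS_BOOST_* flags) are in a header |
| not shown in this excerpt: |
| |
|   union vcpu_sched_attr { |
|           struct { |
|                   __u8  enabled;      /* interface enabled by the host */ |
|                   __u8  kern_cs;      /* PVSCHED_KERNCS_BOOST_* flags */ |
|                   __u8  sched_policy; |
|                   __s8  sched_nice; |
|                   __u32 rt_priority; |
|           }; |
|           __u64 pad;                  /* whole-attr load/store */ |
|   }; |
| |
|   struct pv_sched_data { |
|           /* indexed by PV_SCHEDATTR_GUEST / PV_SCHEDATTR_HOST */ |
|           union vcpu_sched_attr attr[2]; |
|   }; |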
| |
| UPSTREAM-TASK=b:303645537 |
| BUG=b:262267726 |
| TEST=boot |
| |
| Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org> |
| Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org> |
| Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org> |
| (cherry picked from commit 704f0a4f10258a0650f3046ba6377b2d0dcc2b96) |
| |
| Change-Id: I1df908aca326dab15e619ba87c348179e45f517d |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425469 |
| Reviewed-by: Joel Fernandes <joelaf@google.com> |
| Tested-by: Vineeth Pillai <vineethrp@google.com> |
| Commit-Queue: Vineeth Pillai <vineethrp@google.com> |
| --- |
| arch/x86/Kconfig | 13 ++++ |
| arch/x86/include/asm/kvm_para.h | 7 ++ |
| arch/x86/kernel/kvm.c | 16 ++++ |
| include/linux/entry-common.h | 13 ++++ |
| include/linux/preempt.h | 3 +- |
| include/linux/sched.h | 34 +++++++++ |
| kernel/sched/core.c | 128 ++++++++++++++++++++++++++++++-- |
| 8 files changed, 244 insertions(+), 7 deletions(-) |
| |
| diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig |
| index 1d7122a1883e8265fb9be256bf8efa680d02191b..2c2bbc413b0fac52840daf5cceb235b2c8649923 100644 |
| --- a/arch/x86/Kconfig |
| +++ b/arch/x86/Kconfig |
| @@ -838,6 +838,19 @@ config KVM_GUEST |
| underlying device model, the host provides the guest with |
| timing infrastructure such as time of day, and system time |
| |
| +config PARAVIRT_SCHED |
| + bool "Enable paravirt scheduling capability for guests" |
| + depends on KVM_GUEST |
| + default n |
| + help |
| + Paravirtualized scheduling facilitates the exchange of scheduling |
| + related information between the host and guest through shared memory, |
| + enhancing the efficiency of vCPU thread scheduling by the hypervisor. |
| + An illustrative use case involves dynamically boosting the priority of |
| + a vCPU thread when the guest is executing a latency-sensitive workload |
| + on that specific vCPU. |
| + This config enables paravirt scheduling in the guest (VM). |
| + |
| config ARCH_CPUIDLE_HALTPOLL |
| def_bool n |
| prompt "Disable host haltpoll when loading haltpoll driver" |
| diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h |
| index 57bc74e112f20936d6ee2601443892ecc083b533..3473dd2915b5e30d7f92da7b4c7940c7bc7cb12a 100644 |
| --- a/arch/x86/include/asm/kvm_para.h |
| +++ b/arch/x86/include/asm/kvm_para.h |
| @@ -176,4 +176,11 @@ static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) |
| } |
| #endif |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED |
| +static inline void kvm_pv_sched_notify_host(void) |
| +{ |
| + wrmsrl(MSR_KVM_PV_SCHED, ULLONG_MAX); |
| +} |
| +#endif |
| + |
| #endif /* _ASM_X86_KVM_PARA_H */ |
| diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c |
| index 263f8aed4e2cf8b84575d21e4b4358b2924915b5..b9fbb30c3837b9d4dc847da43298770abbcc1a22 100644 |
| --- a/arch/x86/kernel/kvm.c |
| +++ b/arch/x86/kernel/kvm.c |
| @@ -378,6 +378,14 @@ static void kvm_guest_cpu_init(void) |
| wrmsrl(MSR_KVM_PV_EOI_EN, pa); |
| } |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED |
| + if (pv_sched_enabled()) { |
| + unsigned long pa = pv_sched_pa() | KVM_MSR_ENABLED; |
| + |
| + wrmsrl(MSR_KVM_PV_SCHED, pa); |
| + } |
| +#endif |
| + |
| if (has_steal_clock) |
| kvm_register_steal_time(); |
| } |
| @@ -834,6 +842,14 @@ static void __init kvm_guest_init(void) |
| sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); |
| } |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED |
| + if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED)) { |
| + pr_info("KVM host has PV_SCHED!\n"); |
| + pv_sched_enable(); |
| + } else |
| + pr_info("KVM host does not support PV_SCHED!\n"); |
| +#endif |
| + |
| #ifdef CONFIG_SMP |
| if (pv_tlb_flush_supported()) { |
| pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; |
| diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h |
| index b0fb775a600d9844aa6dbd7c6db70b1c7261829d..fa1bc37bba6460395a288b44bb9b12ec01a55089 100644 |
| --- a/include/linux/entry-common.h |
| +++ b/include/linux/entry-common.h |
| @@ -13,6 +13,8 @@ |
| #include <linux/tick.h> |
| #include <linux/kmsan.h> |
| |
| +#include <linux/kvm_para.h> |
| + |
| #include <asm/entry-common.h> |
| |
| /* |
| @@ -320,6 +322,17 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) |
| |
| lockdep_assert_irqs_disabled(); |
| |
| + /* |
| + * The guest requests a boost when preemption is disabled, but does not |
| + * request an immediate unboost when preemption is re-enabled. There is a |
| + * chance that we are still boosted here. Unboost if needed. |
| + */ |
| + if (pv_sched_enabled()) { |
| + pv_sched_vcpu_kerncs_unboost(PVSCHED_KERNCS_BOOST_ALL, true); |
| + pv_sched_vcpu_update(current->policy, current->rt_priority, |
| + task_nice(current), false); |
| + } |
| + |
| /* Flush pending rcuog wakeup before the last need_resched() check */ |
| tick_nohz_user_enter_prepare(); |
| |
| diff --git a/include/linux/preempt.h b/include/linux/preempt.h |
| index 7233e9cf1bab60a46037172fa5468706c8e3a232..61897110440451067327eac583f3143b04d81fb4 100644 |
| --- a/include/linux/preempt.h |
| +++ b/include/linux/preempt.h |
| @@ -191,7 +191,8 @@ static __always_inline unsigned char interrupt_context_level(void) |
| */ |
| #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) |
| |
| -#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) |
| +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) || \ |
| + defined(CONFIG_PARAVIRT_SCHED) |
| extern void preempt_count_add(int val); |
| extern void preempt_count_sub(int val); |
| #define preempt_count_dec_and_test() \ |
| diff --git a/include/linux/sched.h b/include/linux/sched.h |
| index ecd2d20481651e4cba2d8d60d0b816e3a03dae33..f2dd5413356fb5f31547245fa77fc014cf3088bc 100644 |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -2222,4 +2222,38 @@ static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *ol |
| #define alloc_tag_restore(_tag, _old) do {} while (0) |
| #endif |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED |
| +DECLARE_STATIC_KEY_FALSE(__pv_sched_enabled); |
| + |
| +extern unsigned long pv_sched_pa(void); |
| + |
| +static inline bool pv_sched_enabled(void) |
| +{ |
| + return static_branch_unlikely(&__pv_sched_enabled); |
| +} |
| + |
| +static inline void pv_sched_enable(void) |
| +{ |
| + static_branch_enable(&__pv_sched_enabled); |
| +} |
| + |
| +extern void pv_sched_vcpu_update(int policy, int prio, int nice, bool lazy); |
| +extern void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy); |
| +extern void pv_sched_vcpu_kerncs_boost_lazy(int boost_type); |
| +#else |
| +static inline bool pv_sched_enabled(void) |
| +{ |
| + return false; |
| +} |
| + |
| +static inline void pv_sched_enable(void) { } |
| + |
| +static inline void pv_sched_vcpu_update(int policy, int prio, |
| + int nice, bool lazy) |
| +{ |
| +} |
| +static inline void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy) { } |
| +static inline void pv_sched_vcpu_kerncs_boost_lazy(int boost_type) { } |
| +#endif |
| + |
| #endif |
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
| index 428a4f295e2e8bc3d2b6a2e60f53474d24090380..2f19f5f043646b18d1da8527e5c524dcbff026ea 100644 |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -155,6 +155,91 @@ unsigned int sysctl_iowait_apply_ticks = 10; |
| |
| __read_mostly int scheduler_running; |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED |
| +#include <linux/kvm_para.h> |
| + |
| +DEFINE_STATIC_KEY_FALSE(__pv_sched_enabled); |
| + |
| +DEFINE_PER_CPU_DECRYPTED(struct pv_sched_data, pv_sched) __aligned(64); |
| + |
| +unsigned long pv_sched_pa(void) |
| +{ |
| + return slow_virt_to_phys(this_cpu_ptr(&pv_sched)); |
| +} |
| + |
| +static inline int __normal_prio(int policy, int rt_prio, int nice); |
| +static inline int __pv_sched_equal_prio(union vcpu_sched_attr a1, |
| + union vcpu_sched_attr a2) |
| +{ |
| + return (__normal_prio(a1.sched_policy, a1.rt_priority, a1.sched_nice) == |
| + __normal_prio(a2.sched_policy, a2.rt_priority, a2.sched_nice)); |
| +} |
| + |
| +static inline void __pv_sched_vcpu_attr_update(union vcpu_sched_attr attr, |
| + bool lazy) |
| +{ |
| + union vcpu_sched_attr status_attr; |
| + |
| + status_attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_HOST].pad); |
| + if (!status_attr.enabled || (status_attr.kern_cs == attr.kern_cs && |
| + __pv_sched_equal_prio(attr, status_attr))) |
| + return; |
| + |
| + this_cpu_write(pv_sched.attr[PV_SCHEDATTR_GUEST].pad, attr.pad); |
| + |
| + if (!lazy) |
| + kvm_pv_sched_notify_host(); |
| +} |
| + |
| +void pv_sched_vcpu_update(int policy, int prio, int nice, bool lazy) |
| +{ |
| + union vcpu_sched_attr attr = { |
| + .sched_policy = policy, |
| + .rt_priority = prio, |
| + .sched_nice = nice |
| + }; |
| + attr.kern_cs = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].kern_cs); |
| + __pv_sched_vcpu_attr_update(attr, lazy); |
| +} |
| + |
| +void pv_sched_vcpu_kerncs_boost_lazy(int boost_type) |
| +{ |
| + union vcpu_sched_attr attr; |
| + |
| + attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].pad); |
| + attr.kern_cs |= boost_type; |
| + __pv_sched_vcpu_attr_update(attr, true); |
| +} |
| + |
| +void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy) |
| +{ |
| + union vcpu_sched_attr attr; |
| + |
| + attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].pad); |
| + attr.kern_cs &= ~boost_type; |
| + __pv_sched_vcpu_attr_update(attr, lazy); |
| +} |
| + |
| +/* |
| + * Share the preemption enabled/disabled status with the host. This does not |
| + * incur a VMEXIT and acts as a lazy boost/unboost mechanism - the host will |
| + * check this on its next VMEXIT when making boost/unboost decisions. |
| + * XXX: Lazy unboosting may let CFS tasks run on an RT vcpu until the next VMEXIT. |
| + */ |
| +static inline void pv_sched_update_preempt_status(bool preempt_disabled) |
| +{ |
| + if (!pv_sched_enabled()) |
| + return; |
| + |
| + if (preempt_disabled) |
| + pv_sched_vcpu_kerncs_boost_lazy(PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED); |
| + else |
| + pv_sched_vcpu_kerncs_unboost(PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED, true); |
| +} |
| +#else |
| +static inline void pv_sched_update_preempt_status(bool preempt_disabled) {} |
| +#endif |
| + |
| #ifdef CONFIG_SCHED_CORE |
| |
| DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); |
| @@ -2128,6 +2213,17 @@ unsigned long get_wchan(struct task_struct *p) |
| |
| static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
| { |
| + /* |
| + * TODO: requesting a boost for remote vcpus is not implemented yet, so we |
| + * boost only if this enqueue happens on this cpu. |
| + * This is not a big problem though: the target cpu gets an IPI and is then |
| + * boosted by the host. Posted interrupts are an exception, where the target |
| + * vcpu does not get boosted immediately, but only on the next schedule(). |
| + */ |
| + if (pv_sched_enabled() && this_rq() == rq && |
| + sched_class_above(p->sched_class, &fair_sched_class)) |
| + pv_sched_vcpu_update(p->policy, p->rt_priority, 0, true); |
| + |
| if (!(flags & ENQUEUE_NOCLOCK)) |
| update_rq_clock(rq); |
| |
| @@ -5866,8 +5962,14 @@ static inline void sched_tick_start(int cpu) { } |
| static inline void sched_tick_stop(int cpu) { } |
| #endif |
| |
| -#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| - defined(CONFIG_TRACE_PREEMPT_TOGGLE)) |
| +/* |
| + * A PARAVIRT_SCHED-enabled guest passes the preemption state to the host, |
| + * so we piggyback on the following functions to keep track of the |
| + * preemption state. |
| + */ |
| +#if (defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| + defined(CONFIG_TRACE_PREEMPT_TOGGLE)) || \ |
| + defined(CONFIG_PARAVIRT_SCHED)) |
| /* |
| * If the value passed in is equal to the current preempt count |
| * then we just disabled preemption. Start timing the latency. |
| @@ -5875,11 +5977,12 @@ static inline void sched_tick_stop(int cpu) { } |
| static inline void preempt_latency_start(int val) |
| { |
| if (preempt_count() == val) { |
| - unsigned long ip = get_lock_parent_ip(); |
| #ifdef CONFIG_DEBUG_PREEMPT |
| - current->preempt_disable_ip = ip; |
| + current->preempt_disable_ip = get_lock_parent_ip(); |
| #endif |
| - trace_preempt_off(CALLER_ADDR0, ip); |
| + pv_sched_update_preempt_status(true); |
| + |
| + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); |
| } |
| } |
| |
| @@ -5911,8 +6014,10 @@ NOKPROBE_SYMBOL(preempt_count_add); |
| */ |
| static inline void preempt_latency_stop(int val) |
| { |
| - if (preempt_count() == val) |
| + if (preempt_count() == val) { |
| + pv_sched_update_preempt_status(false); |
| trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); |
| + } |
| } |
| |
| void preempt_count_sub(int val) |
| @@ -6732,6 +6837,17 @@ static void __sched notrace __schedule(unsigned int sched_mode) |
| rq->last_seen_need_resched_ns = 0; |
| #endif |
| |
| + if (pv_sched_enabled()) { |
| + int lazy = true; |
| + |
| + /* |
| + * Update synchronously when the next task needs a boost (RT or high priority). |
| + */ |
| + if (task_is_realtime(next) || NICE_TO_PRIO(task_nice(next)) < DEFAULT_PRIO) |
| + lazy = false; |
| + pv_sched_vcpu_update(next->policy, next->rt_priority, task_nice(next), lazy); |
| + } |
| + |
| if (likely(prev != next)) { |
| rq->nr_switches++; |
| /* |
| -- |
| 2.45.2.803.g4e1b14247a-goog |
| |