From 777e0b509cab5f1c145071916b8172261e2ef034 Mon Sep 17 00:00:00 2001
From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>
Date: Wed, 4 Oct 2023 19:43:51 -0400
Subject: [PATCH] CHROMIUM: sched/core: boost/unboost in guest scheduler

RT or higher-priority tasks in the guest are considered critical
workloads, and the guest scheduler can request a boost/unboost on a
task switch and/or a task wakeup. Also share the preempt status of the
guest vcpu with the host so that the host can decide on boost/unboost.

We reuse the out-of-line preempt_count_{add,sub} functions, as already
provided for CONFIG_TRACE_PREEMPT_TOGGLE, to track the preemption
state. Another option would be to update the preempt_count_{add,sub}
macros, but that would mean more code churn and complexity.

Boost requests are lazy, but unboost requests are synchronous.
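
For illustration, the request path reduces to roughly the following
(condensed from __pv_sched_vcpu_attr_update() in the kernel/sched/core.c
hunk below; pv_sched_request() is only a name used for this sketch, not
a function added by the patch):

  /*
   * Sketch: publish the desired attributes in the shared per-vCPU area
   * and, only for synchronous requests, write the MSR to force a VMEXIT
   * so the host acts on the request immediately.
   */
  static void pv_sched_request(union vcpu_sched_attr attr, bool lazy)
  {
  	this_cpu_write(pv_sched.attr[PV_SCHEDATTR_GUEST].pad, attr.pad);
  	if (!lazy)
  		kvm_pv_sched_notify_host(); /* wrmsrl(MSR_KVM_PV_SCHED, ULLONG_MAX) */
  }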

Detect the feature in the guest from CPUID flags and use the MSR to
pass the GPA of the memory location used for sharing scheduling
information.
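
For reference, a sketch of the per-vCPU shared area whose GPA is
registered through MSR_KVM_PV_SCHED (see kvm_guest_cpu_init() below).
Field names are taken from kernel/sched/core.c; the widths, ordering and
the enabled/kern_cs semantics shown here are assumptions, since the
authoritative uapi definitions (including the MSR number and the CPUID
feature bit) are not part of this diff:

  #include <linux/types.h>

  /* Assumed layout only -- illustrative, not the real uapi header. */
  union vcpu_sched_attr {
  	struct {
  		__u8	enabled;	/* presumably set by the host once active */
  		__u8	kern_cs;	/* PVSCHED_KERNCS_BOOST_* flags */
  		__u8	sched_policy;
  		__s8	sched_nice;
  		__u32	rt_priority;
  	};
  	__u64	pad;	/* whole-union view used with this_cpu_{read,write}() */
  };

  enum {
  	PV_SCHEDATTR_GUEST,	/* attributes requested by the guest */
  	PV_SCHEDATTR_HOST,	/* attributes last applied by the host */
  	PV_SCHEDATTR_MAX,
  };

  struct pv_sched_data {
  	union vcpu_sched_attr attr[PV_SCHEDATTR_MAX];
  };
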
UPSTREAM-TASK=b:303645537
BUG=b:262267726
TEST=boot
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
(cherry picked from commit 704f0a4f10258a0650f3046ba6377b2d0dcc2b96)
Change-Id: I1df908aca326dab15e619ba87c348179e45f517d
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425469
Reviewed-by: Joel Fernandes <joelaf@google.com>
Tested-by: Vineeth Pillai <vineethrp@google.com>
Commit-Queue: Vineeth Pillai <vineethrp@google.com>
---
arch/x86/Kconfig | 13 ++++
arch/x86/include/asm/kvm_para.h | 7 ++
arch/x86/kernel/kvm.c | 16 ++++
include/linux/entry-common.h | 13 ++++
include/linux/preempt.h | 3 +-
include/linux/sched.h | 34 +++++++++
kernel/sched/core.c | 128 ++++++++++++++++++++++++++++++--
8 files changed, 244 insertions(+), 7 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d7122a1883e8265fb9be256bf8efa680d02191b..2c2bbc413b0fac52840daf5cceb235b2c8649923 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -838,6 +838,19 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time
+config PARAVIRT_SCHED
+ bool "Enable paravirt scheduling capability for guests"
+ depends on KVM_GUEST
+ default n
+ help
+ Paravirtualized scheduling facilitates the exchange of scheduling
+ related information between the host and guest through shared memory,
+ enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+ An illustrative use case involves dynamically boosting the priority of
+ a vCPU thread when the guest is executing a latency-sensitive workload
+ on that specific vCPU.
+ This config enables paravirt scheduling in the guest (VM).
+
config ARCH_CPUIDLE_HALTPOLL
def_bool n
prompt "Disable host haltpoll when loading haltpoll driver"
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 57bc74e112f20936d6ee2601443892ecc083b533..3473dd2915b5e30d7f92da7b4c7940c7bc7cb12a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -176,4 +176,11 @@ static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
}
#endif
+#ifdef CONFIG_PARAVIRT_SCHED
+static inline void kvm_pv_sched_notify_host(void)
+{
+ wrmsrl(MSR_KVM_PV_SCHED, ULLONG_MAX);
+}
+#endif
+
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 263f8aed4e2cf8b84575d21e4b4358b2924915b5..b9fbb30c3837b9d4dc847da43298770abbcc1a22 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -378,6 +378,14 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
+#ifdef CONFIG_PARAVIRT_SCHED
+ if (pv_sched_enabled()) {
+ unsigned long pa = pv_sched_pa() | KVM_MSR_ENABLED;
+
+ wrmsrl(MSR_KVM_PV_SCHED, pa);
+ }
+#endif
+
if (has_steal_clock)
kvm_register_steal_time();
}
@@ -834,6 +842,14 @@ static void __init kvm_guest_init(void)
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
}
+#ifdef CONFIG_PARAVIRT_SCHED
+ if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED)) {
+ pr_info("KVM host has PV_SCHED!\n");
+ pv_sched_enable();
+ } else
+ pr_info("KVM host does not support PV_SCHED!\n");
+#endif
+
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index b0fb775a600d9844aa6dbd7c6db70b1c7261829d..fa1bc37bba6460395a288b44bb9b12ec01a55089 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -13,6 +13,8 @@
#include <linux/tick.h>
#include <linux/kmsan.h>
+#include <linux/kvm_para.h>
+
#include <asm/entry-common.h>
/*
@@ -320,6 +322,17 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
+ /*
+ * The guest requests a boost when preemption is disabled but does not request
+ * an immediate unboost when preemption is re-enabled. There is a chance
+ * that we are still boosted here. Unboost if needed.
+ */
+ if (pv_sched_enabled()) {
+ pv_sched_vcpu_kerncs_unboost(PVSCHED_KERNCS_BOOST_ALL, true);
+ pv_sched_vcpu_update(current->policy, current->rt_priority,
+ task_nice(current), false);
+ }
+
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7233e9cf1bab60a46037172fa5468706c8e3a232..61897110440451067327eac583f3143b04d81fb4 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -191,7 +191,8 @@ static __always_inline unsigned char interrupt_context_level(void)
*/
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
-#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) || \
+ defined(CONFIG_PARAVIRT_SCHED)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
#define preempt_count_dec_and_test() \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ecd2d20481651e4cba2d8d60d0b816e3a03dae33..f2dd5413356fb5f31547245fa77fc014cf3088bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2222,4 +2222,38 @@ static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *ol
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#ifdef CONFIG_PARAVIRT_SCHED
+DECLARE_STATIC_KEY_FALSE(__pv_sched_enabled);
+
+extern unsigned long pv_sched_pa(void);
+
+static inline bool pv_sched_enabled(void)
+{
+ return static_branch_unlikely(&__pv_sched_enabled);
+}
+
+static inline void pv_sched_enable(void)
+{
+ static_branch_enable(&__pv_sched_enabled);
+}
+
+extern void pv_sched_vcpu_update(int policy, int prio, int nice, bool lazy);
+extern void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy);
+extern void pv_sched_vcpu_kerncs_boost_lazy(int boost_type);
+#else
+static inline bool pv_sched_enabled(void)
+{
+ return false;
+}
+
+static inline void pv_sched_enable(void) { }
+
+static inline void pv_sched_vcpu_update(int policy, int prio,
+ int nice, bool lazy)
+{
+}
+static inline void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy) { }
+static inline void pv_sched_vcpu_kerncs_boost_lazy(int boost_type) { }
+#endif
+
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 428a4f295e2e8bc3d2b6a2e60f53474d24090380..2f19f5f043646b18d1da8527e5c524dcbff026ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -155,6 +155,91 @@ unsigned int sysctl_iowait_apply_ticks = 10;
__read_mostly int scheduler_running;
+#ifdef CONFIG_PARAVIRT_SCHED
+#include <linux/kvm_para.h>
+
+DEFINE_STATIC_KEY_FALSE(__pv_sched_enabled);
+
+DEFINE_PER_CPU_DECRYPTED(struct pv_sched_data, pv_sched) __aligned(64);
+
+unsigned long pv_sched_pa(void)
+{
+ return slow_virt_to_phys(this_cpu_ptr(&pv_sched));
+}
+
+static inline int __normal_prio(int policy, int rt_prio, int nice);
+static inline int __pv_sched_equal_prio(union vcpu_sched_attr a1,
+ union vcpu_sched_attr a2)
+{
+ return (__normal_prio(a1.sched_policy, a1.rt_priority, a1.sched_nice) ==
+ __normal_prio(a2.sched_policy, a2.rt_priority, a2.sched_nice));
+}
+
+static inline void __pv_sched_vcpu_attr_update(union vcpu_sched_attr attr,
+ bool lazy)
+{
+ union vcpu_sched_attr status_attr;
+
+ status_attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_HOST].pad);
+ if (!status_attr.enabled || (status_attr.kern_cs == attr.kern_cs &&
+ __pv_sched_equal_prio(attr, status_attr)))
+ return;
+
+ this_cpu_write(pv_sched.attr[PV_SCHEDATTR_GUEST].pad, attr.pad);
+
+ if (!lazy)
+ kvm_pv_sched_notify_host();
+}
+
+void pv_sched_vcpu_update(int policy, int prio, int nice, bool lazy)
+{
+ union vcpu_sched_attr attr = {
+ .sched_policy = policy,
+ .rt_priority = prio,
+ .sched_nice = nice
+ };
+ attr.kern_cs = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].kern_cs);
+ __pv_sched_vcpu_attr_update(attr, lazy);
+}
+
+void pv_sched_vcpu_kerncs_boost_lazy(int boost_type)
+{
+ union vcpu_sched_attr attr;
+
+ attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].pad);
+ attr.kern_cs |= boost_type;
+ __pv_sched_vcpu_attr_update(attr, true);
+}
+
+void pv_sched_vcpu_kerncs_unboost(int boost_type, bool lazy)
+{
+ union vcpu_sched_attr attr;
+
+ attr.pad = this_cpu_read(pv_sched.attr[PV_SCHEDATTR_GUEST].pad);
+ attr.kern_cs &= ~boost_type;
+ __pv_sched_vcpu_attr_update(attr, lazy);
+}
+
+/*
+ * Share the preemption enabled/disabled status with the host. This does not incur
+ * a VMEXIT and acts as a lazy boost/unboost mechanism - the host checks it on the
+ * next VMEXIT and makes boost/unboost decisions then.
+ * XXX: Lazy unboosting may let CFS tasks run on an RT vcpu until the next VMEXIT.
+ */
+static inline void pv_sched_update_preempt_status(bool preempt_disabled)
+{
+ if (!pv_sched_enabled())
+ return;
+
+ if (preempt_disabled)
+ pv_sched_vcpu_kerncs_boost_lazy(PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED);
+ else
+ pv_sched_vcpu_kerncs_unboost(PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED, true);
+}
+#else
+static inline void pv_sched_update_preempt_status(bool preempt_disabled) {}
+#endif
+
#ifdef CONFIG_SCHED_CORE
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
@@ -2128,6 +2213,17 @@ unsigned long get_wchan(struct task_struct *p)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * TODO: requesting a boost for remote vcpus is not implemented yet, so
+ * we boost only if this enqueue happens on this cpu.
+ * This is not a big problem though: the target cpu gets an IPI and is then
+ * boosted by the host. Posted interrupts are an exception where the target
+ * vcpu is not boosted immediately, but only on its next schedule().
+ */
+ if (pv_sched_enabled() && this_rq() == rq &&
+ sched_class_above(p->sched_class, &fair_sched_class))
+ pv_sched_vcpu_update(p->policy, p->rt_priority, 0, true);
+
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5866,8 +5962,14 @@ static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_TRACE_PREEMPT_TOGGLE))
+/*
+ * A PARAVIRT_SCHED-enabled guest passes the preemption state to the host,
+ * so we piggyback on the following functions to keep track of the
+ * preemption state.
+ */
+#if (defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) || \
+ defined(CONFIG_PARAVIRT_SCHED))
/*
* If the value passed in is equal to the current preempt count
* then we just disabled preemption. Start timing the latency.
@@ -5875,11 +5977,12 @@ static inline void sched_tick_stop(int cpu) { }
static inline void preempt_latency_start(int val)
{
if (preempt_count() == val) {
- unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = ip;
+ current->preempt_disable_ip = get_lock_parent_ip();
#endif
- trace_preempt_off(CALLER_ADDR0, ip);
+ pv_sched_update_preempt_status(true);
+
+ trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
}
}
@@ -5911,8 +6014,10 @@ NOKPROBE_SYMBOL(preempt_count_add);
*/
static inline void preempt_latency_stop(int val)
{
- if (preempt_count() == val)
+ if (preempt_count() == val) {
+ pv_sched_update_preempt_status(false);
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+ }
}
void preempt_count_sub(int val)
@@ -6732,6 +6837,17 @@ static void __sched notrace __schedule(unsigned int sched_mode)
rq->last_seen_need_resched_ns = 0;
#endif
+ if (pv_sched_enabled()) {
+ bool lazy = true;
+
+ /*
+ * Synchronous unboost.
+ */
+ if (task_is_realtime(next) || NICE_TO_PRIO(task_nice(next)) < DEFAULT_PRIO)
+ lazy = false;
+ pv_sched_vcpu_update(next->policy, next->rt_priority, task_nice(next), lazy);
+ }
+
if (likely(prev != next)) {
rq->nr_switches++;
/*
--
2.45.2.803.g4e1b14247a-goog