From 2cc8ebf921a689a0a1d13eba734f053e92c74a78 Mon Sep 17 00:00:00 2001 From: Eric Naim Date: Mon, 9 Mar 2026 13:50:25 +0800 Subject: [PATCH] bore-cachy Signed-off-by: Eric Naim --- include/linux/sched.h | 28 +++ include/linux/sched/bore.h | 46 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/exit.c | 4 + kernel/fork.c | 13 + kernel/futex/waitwake.c | 11 + kernel/sched/Makefile | 1 + kernel/sched/bore.c | 470 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 + kernel/sched/debug.c | 65 +++++ kernel/sched/fair.c | 129 ++++++++-- kernel/sched/features.h | 7 + kernel/sched/sched.h | 9 + 14 files changed, 809 insertions(+), 20 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..8ec2365e1c02 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -817,6 +817,31 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + union { + struct { + u64 timestamp: 48; + u64 penalty: 16; + }; + u64 value; + }; +}; + +struct bore_ctx { + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + u16 penalty; + bool stop_update; + bool futex_waiting; + struct bore_bc subtree; + struct bore_bc group; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -875,6 +900,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 000000000000..0d433c4ef279 --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,46 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.7.3" + +extern u8 __read_mostly sched_bore; +DECLARE_STATIC_KEY_TRUE(sched_bore_key); +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern u32 get_score_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_penalty_offset_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_penalty_scale_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 694ddcbf467d..7605c78d4663 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1466,6 +1466,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index e1359db5561e..31053e61972f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -81,3 +81,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/exit.c b/kernel/exit.c index ede3117fa7d4..3f3af470d5cb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -147,7 +147,11 @@ static void __unhash_process(struct release_task_post *post, struct task_struct detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_SCHED_BORE + list_del_rcu(&p->sibling); +#else /* !CONFIG_SCHED_BORE */ list_del_init(&p->sibling); +#endif /* CONFIG_SCHED_BORE */ __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); diff --git a/kernel/fork.c b/kernel/fork.c index 71d0f7acb376..1c02cca5207f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,6 +117,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2377,6 +2381,11 @@ __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! @@ -2450,7 +2459,11 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; +#ifdef CONFIG_SCHED_BORE + list_add_tail_rcu(&p->sibling, &p->real_parent->children); +#else /* !CONFIG_SCHED_BORE */ list_add_tail(&p->sibling, &p->real_parent->children); +#endif /* CONFIG_SCHED_BORE */ list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 1c2dd03f11ec..de57e2d543c9 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index b1f1a367034f..f95a7b3d5a32 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 000000000000..11e79d1653ae --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,470 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +DEFINE_STATIC_KEY_TRUE(sched_bore_key); +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static u32 __read_mostly scaled_tolerance; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_SAMPLE_LIMIT 63 +#define BURST_CACHE_SCAN_LIMIT (BURST_CACHE_SAMPLE_LIMIT * 2) + +static u32 bore_reciprocal_lut[BURST_CACHE_SAMPLE_LIMIT + 1]; + +DEFINE_STATIC_KEY_TRUE(sched_burst_inherit_key); +DEFINE_STATIC_KEY_TRUE(sched_burst_ancestor_key); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (unlikely(!v)) return 0; + int clz = __builtin_clzll(v); + int exponent = 64 - clz; + u32 mantissa = (u32)((v << clz) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8); + return greed * sched_burst_penalty_scale >> 10; +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + u32 is_growing = (new > old); + u32 increment = (new - old) * is_growing; + u32 shift = sched_burst_smoothness; + u32 smoothed = old + ((increment + (1U << shift) - 1) >> shift); + return (new & ~(-is_growing)) | (smoothed & (-is_growing)); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +static inline u32 adj_score(u16 penalty) { + s32 diff = (s32)(u32)penalty - (s32)scaled_tolerance; + return (u32)(diff & ~(diff >> 31)); +} + +static inline u32 scaled_penalty_score(u16 greed) { + u32 penalty = adj_score(greed); + s32 overflow = (s32)penalty - (s32)MAX_BURST_PENALTY; + penalty -= (overflow & ~(overflow >> 31)); + return penalty >> 8; +} + +u32 get_score_bore(struct task_struct *p) +{ return scaled_penalty_score(p->bore.penalty); } + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (static_branch_likely(&sched_bore_key)) + prio += scaled_penalty_score(p->bore.penalty); + prio &= ~(prio >> 31); + s32 pdiff = prio - maxval_prio; + prio -= (pdiff & ~(pdiff >> 31)); + return (u8)prio; +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + + s32 diff = (s32)ctx->curr_penalty - (s32)ctx->prev_penalty; + u16 max_val = ctx->curr_penalty - (diff & (diff >> 31)); + u32 is_kthread = !!(p->flags & PF_KTHREAD); + ctx->penalty = max_val & -(s32)(!is_kthread); + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry_rcu(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + struct bore_bc bc_val = { .value = READ_ONCE(bc->value) }; + u64 timestamp = (u64)bc_val.timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > (u64)sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = (count == 1) ? total : + (u32)(((u64)total * bore_reciprocal_lut[count]) >> 32); + + struct bore_bc new_bc = { + .penalty = min(max(average, adj_score(p->bore.penalty)) + + scaled_tolerance, (u32)MAX_BURST_PENALTY + scaled_tolerance), + .timestamp = now >> BORE_BC_TIMESTAMP_SHIFT + }; + WRITE_ONCE(bc->value, new_bc.value); +} + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct bore_bc bc_val; + + if (clone_flags & CLONE_PARENT) + parent = rcu_dereference(parent->real_parent); + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0, scan_count = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += adj_score(child->bore.penalty); + } + + update_burst_cache(bc, parent, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct bore_bc bc_val; + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = rcu_dereference(ancestor->real_parent); + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = rcu_dereference(ancestor->real_parent)) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0, scan_count = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += adj_score(descendant->bore.penalty); + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct bore_bc bc_val; + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += adj_score(sibling->bore.penalty); + } + + update_burst_cache(bc, leader, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!static_branch_likely(&sched_bore_key) || !task_is_bore_eligible(p)) return; + + rcu_read_lock(); + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty; + if (clone_flags & CLONE_THREAD) + inherited_penalty = inherit_from_thread_group(parent, now); + else if (static_branch_likely(&sched_burst_inherit_key)) + inherited_penalty = static_branch_likely(&sched_burst_ancestor_key)? + inherit_from_ancestor_hub(parent, clone_flags, now): + inherit_from_parent(parent, clone_flags, now); + else + inherited_penalty = 0; + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); + rcu_read_unlock(); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + static_branch_enable(&sched_burst_inherit_key); + static_branch_disable(&sched_burst_ancestor_key); + break; + case 2: + static_branch_enable(&sched_burst_inherit_key); + static_branch_enable(&sched_burst_ancestor_key); + break; + default: + static_branch_disable(&sched_burst_inherit_key); + break; + } +} + +static inline void update_scaled_tolerance(void) { + scaled_tolerance = + (u32)(sched_burst_penalty_offset << 8) * sched_burst_penalty_scale >> 10; +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + for (int i = 1; i <= BURST_CACHE_SAMPLE_LIMIT; i++) + bore_reciprocal_lut[i] = (u32)div64_u64(0xffffffffULL + i, i); + + reset_task_bore(&init_task); + update_inherit_type(); + update_scaled_tolerance(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + if (sched_bore) + static_branch_enable(&sched_bore_key); + else + static_branch_disable(&sched_bore_key); + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_penalty_offset_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_scaled_tolerance(); + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_penalty_scale_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_scaled_tolerance(); + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_penalty_offset_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = sched_burst_penalty_scale_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 115bd9ce93e2..bb17ecbd3102 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,6 +100,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1440,7 +1444,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8600,6 +8608,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..d5c5eaef7e79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -10,6 +10,10 @@ #include #include "sched.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * This allows printing both to /sys/kernel/debug/sched/debug and * to the console @@ -169,6 +173,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -208,6 +259,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -602,12 +654,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -852,6 +911,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", get_score_bore(p)); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1329,6 +1391,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + __PS("bore.score", get_score_bore(p)); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 993e996f40e0..0a6747e96e1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,28 +71,32 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice + * (default min_base_slice = 2000000 constant, units: nanoseconds) + * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds */ -#ifdef CONFIG_CACHY -unsigned int sysctl_sched_base_slice = 400000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; -#else +#ifdef CONFIG_SCHED_BORE +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -#endif /* CONFIG_CACHY */ +#endif /* CONFIG_SCHED_BORE */ -#ifdef CONFIG_CACHY -__read_mostly unsigned int sysctl_sched_migration_cost = 400000UL; -#else __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; -#endif static int __init setup_sched_thermal_decay_shift(char *str) { @@ -198,6 +206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -228,6 +243,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -966,7 +982,11 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; +#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; if (sched_feat(RUN_TO_PARITY)) @@ -1044,6 +1064,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!static_branch_likely(&sched_bore_key) || + !entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1105,6 +1130,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1116,6 +1142,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1315,6 +1342,11 @@ static void update_curr(struct cfs_rq *cfs_rq) resched = update_deadline(cfs_rq, curr); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3851,7 +3883,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5172,12 +5204,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5262,7 +5293,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key) && + entity_is_task(se) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5271,6 +5313,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5281,7 +5326,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5439,6 +5484,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key) && sched_feat(DELAY_ZERO)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6915,7 +6964,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6928,13 +6977,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -6974,7 +7032,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -6992,7 +7050,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7199,6 +7257,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + update_curr(cfs_rq); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -8919,6 +8986,16 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f goto pick; } +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key) && + sched_feat(PREEMPT_SHORT_BORE) && + entity_is_task(pse) && entity_is_task(se) && + task_of(pse)->bore.penalty < task_of(se)->bore.penalty) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto preempt; + } +#endif /* CONFIG_SCHED_BORE */ + /* * Ignore wakee preemption on WF_FORK as it is less likely that * there is shared data as exec often follow fork. Do not @@ -9130,16 +9207,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13586,6 +13672,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..1930f60c4295 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -23,6 +23,13 @@ SCHED_FEAT(RUN_TO_PARITY, true) * current. */ SCHED_FEAT(PREEMPT_SHORT, true) +/* + * Allow preemption of tasks with a higher BORE penalty by tasks with a lower + * penalty, using the PREEMPT_SHORT mechanism. + */ +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(PREEMPT_SHORT_BORE, true) +#endif /* CONFIG_SCHED_BORE */ /* * Prefer to schedule the task we woke last (assuming it failed diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 01b6beb3753a..c4e8d9be8c8f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2212,7 +2212,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -3006,7 +3010,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -- 2.53.0