From 5aa546ec52211d580884e3ae10b2f964f1d8c9a6 Mon Sep 17 00:00:00 2001 From: Eric Naim Date: Mon, 15 Sep 2025 09:15:33 +0800 Subject: [PATCH] bore-cachy Signed-off-by: Eric Naim --- include/linux/sched.h | 29 +++ include/linux/sched/bore.h | 39 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/fork.c | 8 + kernel/futex/waitwake.c | 11 ++ kernel/sched/Makefile | 1 + kernel/sched/bore.c | 393 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++ kernel/sched/debug.c | 61 ++++++ kernel/sched/fair.c | 191 +++++++++++------- kernel/sched/features.h | 3 + kernel/sched/sched.h | 9 + 13 files changed, 724 insertions(+), 67 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b833350..86b5c8c2ba85 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -815,6 +815,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -873,6 +899,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 000000000000..646291bb4706 --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito 
Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.5" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 14b467ccea51..b4653fcb07dc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1430,6 +1430,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. 
+ Such tasks usually include window compositors, widget backends, + terminal emulators, video playback, games and so on. + At a small cost to scheduling fairness, it may improve + responsiveness, especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index e1359db5561e..31053e61972f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -81,3 +81,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUND_UP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index 41d3153248a0..229a673a6d94 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -120,6 +120,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2327,6 +2331,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec2..6484ad583f3b 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcd..b688084bcecc 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 000000000000..c9e76eda35c8 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,393 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused 
maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + 
+ u8 prev_prio = effective_prio_bore(p); + u32 penalty = 0; + + if (!(p->flags & PF_KTHREAD)) { + u32 curr_penalty = ctx->curr_penalty; + penalty = ctx->prev_penalty; + if (penalty < curr_penalty) + penalty = curr_penalty; + } + ctx->penalty = penalty; + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << 
BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + 
update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? + inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *proc, *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process_thread(proc, task) { /* every thread, not only group leaders */ + if (!task_is_bore_eligible(task)) continue; + rq = 
task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + 
.mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..27901e63cf87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -98,6 +98,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1429,7 +1433,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8685,6 +8693,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..751df396d94b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + 
sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%u\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,6 +261,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -500,12 +548,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, 
&sysctl_sched_nr_migrate); @@ -747,6 +802,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1224,6 +1282,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226a96cd2536..0295537da1a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,28 +71,32 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice + * (default min_base_slice = 2000000 constant, units: nanoseconds) + * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds */ -#ifdef CONFIG_CACHY 
-unsigned int sysctl_sched_base_slice = 350000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 350000ULL; -#else +#ifdef CONFIG_SCHED_BORE +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -#endif /* CONFIG_CACHY */ +#endif /* CONFIG_SCHED_BORE */ -#ifdef CONFIG_CACHY -__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; -#else __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; -#endif static int __init setup_sched_thermal_decay_shift(char *str) { @@ -131,12 +139,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -#ifdef CONFIG_CACHY -static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ @@ -202,6 +206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -232,6 +243,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -711,6 +723,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ se->vlag = clamp(vlag, -limit, limit); } @@ -904,10 +919,17 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; + bool run_to_parity = likely(sched_bore) ? 
+ sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); +#else /* !CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; + bool run_to_parity = sched_feat(RUN_TO_PARITY); +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; - if (sched_feat(RUN_TO_PARITY)) + if (run_to_parity) slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); @@ -972,6 +994,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore)) /* BORE futex waiters skip this early return */ +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1033,6 +1060,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1044,6 +1072,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1242,6 +1271,11 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3780,7 +3814,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5140,12 +5174,11 @@ 
void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5230,7 +5263,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5239,6 +5283,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5249,7 +5296,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5413,6 +5460,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + 
update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6798,7 +6849,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6811,13 +6862,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -6854,7 +6914,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -6872,7 +6932,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7090,6 +7150,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) /* p is running: account its runtime before the burst restarts */ + update_curr(cfs_rq); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -7457,9 +7526,14 @@ static inline int sched_balance_find_dst_cpu(struct 
sched_domain *sd, struct tas return new_cpu; } +static inline bool is_cpu_idle(int cpu) +{ + return available_idle_cpu(cpu) || sched_idle_cpu(cpu); +} + static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + if (is_cpu_idle(cpu) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; @@ -7552,30 +7626,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { - if (cpu == target) - continue; - /* - * Check if the CPU is in the LLC scheduling domain of @target. - * Due to isolcpus, there is no guarantee that all the siblings are in the domain. - */ - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - -#else /* !CONFIG_SCHED_SMT: */ +#else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) { @@ -7591,11 +7642,6 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - #endif /* !CONFIG_SCHED_SMT */ /* @@ -7690,7 +7736,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!is_cpu_idle(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7761,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if 
(is_core_idle(target) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7769,7 +7815,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + is_core_idle(prev) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -7801,7 +7847,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + is_core_idle(recent_used_cpu) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -7837,16 +7883,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - if (sched_smt_active()) { + if (sched_smt_active()) has_idle_core = test_idle_cores(target); - if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); - if ((unsigned int)i < nr_cpumask_bits) - return i; - } - } - i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -8738,7 +8777,13 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; +#ifdef CONFIG_SCHED_BORE + bool run_to_parity = likely(sched_bore) ? 
+ sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && do_preempt_short) +#else /* CONFIG_SCHED_BORE */ if (sched_feat(RUN_TO_PARITY) && do_preempt_short) +#endif /* CONFIG_SCHED_BORE */ update_protect_slice(cfs_rq, se); return; @@ -8917,16 +8962,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13174,6 +13228,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..abadc5ca74e2 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. */ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4ee277cb92b9..162cf640186b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2102,7 +2102,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2778,7 +2782,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -- 2.51.0