From 56b9692daf3433f7bae6663c9f491a72e52a3530 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Sat, 28 Feb 2026 12:09:18 +0900 Subject: [PATCH] 6.18.3-nap-v0.1.2 --- drivers/cpuidle/Kconfig | 17 + drivers/cpuidle/governors/Makefile | 1 + drivers/cpuidle/governors/nap/Makefile | 32 + drivers/cpuidle/governors/nap/nap.c | 621 ++++++++++++++++++ drivers/cpuidle/governors/nap/nap.h | 258 ++++++++ drivers/cpuidle/governors/nap/nap_fpu.c | 449 +++++++++++++ drivers/cpuidle/governors/nap/nap_nn_avx2.c | 162 +++++ drivers/cpuidle/governors/nap/nap_nn_avx512.c | 197 ++++++ drivers/cpuidle/governors/nap/nap_nn_sse2.c | 169 +++++ 9 files changed, 1906 insertions(+) create mode 100644 drivers/cpuidle/governors/nap/Makefile create mode 100644 drivers/cpuidle/governors/nap/nap.c create mode 100644 drivers/cpuidle/governors/nap/nap.h create mode 100644 drivers/cpuidle/governors/nap/nap_fpu.c create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx2.c create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx512.c create mode 100644 drivers/cpuidle/governors/nap/nap_nn_sse2.c diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index cac5997dca..9b6c50f0d8 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -44,6 +44,23 @@ config CPU_IDLE_GOV_HALTPOLL Some virtualized workloads benefit from using it. +config CPU_IDLE_GOV_NAP + bool "Neural Adaptive Predictor (NAP) governor" + depends on X86_64 + default y + help + A machine-learning-based cpuidle governor that uses a small + neural network (MLP 16→16→10) to predict the optimal idle + state. Weights are initialized from hardware idle-state + parameters and refined via online learning (deferred + backpropagation with SGD). Requires SSE2 at minimum; + AVX2/AVX-512 are used when available. + + This is experimental. Select via cpuidle.governor=nap on + the kernel command line. + + If unsure, say Y. 
+ config DT_IDLE_STATES bool diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile index 63abb5393a..ae688891c0 100644 --- a/drivers/cpuidle/governors/Makefile +++ b/drivers/cpuidle/governors/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o +obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/ diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile new file mode 100644 index 0000000000..325f63584c --- /dev/null +++ b/drivers/cpuidle/governors/nap/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the NAP cpuidle governor +# + +obj-$(CONFIG_CPU_IDLE_GOV_NAP) += cpuidle_gov_nap.o + +cpuidle_gov_nap-y := nap.o nap_fpu.o nap_nn_sse2.o nap_nn_avx2.o nap_nn_avx512.o + +# Kernel builds with -mno-sse -mno-sse2 -mno-avx -msoft-float -mno-80387 +# -mno-fp-ret-in-387. FPU/SIMD-using files need these removed and ISA +# flags explicitly added. +# +# CRITICAL: nap.o is intentionally compiled with NORMAL kernel flags +# (no FPU/SSE). All floating-point code lives in nap_fpu.o and the +# nap_nn_*.o files. This ensures the compiler cannot emit SSE instructions +# in governor callbacks (nap_select, nap_reflect, etc.), which would +# silently corrupt userspace FPU register state. +# +# Do NOT add CFLAGS_REMOVE/CFLAGS for nap.o — it must stay FPU-free. 
+FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \ + -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387 + +CFLAGS_REMOVE_nap_fpu.o += $(FPU_KILL_FLAGS) +CFLAGS_REMOVE_nap_nn_sse2.o += $(FPU_KILL_FLAGS) +CFLAGS_REMOVE_nap_nn_avx2.o += $(FPU_KILL_FLAGS) +CFLAGS_REMOVE_nap_nn_avx512.o += $(FPU_KILL_FLAGS) + +CFLAGS_nap_fpu.o += $(CC_FLAGS_FPU) +CFLAGS_nap_nn_sse2.o += $(CC_FLAGS_FPU) +CFLAGS_nap_nn_avx2.o += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma +CFLAGS_nap_nn_avx512.o += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma -mavx512f diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c new file mode 100644 index 0000000000..7c28dc9704 --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap.c @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nap.c — Neural Adaptive Predictor cpuidle governor + * + * A machine-learning-based cpuidle governor that uses a small MLP (16→16→10) + * to predict the optimal idle state. Weights are Xavier-initialized at boot + * with exit-latency-aware output biases, then refined via online learning + * (deferred backpropagation with SGD). + * + * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel + * compilation). All floating-point and SIMD code lives in nap_fpu.c and + * nap_nn_{sse2,avx2,avx512}.c, which are compiled with CC_FLAGS_FPU. + * This separation ensures the compiler cannot emit SSE instructions in + * governor callbacks (nap_select, nap_reflect, etc.), which would corrupt + * userspace FPU register state. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nap.h" + +#include "../gov.h" + +/************************************************************** + * Version Information: + */ + +#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor" +#define CPUIDLE_NAP_AUTHOR "Masahito Suzuki" + +#define CPUIDLE_NAP_VERSION "0.1.2" + +/* Governor defaults */ +#define NAP_DEFAULT_LR_MILLTHS 1 /* 0.001 = 1 millths */ +#define NAP_DEFAULT_INTERVAL 16 /* learn every 16 reflects */ +#define NAP_DEFAULT_CLAMP_MILLTHS 1000 /* 1.0 = 1000 millths */ +#define NAP_DEFAULT_WARMUP 64 /* min learns before convergence check */ +#define NAP_DEFAULT_CONVERGE 768 /* 75% accuracy (x1024 scale) */ + +/* ================================================================ + * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c) + * ================================================================ */ + +DEFINE_STATIC_KEY_FALSE(nap_use_avx512); +DEFINE_STATIC_KEY_FALSE(nap_use_avx2); + +static void __init nap_detect_simd(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX512F)) { + static_branch_enable(&nap_use_avx512); + pr_info("nap: using AVX-512F\n"); + } else if (boot_cpu_has(X86_FEATURE_FMA) && + boot_cpu_has(X86_FEATURE_AVX2)) { + static_branch_enable(&nap_use_avx2); + pr_info("nap: using AVX2+FMA\n"); + } else { + pr_info("nap: using SSE2\n"); + } +} + +/* ================================================================ + * Per-CPU data + * ================================================================ */ + +DEFINE_PER_CPU(struct nap_cpu_data, nap_data); +static struct cpuidle_driver *nap_cached_drv; + +/* ================================================================ + * Reflect-time updates (integer-only, no FPU needed) + * ================================================================ */ + +static void nap_history_update(struct nap_cpu_data *d, u64 measured_ns) +{ + d->history[d->hist_idx] = 
measured_ns; + d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE; + if (d->hist_count < NAP_HISTORY_SIZE) + d->hist_count++; + + d->total_count++; + if (measured_ns < NAP_SHORT_THRESH_NS) + d->short_count++; +} + +static void nap_update_hit_intercept(struct nap_cpu_data *d, + struct cpuidle_driver *drv, + struct cpuidle_device *dev, + int selected_idx, u64 measured_ns) +{ + d->total_usage++; + + if (selected_idx + 1 < drv->state_count && + measured_ns > drv->states[selected_idx + 1].target_residency_ns) + d->total_above++; + + d->intercept_window++; + if (measured_ns < (u64)d->last_predicted_ns) + d->intercept_recent++; + + if (d->intercept_window >= 1024) { + d->intercept_window >>= 1; + d->intercept_recent >>= 1; + } +} + +static void nap_update_external_signals(struct nap_cpu_data *d, int cpu) +{ + u64 cur_irq = kstat_cpu_irqs_sum(cpu); + + d->prev_irq_count = cur_irq; + d->prev_idle_exit = ktime_get(); +} + +/* ================================================================ + * Governor callbacks + * ================================================================ */ + +static int nap_fallback_heuristic(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + int i; + + for (i = drv->state_count - 1; i > 0; i--) { + if (dev->states_usage[i].disable) + continue; + if (drv->states[i].exit_latency_ns <= latency_req) + return i; + } + return 0; +} + +static int nap_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + bool *stop_tick) +{ + struct nap_cpu_data *d = this_cpu_ptr(&nap_data); + int idx; + + if (unlikely(drv->state_count <= 1)) + return 0; + + if (likely(may_use_simd())) { + kernel_fpu_begin(); + idx = nap_fpu_select(drv, dev, d); + kernel_fpu_end(); + + if (idx < 0) + idx = nap_fallback_heuristic(drv, dev); + } else { + idx = nap_fallback_heuristic(drv, dev); + } + + *stop_tick = (drv->states[idx].target_residency_ns > + RESIDENCY_THRESHOLD_NS); + + d->last_selected_idx = 
idx; + d->stats.total_selects++; + + return idx; +} + +static void nap_reflect(struct cpuidle_device *dev, int index) +{ + struct nap_cpu_data *d = this_cpu_ptr(&nap_data); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + u64 measured_ns = dev->last_residency_ns; + + if (unlikely(!drv)) + return; + + nap_history_update(d, measured_ns); + nap_update_hit_intercept(d, drv, dev, index, measured_ns); + + d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns; + nap_update_external_signals(d, dev->cpu); + + if (d->learning_mode && ++d->learn_counter >= d->learn_interval) { + d->learn_counter = 0; + d->learn_actual_ns = measured_ns; + d->needs_learn = true; + } + + d->stats.total_residency_ns += measured_ns; + if (index > 0 && measured_ns < drv->states[index].target_residency_ns) + d->stats.undershoot_count++; +} + +static int nap_enable(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct nap_cpu_data *d = per_cpu_ptr(&nap_data, dev->cpu); + + memset(d, 0, sizeof(*d)); + + /* + * Defer weight initialization to the first nap_select() FPU path + * via reset_pending. nap_enable() is called from cpuidle core + * (cpuidle_enable_device) which may run on a different CPU than + * dev->cpu during governor switch. Deferring ensures FPU init + * happens on the correct CPU in its own idle context. 
+ */ + WRITE_ONCE(nap_cached_drv, drv); + d->learning_mode = true; + d->learning_rate_millths = NAP_DEFAULT_LR_MILLTHS; + d->learn_interval = NAP_DEFAULT_INTERVAL; + d->max_grad_norm_millths = NAP_DEFAULT_CLAMP_MILLTHS; + d->warmup_threshold = NAP_DEFAULT_WARMUP; + d->convergence_thresh = NAP_DEFAULT_CONVERGE; + d->reset_pending = true; + + return 0; +} + +static void nap_disable(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + WRITE_ONCE(nap_cached_drv, NULL); +} + +/* ================================================================ + * sysfs interface (/sys/devices/system/cpu/cpuidle/nap/) + * ================================================================ */ + +static ssize_t stats_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu, len = 0; + u64 total_sel = 0, total_res = 0, total_under = 0, total_learn = 0; + int converged_cpus = 0, online_cpus = 0; + + for_each_online_cpu(cpu) { + struct nap_cpu_data *d = &per_cpu(nap_data, cpu); + + total_sel += d->stats.total_selects; + total_res += d->stats.total_residency_ns; + total_under += d->stats.undershoot_count; + total_learn += d->stats.learn_count; + if (d->converged) + converged_cpus++; + online_cpus++; + } + + len += sysfs_emit_at(buf, len, "total_selects: %llu\n", total_sel); + len += sysfs_emit_at(buf, len, "total_residency_ms: %llu\n", + div_u64(total_res, NSEC_PER_MSEC)); + len += sysfs_emit_at(buf, len, "undershoot_count: %llu\n", total_under); + len += sysfs_emit_at(buf, len, "undershoot_rate_permil: %llu\n", + total_sel ? 
div_u64(total_under * 1000, total_sel) : 0); + len += sysfs_emit_at(buf, len, "learn_count: %llu\n", total_learn); + len += sysfs_emit_at(buf, len, "converged_cpus: %d/%d\n", + converged_cpus, online_cpus); + return len; +} + +static ssize_t learning_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu, learning = 0, converged = 0, total = 0; + + for_each_online_cpu(cpu) { + struct nap_cpu_data *d = &per_cpu(nap_data, cpu); + + if (d->learning_mode) + learning++; + if (d->converged) + converged++; + total++; + } + + if (total == 0) + return sysfs_emit(buf, "off\n"); + + if (learning == 0) + return sysfs_emit(buf, "off\n"); + else if (converged == total) + return sysfs_emit(buf, "online\n"); + else + return sysfs_emit(buf, "warmup (%d/%d converged)\n", + converged, total); +} + +static ssize_t learning_mode_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + bool mode; + int cpu; + + if (sysfs_streq(buf, "online")) + mode = true; + else if (sysfs_streq(buf, "off")) + mode = false; + else + return -EINVAL; + + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).learning_mode = mode; + + return count; +} + +static ssize_t learning_rate_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu; + + cpu = cpumask_first(cpu_online_mask); + if (cpu >= nr_cpu_ids) + return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%u\n", + per_cpu(nap_data, cpu).learning_rate_millths); +} + +static ssize_t learning_rate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int val; + int cpu; + + if (kstrtouint(buf, 10, &val) || val == 0 || val > 100) + return -EINVAL; + + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).learning_rate_millths = val; + + return count; +} + +static ssize_t learn_interval_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu; + + cpu = cpumask_first(cpu_online_mask); + if 
(cpu >= nr_cpu_ids) + return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%d\n", + per_cpu(nap_data, cpu).learn_interval); +} + +static ssize_t learn_interval_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int val; + int cpu; + + if (kstrtouint(buf, 10, &val) || val == 0 || val > 10000) + return -EINVAL; + + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).learn_interval = val; + + return count; +} + +static ssize_t reset_weights_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int cpu; + + if (!READ_ONCE(nap_cached_drv)) + return -ENODEV; + + /* + * Set a per-CPU flag; each CPU will reinitialize its own weights + * inside nap_select() within its own kernel_fpu_begin/end context. + * This avoids cross-CPU data races on the weight arrays. + */ + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).reset_pending = true; + + pr_info("nap: weight reset scheduled for all CPUs\n"); + return count; +} + +static ssize_t reset_stats_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int cpu; + + for_each_online_cpu(cpu) + memset(&per_cpu(nap_data, cpu).stats, 0, + sizeof(struct nap_stats)); + + return count; +} + +static ssize_t warmup_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu; + + cpu = cpumask_first(cpu_online_mask); + if (cpu >= nr_cpu_ids) + return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%u\n", + per_cpu(nap_data, cpu).warmup_threshold); +} + +static ssize_t warmup_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int val; + int cpu; + + if (kstrtouint(buf, 10, &val) || val > 100000) + return -EINVAL; + + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).warmup_threshold = val; + + return count; +} + +static ssize_t convergence_thresh_show(struct kobject *kobj, + struct kobj_attribute *attr, 
char *buf) +{ + int cpu; + + cpu = cpumask_first(cpu_online_mask); + if (cpu >= nr_cpu_ids) + return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%u\n", + per_cpu(nap_data, cpu).convergence_thresh); +} + +static ssize_t convergence_thresh_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int val; + int cpu; + + if (kstrtouint(buf, 10, &val) || val > 1024) + return -EINVAL; + + for_each_online_cpu(cpu) + per_cpu(nap_data, cpu).convergence_thresh = val; + + return count; +} + +static ssize_t ema_accuracy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu; + unsigned int ema_min = 1024, ema_max = 0; + unsigned long ema_sum = 0; + int converged = 0, total = 0; + + for_each_online_cpu(cpu) { + struct nap_cpu_data *d = &per_cpu(nap_data, cpu); + + if (d->ema_accuracy < ema_min) + ema_min = d->ema_accuracy; + if (d->ema_accuracy > ema_max) + ema_max = d->ema_accuracy; + ema_sum += d->ema_accuracy; + if (d->converged) + converged++; + total++; + } + + if (total == 0) + return sysfs_emit(buf, "no cpus online\n"); + + return sysfs_emit(buf, "min/avg/max: %u/%lu/%u (x1024)\nconverged: %d/%d\n", + ema_min, ema_sum / total, ema_max, + converged, total); +} + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", CPUIDLE_NAP_VERSION); +} + +static ssize_t simd_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (static_branch_unlikely(&nap_use_avx512)) + return sysfs_emit(buf, "avx512\n"); + else if (static_branch_unlikely(&nap_use_avx2)) + return sysfs_emit(buf, "avx2\n"); + else + return sysfs_emit(buf, "sse2\n"); +} + +static ssize_t converged_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int cpu, converged = 0, total = 0; + + for_each_online_cpu(cpu) { + struct nap_cpu_data *d = &per_cpu(nap_data, cpu); + + if (d->converged) + converged++; + total++; + } + 
return sysfs_emit(buf, "%d/%d\n", converged, total); +} + +static struct kobj_attribute version_attr = __ATTR_RO(version); +static struct kobj_attribute simd_attr = __ATTR_RO(simd); +static struct kobj_attribute converged_attr = __ATTR_RO(converged); +static struct kobj_attribute stats_attr = __ATTR_RO(stats); +static struct kobj_attribute learning_mode_attr = __ATTR_RW(learning_mode); +static struct kobj_attribute learning_rate_attr = __ATTR_RW(learning_rate); +static struct kobj_attribute learn_interval_attr = __ATTR_RW(learn_interval); +static struct kobj_attribute warmup_threshold_attr = __ATTR_RW(warmup_threshold); +static struct kobj_attribute convergence_thresh_attr = __ATTR_RW(convergence_thresh); +static struct kobj_attribute ema_accuracy_attr = __ATTR_RO(ema_accuracy); +static struct kobj_attribute reset_weights_attr = __ATTR_WO(reset_weights); +static struct kobj_attribute reset_stats_attr = __ATTR_WO(reset_stats); + +static struct attribute *nap_attrs[] = { + &version_attr.attr, + &simd_attr.attr, + &converged_attr.attr, + &stats_attr.attr, + &learning_mode_attr.attr, + &learning_rate_attr.attr, + &learn_interval_attr.attr, + &warmup_threshold_attr.attr, + &convergence_thresh_attr.attr, + &ema_accuracy_attr.attr, + &reset_weights_attr.attr, + &reset_stats_attr.attr, + NULL, +}; + +static const struct attribute_group nap_attr_group = { + .attrs = nap_attrs, +}; + +static struct kobject *cpuidle_kobj; + +int nap_sysfs_init(void) +{ + struct device *dev_root; + int ret; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (!dev_root) + return -ENODEV; + + cpuidle_kobj = kobject_create_and_add("nap", &dev_root->kobj); + put_device(dev_root); + if (!cpuidle_kobj) + return -ENOMEM; + + ret = sysfs_create_group(cpuidle_kobj, &nap_attr_group); + if (ret) { + kobject_put(cpuidle_kobj); + cpuidle_kobj = NULL; + } + return ret; +} + +void nap_sysfs_exit(void) +{ + if (cpuidle_kobj) { + sysfs_remove_group(cpuidle_kobj, &nap_attr_group); + kobject_put(cpuidle_kobj); + 
cpuidle_kobj = NULL; + } +} + +/* ================================================================ + * Governor registration + * ================================================================ */ + +static struct cpuidle_governor nap_governor = { + .name = "nap", + .rating = 26, + .enable = nap_enable, + .disable = nap_disable, + .select = nap_select, + .reflect = nap_reflect, +}; + +static int __init nap_init(void) +{ + int ret; + + nap_detect_simd(); + + ret = nap_sysfs_init(); + if (ret) + pr_warn("nap: sysfs init failed: %d (continuing without sysfs)\n", ret); + + ret = cpuidle_register_governor(&nap_governor); + if (ret) { + pr_err("nap: register_governor failed: %d\n", ret); + nap_sysfs_exit(); + return ret; + } + + pr_info("%s v%s by %s registered (rating=%u)\n", + CPUIDLE_NAP_PROGNAME, CPUIDLE_NAP_VERSION, + CPUIDLE_NAP_AUTHOR, nap_governor.rating); + return 0; +} +postcore_initcall(nap_init); diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h new file mode 100644 index 0000000000..b2871942df --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap.h @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef NAP_H +#define NAP_H + +#include +#include +#include + +/* ================================================================ + * Neural network dimensions + * ================================================================ */ + +#define NAP_INPUT_SIZE 16 +#define NAP_HIDDEN_SIZE 16 +#define NAP_OUTPUT_SIZE CPUIDLE_STATE_MAX + +/* + * The SIMD forward/backward implementations (SSE2, AVX2, AVX-512) handle + * the output layer as 2×xmm (8 floats) + 2 scalars, i.e. exactly 10. + * If CPUIDLE_STATE_MAX ever changes, those routines must be updated. + */ +static_assert(NAP_OUTPUT_SIZE == 10, + "NAP SIMD kernels are hard-coded for 10 output neurons"); + +/* + * Neural network weight structure for a 16→16→10 MLP. + * + * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i. 
+ * This layout enables efficient column-wise matrix-vector products where + * each input broadcasts across all output neurons via SIMD FMA. + * + * __aligned(64) ensures both AVX2 (vmovaps, 32-byte) and AVX-512 + * (vmovaps, 64-byte) aligned loads work correctly. + */ +struct nap_weights { + /* Hidden layer: input[16] → hidden[16] */ + float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE]; /* 256 params */ + float b_h1[NAP_HIDDEN_SIZE]; /* 16 params */ + /* Output layer: hidden[16] → output[10] */ + float w_out[NAP_HIDDEN_SIZE][NAP_OUTPUT_SIZE]; /* 160 params */ + float b_out[NAP_OUTPUT_SIZE]; /* 10 params */ +} __aligned(64); +/* Total: 442 parameters = 1,768 bytes */ + +/* ISA-specific forward pass implementations */ +void nap_nn_forward_sse2(const float *input, float *output, + float *hidden_save, const struct nap_weights *w); +void nap_nn_forward_avx2(const float *input, float *output, + float *hidden_save, const struct nap_weights *w); +void nap_nn_forward_avx512(const float *input, float *output, + float *hidden_save, const struct nap_weights *w); + +/* ISA-specific online learning (backpropagation) */ +struct nap_cpu_data; +void nap_nn_learn_sse2(struct nap_cpu_data *d, int ideal); +void nap_nn_learn_avx2(struct nap_cpu_data *d, int ideal); +void nap_nn_learn_avx512(struct nap_cpu_data *d, int ideal); + +/* Static keys for ISA dispatch (defined in nap.c) */ +DECLARE_STATIC_KEY_FALSE(nap_use_avx512); +DECLARE_STATIC_KEY_FALSE(nap_use_avx2); + +/* ================================================================ + * SIMD type definitions and helpers (GCC vector extensions) + * + * Only available when compiled with FPU/SSE flags (nap_fpu.c, + * nap_nn_*.c). nap.c is compiled without FPU flags and must + * not see these definitions. + * + * is a userspace header and cannot be used in kernel. + * We use __attribute__((__vector_size__())) and __builtin_ia32_*. 
+ * ================================================================ */ + +#ifdef __SSE2__ + +typedef float v4sf __attribute__((__vector_size__(16))); /* xmm: 4×float */ +typedef int v4si __attribute__((__vector_size__(16))); /* xmm: 4×int32 */ +typedef float v8sf __attribute__((__vector_size__(32))); /* ymm: 8×float */ +typedef float v16sf __attribute__((__vector_size__(64))); /* zmm: 16×float */ + +/* Broadcast helpers */ +#define V4SF_SET1(x) ((v4sf){ (x), (x), (x), (x) }) +#define V4SI_SET1(x) ((v4si){ (x), (x), (x), (x) }) +#define V8SF_SET1(x) ((v8sf){ (x),(x),(x),(x),(x),(x),(x),(x) }) +#define V8SF_ZERO V8SF_SET1(0.0f) +#define V16SF_SET1(x) ((v16sf){ (x),(x),(x),(x),(x),(x),(x),(x), \ + (x),(x),(x),(x),(x),(x),(x),(x) }) + +/* Unaligned load/store helpers */ +static inline v4sf v4sf_loadu(const float *p) +{ + v4sf result; + __builtin_memcpy(&result, p, sizeof(result)); + return result; +} + +static inline void v4sf_storeu(float *p, v4sf v) +{ + __builtin_memcpy(p, &v, sizeof(v)); +} + +#ifdef __AVX__ +static inline v8sf v8sf_loadu(const float *p) +{ + v8sf result; + __builtin_memcpy(&result, p, sizeof(result)); + return result; +} + +static inline void v8sf_storeu(float *p, v8sf v) +{ + __builtin_memcpy(p, &v, sizeof(v)); +} +#endif /* __AVX__ */ + +/* Scalar/vector clamp helpers */ +static inline float fclampf(float v, float lo, float hi) +{ + if (v < lo) return lo; + if (v > hi) return hi; + return v; +} + +static inline v4sf v4sf_clamp(v4sf v, v4sf lo, v4sf hi) +{ + return __builtin_ia32_maxps(__builtin_ia32_minps(v, hi), lo); +} + +/* Type punning: float ↔ int reinterpret (no instruction generated) */ +static inline v4si v4sf_as_v4si(v4sf v) +{ + union { v4sf f; v4si i; } u = { .f = v }; + return u.i; +} + +static inline v4sf v4si_as_v4sf(v4si v) +{ + union { v4si i; v4sf f; } u = { .i = v }; + return u.f; +} + +/* + * fast_log2f_sse() — Compute log2 of 4 floats simultaneously using SSE2 + * + * Cost: ~15 cycles for 4 values (~4 cycles per value) + */ 
+static inline v4sf fast_log2f_sse(v4sf x) +{ + const v4si mask_exp = V4SI_SET1(0xFF); + const v4si bias = V4SI_SET1(127); + const v4si mask_mant = V4SI_SET1(0x7FFFFF); + const v4si exp_bias = V4SI_SET1(127 << 23); + + v4si xi = v4sf_as_v4si(x); + v4si exp_i = (xi >> 23) & mask_exp; + exp_i = exp_i - bias; + v4sf e = __builtin_convertvector(exp_i, v4sf); + + v4si mant_i = (xi & mask_mant) | exp_bias; + v4sf m = v4si_as_v4sf(mant_i) - V4SF_SET1(1.0f); + + v4sf p; + p = m * V4SF_SET1(0.4808f); + p = V4SF_SET1(0.7213f) - p; + p = m * p; + p = V4SF_SET1(1.4425f) - p; + p = m * p; + + return e + p; +} + +#endif /* __SSE2__ */ + +/* ================================================================ + * Feature extraction + * ================================================================ */ + +#define NAP_HISTORY_SIZE 8 +#define NAP_NUM_FEATURES 16 +#define NAP_SHORT_THRESH_NS (100 * NSEC_PER_USEC) + +struct nap_stats { + u64 total_selects; + u64 total_residency_ns; + u64 undershoot_count; + u64 learn_count; +}; + +struct nap_cpu_data { + /* Ring buffer */ + u64 history[NAP_HISTORY_SIZE]; + float log_history[NAP_HISTORY_SIZE]; + int hist_idx; + int hist_count; + + /* Statistics tracking */ + u64 total_above; + u64 total_usage; + u64 intercept_recent; + u64 intercept_window; + u64 short_count; + u64 total_count; + + /* External signal tracking */ + u64 prev_irq_count; + ktime_t prev_idle_exit; + s64 last_predicted_ns; + s64 last_prediction_error; + + /* select/reflect handoff */ + int last_selected_idx; + float nn_output[NAP_OUTPUT_SIZE]; + /* + * hidden_out[] is written with aligned SIMD stores in + * nap_nn_forward_{sse2,avx2,avx512}(): + * SSE2: movaps (16-byte aligned) + * AVX2: vmovaps (32-byte aligned) + * AVX-512: vmovaps zmm (64-byte aligned) + * Without __aligned(64), the natural struct offset of this + * field (228) is only 4-byte aligned, causing #GP faults + * in the idle task → "Attempted to kill the idle task!" panic. 
+ */ + float hidden_out[NAP_HIDDEN_SIZE] __aligned(64); + float features_f32[NAP_NUM_FEATURES]; + + /* Deferred learning data */ + bool needs_learn; + u64 learn_actual_ns; + + /* Online learning */ + struct nap_weights weights; + bool learning_mode; + unsigned int learning_rate_millths; + unsigned int max_grad_norm_millths; + int learn_interval; + int learn_counter; + unsigned int warmup_threshold; /* min learns before convergence check */ + unsigned int convergence_thresh; /* EMA accuracy threshold (x1024) */ + unsigned int ema_accuracy; /* EMA of NN hit rate (x1024 = 100%) */ + bool converged; /* true after NN accuracy converges */ + bool reset_pending; /* set by sysfs, consumed by nap_select */ + + /* sysfs statistics */ + struct nap_stats stats; +}; + +DECLARE_PER_CPU(struct nap_cpu_data, nap_data); + +/* FPU entry point (nap_fpu.c) — call only within kernel_fpu_begin/end */ +int nap_fpu_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + struct nap_cpu_data *d); + +/* sysfs interface */ +int nap_sysfs_init(void); +void nap_sysfs_exit(void); + +#endif /* NAP_H */ diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c new file mode 100644 index 0000000000..b194f26a01 --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap_fpu.c @@ -0,0 +1,449 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor + * + * This file is compiled with FPU/SSE flags enabled (CC_FLAGS_FPU). + * ALL functions here MUST be called only from within + * kernel_fpu_begin()/kernel_fpu_end() blocks. + * + * Keeping FPU code in a separate translation unit ensures the compiler + * cannot emit SSE/x87 instructions in non-FPU code paths (nap.c), + * which would silently corrupt userspace FPU register state. 
 */

/*
 * NOTE(review): the #include targets below were lost in patch extraction
 * (angle-bracket contents stripped to bare "#include"). Restore the
 * original <linux/...> headers before this file can build.
 */
#include
#include
#include
#include
#include
#include
#include

#include "nap.h"

/* ================================================================
 * Float math helpers
 *
 * Everything in this file runs inside kernel_fpu_begin()/kernel_fpu_end()
 * (see the Makefile comment in HEAD), so SSE float math is legal here.
 * ================================================================ */

/* Scalar sqrtf via the SSE sqrtps instruction; lane 0 carries the value. */
static inline float sse_sqrtf(float x)
{
	v4sf v = { x, 0, 0, 0 };
	v4sf r = __builtin_ia32_sqrtps(v);
	return r[0];
}

static inline float float_min(float a, float b) { return a < b ? a : b; }
static inline float float_max(float a, float b) { return a > b ? a : b; }

/* Scalar log2 approximation (same algorithm as fast_log2f_sse):
 * split x into exponent e and mantissa m in [1,2), then evaluate a
 * small polynomial in (m - 1) for log2 of the mantissa.
 * Callers clamp inputs to >= 1.0f, so x > 0 always holds here. */
static inline float fast_log2f(float x)
{
	union { float f; u32 i; } u = { .f = x };
	int exp = (int)((u.i >> 23) & 0xFFu) - 127;
	float e = (float)exp;
	float m, p;

	/* Force the exponent field to 0 so u.f becomes the mantissa in [1,2). */
	u.i = (u.i & 0x7FFFFFu) | (127u << 23);
	m = u.f - 1.0f;

	p = m * 0.4808f;
	p = 0.7213f - p;
	p = m * p;
	p = 1.4425f - p;
	p = m * p;

	return e + p;
}

/* ================================================================
 * Deterministic PRNG for weight initialization (LCG)
 * ================================================================ */

/* Numerical Recipes LCG step; returns a float in [-1, 1)
 * (signed 32-bit state scaled by 2^-31). */
static inline float nap_prng_float(u32 *state)
{
	*state = *state * 1664525u + 1013904223u;
	return (float)(s32)*state * (1.0f / 2147483648.0f);
}

/* ================================================================
 * ISA dispatch via static keys
 *
 * nap_use_avx512 / nap_use_avx2 are presumably flipped once at init
 * from CPU feature detection — declared in nap.h, not visible here.
 * ================================================================ */

static inline void nap_nn_forward(const float *input, float *output,
				  float *hidden_save,
				  const struct nap_weights *w)
{
	if (static_branch_unlikely(&nap_use_avx512))
		nap_nn_forward_avx512(input, output, hidden_save, w);
	else if (static_branch_unlikely(&nap_use_avx2))
		nap_nn_forward_avx2(input, output, hidden_save, w);
	else
		nap_nn_forward_sse2(input, output, hidden_save, w);
}

static inline void nap_nn_learn(struct nap_cpu_data *d, int ideal)
{
	if (static_branch_unlikely(&nap_use_avx512))
		nap_nn_learn_avx512(d, ideal);
	else if (static_branch_unlikely(&nap_use_avx2))
		nap_nn_learn_avx2(d, ideal);
	else
		nap_nn_learn_sse2(d, ideal);
}

/* ================================================================
 * Weight initialization
 *
 * Hidden layer: Xavier uniform init with fixed PRNG seed (deterministic).
 * Output biases: informed by per-state exit_latency_ns so that
 * shallow (low-latency) states are preferred initially.
 * The NN then learns to override these via online training.
 * ================================================================ */

#define NAP_PRNG_SEED 42u

static void nap_init_weights(struct nap_weights *w,
			     struct cpuidle_driver *drv)
{
	u32 rng = NAP_PRNG_SEED;
	float scale_h1, scale_out;
	int i, j;

	/* Xavier uniform: U(-sqrt(6/(fan_in+fan_out)), +sqrt(6/(...)))
	 * nap_prng_float() spans [-1, 1), so multiplying by the scale
	 * yields the full symmetric Xavier range directly. */
	scale_h1 = sse_sqrtf(6.0f / (float)(NAP_INPUT_SIZE + NAP_HIDDEN_SIZE));
	scale_out = sse_sqrtf(6.0f / (float)(NAP_HIDDEN_SIZE + NAP_OUTPUT_SIZE));

	/* Hidden layer weights */
	for (i = 0; i < NAP_INPUT_SIZE; i++)
		for (j = 0; j < NAP_HIDDEN_SIZE; j++)
			w->w_h1[i][j] = nap_prng_float(&rng) * scale_h1;

	/* Hidden biases: zero (standard) */
	memset(w->b_h1, 0, sizeof(w->b_h1));

	/* Output layer weights */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++)
		for (i = 0; i < NAP_OUTPUT_SIZE; i++)
			w->w_out[j][i] = nap_prng_float(&rng) * scale_out;

	/* Output biases: -0.1 * log2(exit_latency_ns) per state.
	 * Shallow states get ~0, deep states get ~-1.7.
	 * Unused states get -100 to ensure they're never selected.
	 */
	for (i = 0; i < NAP_OUTPUT_SIZE; i++) {
		if (drv && i < drv->state_count) {
			float lat = float_max(
				(float)drv->states[i].exit_latency_ns, 1.0f);
			w->b_out[i] = -fast_log2f(lat) * 0.1f;
		} else {
			w->b_out[i] = -100.0f;
		}
	}
}

/* ================================================================
 * Feature extraction helpers
 *
 * log_history[] is a ring of log2(residency) samples; hist_count is
 * how many entries are valid, hist_idx the next write slot.
 * ================================================================ */

/* Mean of the valid log-residency samples (0 when the ring is empty). */
static float logring_avg(const struct nap_cpu_data *d)
{
	int i, n = d->hist_count;
	float sum = 0.0f;

	if (n == 0)
		return 0.0f;
	for (i = 0; i < n; i++)
		sum += d->log_history[i];
	return sum / (float)n;
}

/* Minimum of the valid log-residency samples (0 when empty). */
static float logring_min(const struct nap_cpu_data *d)
{
	int i, n = d->hist_count;
	float m;

	if (n == 0)
		return 0.0f;
	m = d->log_history[0];
	for (i = 1; i < n; i++)
		m = float_min(m, d->log_history[i]);
	return m;
}

/* Maximum of the valid log-residency samples (0 when empty). */
static float logring_max(const struct nap_cpu_data *d)
{
	int i, n = d->hist_count;
	float m;

	if (n == 0)
		return 0.0f;
	m = d->log_history[0];
	for (i = 1; i < n; i++)
		m = float_max(m, d->log_history[i]);
	return m;
}

/* Population standard deviation of the ring (0 when fewer than 2 samples). */
static float logring_stdev(const struct nap_cpu_data *d)
{
	int i, n = d->hist_count;
	float avg, var_sum = 0.0f, diff;

	if (n < 2)
		return 0.0f;
	avg = logring_avg(d);
	for (i = 0; i < n; i++) {
		diff = d->log_history[i] - avg;
		var_sum += diff * diff;
	}
	return sse_sqrtf(var_sum / (float)n);
}

/* Normalized trend: (sum of 4 newest - sum of 4 oldest) / (4 * mean).
 * Positive when residencies are growing.
 * NOTE(review): indexing assumes NAP_HISTORY_SIZE == 8 so that the two
 * 4-sample windows tile the full ring starting at the oldest entry
 * (hist_idx is the next write slot) — confirm against nap.h. */
static float compute_trend(const struct nap_cpu_data *d)
{
	int i, base;
	float recent = 0.0f, older = 0.0f, avg;

	/* Only meaningful once the ring is full. */
	if (d->hist_count < NAP_HISTORY_SIZE)
		return 0.0f;

	base = d->hist_idx;
	for (i = 0; i < 4; i++) {
		older += d->log_history[(base + i) % NAP_HISTORY_SIZE];
		recent += d->log_history[(base + 4 + i) % NAP_HISTORY_SIZE];
	}

	avg = logring_avg(d);
	if (avg == 0.0f)
		return 0.0f;

	return (recent - older) / (4.0f * avg);
}

/* IRQs/second on this CPU since the last idle exit (0 if no time elapsed). */
static u64 compute_irq_rate(const struct nap_cpu_data *d)
{
	u64 cur_irq = kstat_cpu_irqs_sum(smp_processor_id());
	ktime_t now = ktime_get();
	u64 elapsed = ktime_to_ns(ktime_sub(now, d->prev_idle_exit));
	u64 delta_irq = cur_irq - d->prev_irq_count;

	if (elapsed == 0)
		return 0;
	return div_u64(delta_irq * NSEC_PER_SEC, elapsed);
}

/* Build the 16-float input vector for the MLP (groups A-D below) and
 * record the predicted sleep length for later error feedback. */
static void nap_extract_features(struct cpuidle_driver *drv,
				 struct cpuidle_device *dev,
				 float out[NAP_NUM_FEATURES])
{
	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
	ktime_t sleep_length, delta_tick;
	u64 busy_ns;
	s64 latency_req;
	float log_inputs[4] __attribute__((aligned(16)));
	float log_results[4] __attribute__((aligned(16)));

	sleep_length = tick_nohz_get_sleep_length(&delta_tick);

	busy_ns = ktime_to_ns(ktime_sub(ktime_get(), d->prev_idle_exit));

	log_inputs[0] = (float)ktime_to_ns(sleep_length);
	log_inputs[1] = (float)dev->last_residency_ns;
	log_inputs[2] = (float)busy_ns;
	log_inputs[3] = (float)compute_irq_rate(d);

	/* Clamp to >= 1 so log2 is defined and non-negative. */
	log_inputs[0] = float_max(log_inputs[0], 1.0f);
	log_inputs[1] = float_max(log_inputs[1], 1.0f);
	log_inputs[2] = float_max(log_inputs[2], 1.0f);
	log_inputs[3] = float_max(log_inputs[3], 1.0f);

	/* Vectorized log2 of all four raw inputs at once. */
	{
		v4sf log_in = *(const v4sf *)log_inputs;
		v4sf log_out = fast_log2f_sse(log_in);
		*(v4sf *)log_results = log_out;
	}

	/* Backfill the most recently written ring slot with the measured
	 * residency of the idle period that just ended.
	 * NOTE(review): this assumes the slot at hist_idx-1 was reserved
	 * for this sample elsewhere (ring advance not visible here). */
	{
		int prev = (d->hist_idx - 1 + NAP_HISTORY_SIZE) % NAP_HISTORY_SIZE;
		d->log_history[prev] = log_results[1];
	}

	/* Group A: Time prediction */
	out[0] = log_results[0];
	out[1] = log_results[1];
	out[2] = logring_avg(d);
	out[3] = logring_stdev(d);

	/* Group B: Pattern analysis */
	out[4] = logring_min(d);
	out[5] = logring_max(d);
	out[6] = compute_trend(d);
	out[7] = (d->total_count > 0)
		? (float)d->short_count / (float)d->total_count
		: 0.0f;

	/* Group C: State feedback */
	out[8] = (d->total_usage > 0)
		? (float)d->total_above / (float)d->total_usage
		: 0.0f;
	out[9] = (d->intercept_window > 0)
		? (float)d->intercept_recent / (float)d->intercept_window
		: 0.0f;
	/* Sign-preserving log2 scale: maps ±N µs → ±log2(N+1) */
	{
		float err_f = (float)(d->last_prediction_error / 1000);

		out[10] = (err_f >= 0.0f)
			? fast_log2f(err_f + 1.0f)
			: -fast_log2f(-err_f + 1.0f);
	}
	out[11] = log_results[2];

	/* Group D: External signals */
	out[12] = tick_nohz_tick_stopped() ? 1.0f : 0.0f;
	/* Clamp to 8 tasks then normalize to [0, 1] */
	out[13] = (float)min_t(unsigned int, nr_iowait_cpu(dev->cpu), 8) / 8.0f;

	latency_req = cpuidle_governor_latency_req(dev->cpu);
	{
		u64 deepest_lat =
			drv->states[drv->state_count - 1].exit_latency_ns;
		/* Latency budget relative to the deepest state's exit cost;
		 * S64_MAX means "no constraint" → feed 1.0. */
		out[14] = (latency_req < S64_MAX && deepest_lat > 0)
			? (float)latency_req / (float)deepest_lat
			: 1.0f;
	}
	out[15] = log_results[3];

	d->last_predicted_ns = ktime_to_ns(sleep_length);
}

/* ================================================================
 * Selection helpers
 * ================================================================ */

/* Training label: the deepest state whose residency + exit latency was
 * fully covered by the actual sleep, scored by net sleep benefit.
 * Returns 0 (shallowest) when no deeper state qualifies. */
static int compute_ideal_state(struct cpuidle_driver *drv, u64 actual_ns)
{
	int i, best = 0;
	s64 best_score = 0;

	for (i = 1; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];
		s64 score;

		/*
		 * Responsiveness criterion: only consider a state "ideal" if
		 * the CPU slept long enough to cover both the minimum residency
		 * and the exit latency. This ensures the wakeup did not incur
		 * avoidable latency that the caller would have felt.
		 */
		if (actual_ns < s->target_residency_ns + s->exit_latency_ns)
			continue;

		/*
		 * Score: net sleep benefit with exit latency as a fair cost.
		 * The former (i+1) depth multiplier is removed; it biased the
		 * label toward deeper states regardless of actual benefit and
		 * worked against responsiveness.
		 */
		score = (s64)(actual_ns - s->target_residency_ns)
			- (s64)s->exit_latency_ns;

		if (score > best_score) {
			best = i;
			best_score = score;
		}
	}
	return best;
}

/* Index of the largest of the first n NN outputs (ties keep the lower index). */
static int nap_nn_argmax(const float *output, int n)
{
	int i, best = 0;

	for (i = 1; i < n; i++)
		if (output[i] > output[best])
			best = i;
	return best;
}

/* ================================================================
 * FPU entry point for nap_select
 *
 * Called within kernel_fpu_begin()/kernel_fpu_end().
 * Returns: selected idle state index (>= 0), or -1 to fall back
 * to the integer heuristic.
 * ================================================================ */

int nap_fpu_select(struct cpuidle_driver *drv,
		   struct cpuidle_device *dev,
		   struct nap_cpu_data *d)
{
	float features[NAP_NUM_FEATURES];
	float output[NAP_OUTPUT_SIZE];

	/* Handle deferred weight reset (set by sysfs or nap_enable) */
	if (unlikely(d->reset_pending)) {
		nap_init_weights(&d->weights, drv);
		d->converged = false;
		d->ema_accuracy = 0;
		d->stats.learn_count = 0;
		d->needs_learn = false;
		d->reset_pending = false;
	}

	/* Deferred learning (always, even during warmup): grade the
	 * previous cycle's prediction against what actually happened. */
	if (d->needs_learn) {
		int ideal = compute_ideal_state(drv, d->learn_actual_ns);
		int nn_best = nap_nn_argmax(d->nn_output, drv->state_count);
		bool hit = (nn_best == ideal);

		d->stats.learn_count++;

		/* Track NN accuracy for convergence detection.
		 * EMA with 1/16 decay: saturates at 64*16 = 1024 for a
		 * 100% hit rate, so convergence_thresh is on a 0..1024
		 * scale. Unsigned wraparound is benign here because
		 * ema_accuracy >= ema_accuracy >> 4 always. */
		if (!d->converged) {
			d->ema_accuracy += (hit ? 64 : 0)
					   - (d->ema_accuracy >> 4);

			if (d->stats.learn_count >= d->warmup_threshold &&
			    d->ema_accuracy >= d->convergence_thresh) {
				d->converged = true;
				pr_info("nap: cpu%d converged (ema=%u/%u, learns=%llu)\n",
					smp_processor_id(),
					d->ema_accuracy,
					d->convergence_thresh,
					d->stats.learn_count);
			}
		}

		/* Only backpropagate on mispredictions. */
		if (!hit)
			nap_nn_learn(d, ideal);
		d->needs_learn = false;
	}

	/* Feature extraction + NN forward pass */
	nap_extract_features(drv, dev, features);
	nap_nn_forward(features, output, d->hidden_out, &d->weights);

	/* Stash activations/inputs for the next deferred learning step. */
	memcpy(d->nn_output, output, sizeof(output));
	memcpy(d->features_f32, features, sizeof(features));

	if (unlikely(!d->converged))
		return -1; /* Caller uses heuristic */

	/* NN-based selection: argmax over enabled states that respect the
	 * current PM-QoS latency requirement. */
	{
		s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
		float best_score = output[0];
		int idx = 0, i;

		for (i = 1; i < drv->state_count; i++) {
			if (dev->states_usage[i].disable)
				continue;
			if (drv->states[i].exit_latency_ns > latency_req)
				continue;
			if (output[i] > best_score) {
				best_score = output[i];
				idx = i;
			}
		}
		return idx;
	}
}
diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
new file mode 100644
index 0000000000..6a0323012d
--- /dev/null
+++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
@@ -0,0 +1,162 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP
 *
 * Uses 256-bit ymm registers and vfmadd231ps for fused multiply-add.
 *
 * Must be called within kernel_fpu_begin/end.
 * Compiled with: CFLAGS += -mavx2 -mfma
 */

#include "nap.h"

/* Aligned load/store: GCC translates v8sf* dereference to vmovaps */
static inline v8sf v8sf_load(const float *p) { return *(const v8sf *)p; }
static inline void v8sf_store(float *p, v8sf v) { *(v8sf *)p = v; }

/* FMA: a*b+c → vfmadd231ps */
static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c)
{
	return __builtin_ia32_vfmaddps256(a, b, c);
}

/*
 * Forward pass: input[16] → hidden[16] (ReLU) → output[10].
 * hidden_save receives the post-ReLU activations for later backprop.
 */
void nap_nn_forward_avx2(const float *input,
			 float *output,
			 float *hidden_save,
			 const struct nap_weights *w)
{
	int j;

	/* === Hidden layer: 16 outputs = 2×ymm === */
	v8sf acc0 = v8sf_load(&w->b_h1[0]);
	v8sf acc1 = v8sf_load(&w->b_h1[8]);

	/* Accumulate one broadcast input lane against its weight row. */
	for (j = 0; j < NAP_INPUT_SIZE; j++) {
		v8sf x = V8SF_SET1(input[j]);
		acc0 = v8sf_fmadd(v8sf_load(&w->w_h1[j][0]), x, acc0);
		acc1 = v8sf_fmadd(v8sf_load(&w->w_h1[j][8]), x, acc1);
	}

	/* ReLU: max(0, x) */
	v8sf zero = V8SF_ZERO;
	acc0 = __builtin_ia32_maxps256(acc0, zero);
	acc1 = __builtin_ia32_maxps256(acc1, zero);
	v8sf_store(&hidden_save[0], acc0);
	v8sf_store(&hidden_save[8], acc1);

	/*
	 * Output layer: 10 outputs = 1×ymm (8) + scalar (2)
	 * w_out[16][10] rows are 40 bytes apart — use unaligned loads.
	 */
	v8sf out_acc = v8sf_loadu(&w->b_out[0]);
	float out8 = w->b_out[8], out9 = w->b_out[9];

	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf h = V8SF_SET1(hidden_save[j]);
		out_acc = v8sf_fmadd(v8sf_loadu(&w->w_out[j][0]), h, out_acc);
		out8 += hidden_save[j] * w->w_out[j][8];
		out9 += hidden_save[j] * w->w_out[j][9];
	}

	v8sf_storeu(&output[0], out_acc);
	output[8] = out8;
	output[9] = out9;
}

/* ymm clamp: max(min(v, hi), lo) */
static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi)
{
	return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo);
}

/*
 * Online learning (backpropagation) — AVX2+FMA
 *
 * Output layer (10 neurons): 1×ymm + 2 scalars, with FMA
 * Hidden layer (16 neurons): 2×ymm, with FMA
 *
 * SGD with per-element gradient clamping to ±max_grad_norm; the output
 * error is (nn_output - one_hot(ideal)), both stashed by nap_fpu_select.
 *
 * NOTE(review): d_hid below is computed from the *already-updated*
 * output weights; textbook backprop uses the pre-update weights seen by
 * the forward pass. With a small learning rate the difference is minor,
 * but confirm the ordering is intentional (it matches the sse2/avx512
 * variants, so any change must be made in all three).
 */
void nap_nn_learn_avx2(struct nap_cpu_data *d, int ideal)
{
	int i, j;
	float d_out[NAP_OUTPUT_SIZE] __aligned(16);
	float d_hid[NAP_HIDDEN_SIZE] __aligned(32);
	/* Tunables arrive as integer thousandths (FPU-free sysfs side). */
	float lr = (float)d->learning_rate_millths / 1000.0f;
	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
	v8sf v_neg_lr = V8SF_SET1(-lr);
	v8sf v_cl_hi = V8SF_SET1(clamp_val);
	v8sf v_cl_lo = V8SF_SET1(-clamp_val);
	v8sf vd8;
	float do8, do9;

	/* Output error: nn_output - one_hot(ideal) */
	__builtin_memcpy(d_out, d->nn_output, sizeof(d_out));
	d_out[ideal] -= 1.0f;

	vd8 = v8sf_loadu(&d_out[0]);
	do8 = d_out[8];
	do9 = d_out[9];

	/* Output weight update: w_out[j][i] -= lr * clamp(h[j] * d_out[i]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf vh = V8SF_SET1(d->hidden_out[j]);
		v8sf grad = v8sf_clamp(vh * vd8, v_cl_lo, v_cl_hi);

		v8sf_storeu(&d->weights.w_out[j][0],
			    v8sf_fmadd(v_neg_lr, grad,
				       v8sf_loadu(&d->weights.w_out[j][0])));
		d->weights.w_out[j][8] -= lr * fclampf(
			d->hidden_out[j] * do8, -clamp_val, clamp_val);
		d->weights.w_out[j][9] -= lr * fclampf(
			d->hidden_out[j] * do9, -clamp_val, clamp_val);
	}

	/* Output bias update: b_out[i] -= lr * clamp(d_out[i]) */
	v8sf_storeu(&d->weights.b_out[0],
		    v8sf_fmadd(v_neg_lr, v8sf_clamp(vd8, v_cl_lo, v_cl_hi),
			       v8sf_loadu(&d->weights.b_out[0])));
	d->weights.b_out[8] -= lr * fclampf(do8, -clamp_val, clamp_val);
	d->weights.b_out[9] -= lr * fclampf(do9, -clamp_val, clamp_val);

	/* Hidden gradient: d_hid[j] = relu'(h[j]) * dot(w_out[j][:], d_out[:]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf s = v8sf_loadu(&d->weights.w_out[j][0]) * vd8;
		/* hsum ymm: extract hi128, add to lo128, then hsum xmm */
		v4sf lo = __builtin_ia32_vextractf128_ps256(s, 0);
		v4sf hi = __builtin_ia32_vextractf128_ps256(s, 1);
		v4sf s4 = lo + hi;
		float sum = s4[0] + s4[1] + s4[2] + s4[3]
			    + d->weights.w_out[j][8] * do8
			    + d->weights.w_out[j][9] * do9;
		/* relu'(h) is 1 for h > 0, else 0. */
		d_hid[j] = (d->hidden_out[j] > 0) ? sum : 0;
	}

	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
	{
		v8sf dh0 = v8sf_load(&d_hid[0]);
		v8sf dh1 = v8sf_load(&d_hid[8]);

		for (i = 0; i < NAP_INPUT_SIZE; i++) {
			v8sf vf = V8SF_SET1(d->features_f32[i]);
			v8sf *w = (v8sf *)&d->weights.w_h1[i][0];

			w[0] = v8sf_fmadd(v_neg_lr,
					  v8sf_clamp(vf * dh0, v_cl_lo, v_cl_hi),
					  w[0]);
			w[1] = v8sf_fmadd(v_neg_lr,
					  v8sf_clamp(vf * dh1, v_cl_lo, v_cl_hi),
					  w[1]);
		}

		/* Hidden bias update */
		{
			v8sf *b = (v8sf *)&d->weights.b_h1[0];

			b[0] = v8sf_fmadd(v_neg_lr,
					  v8sf_clamp(dh0, v_cl_lo, v_cl_hi),
					  b[0]);
			b[1] = v8sf_fmadd(v_neg_lr,
					  v8sf_clamp(dh1, v_cl_lo, v_cl_hi),
					  b[1]);
		}
	}
}
diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx512.c b/drivers/cpuidle/governors/nap/nap_nn_avx512.c
new file mode 100644
index 0000000000..11aa4a8ba3
--- /dev/null
+++ b/drivers/cpuidle/governors/nap/nap_nn_avx512.c
@@ -0,0 +1,197 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * nap_nn_avx512.c — AVX-512F forward pass and backpropagation for the nap MLP
 *
 * Uses 512-bit zmm registers. Hidden layer fits in a single zmm (16 floats).
 * Output layer uses ymm (AVX2+FMA) for 8 outputs + 2 scalars.
 *
 * Must be called within kernel_fpu_begin/end.
 * Compiled with: CFLAGS += -mavx512f
 */

#include "nap.h"

/* __mmask16 is not available without immintrin.h in kernel space */
typedef unsigned short __mmask16;

/*
 * Portable zmm max/min wrappers.
 *
 * GCC exposes masked 5-argument builtins:
 *   __builtin_ia32_maxps512_mask(a, b, passthrough, mask, rounding)
 * Clang exposes unmasked 3-argument builtins:
 *   __builtin_ia32_maxps512(a, b, rounding)
 * Both produce the same vmaxps/vminps zmm instruction when the mask
 * is all-ones (0xFFFF) and the rounding mode is current-direction (0x04).
 */
#ifdef __clang__
static inline v16sf v16sf_max(v16sf a, v16sf b)
{
	return __builtin_ia32_maxps512(a, b, 0x04 /* _MM_FROUND_CUR_DIRECTION */);
}
static inline v16sf v16sf_min(v16sf a, v16sf b)
{
	return __builtin_ia32_minps512(a, b, 0x04);
}
#else
static inline v16sf v16sf_max(v16sf a, v16sf b)
{
	return __builtin_ia32_maxps512_mask(a, b, a, (__mmask16)0xFFFF, 0x04);
}
static inline v16sf v16sf_min(v16sf a, v16sf b)
{
	return __builtin_ia32_minps512_mask(a, b, a, (__mmask16)0xFFFF, 0x04);
}
#endif

/* FMA: a*b+c → vfmadd231ps zmm
 * gcc-12+ uses _mask variant with full mask (0xFFFF) for 16 lanes.
 */
static inline v16sf v16sf_fmadd(v16sf a, v16sf b, v16sf c)
{
	return __builtin_ia32_vfmaddps512_mask(a, b, c,
					       (__mmask16)0xFFFF,
					       0x04 /* _MM_FROUND_CUR_DIRECTION */);
}

/* FMA for ymm (output layer): a*b+c → vfmadd231ps ymm */
static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c)
{
	return __builtin_ia32_vfmaddps256(a, b, c);
}

/*
 * Forward pass: input[16] → hidden[16] (ReLU, one zmm) → output[10].
 * hidden_save receives the post-ReLU activations for later backprop.
 */
void nap_nn_forward_avx512(const float *input,
			   float *output,
			   float *hidden_save,
			   const struct nap_weights *w)
{
	int j;

	/* === Hidden layer: 16 outputs = 1×zmm === */
	v16sf acc = *(const v16sf *)&w->b_h1[0];

	/* Broadcast each input lane against its full 16-wide weight row. */
	for (j = 0; j < NAP_INPUT_SIZE; j++) {
		v16sf x = V16SF_SET1(input[j]);
		acc = v16sf_fmadd(*(const v16sf *)&w->w_h1[j][0], x, acc);
	}

	/* ReLU: max(0, x) */
	v16sf zero = V16SF_SET1(0.0f);
	acc = v16sf_max(acc, zero);
	*(v16sf *)&hidden_save[0] = acc;

	/* === Output layer: 10 outputs = 1×ymm (8) + 2 scalars ===
	 * w_out[16][10] rows are 40 bytes apart — unaligned loads. */
	v8sf out_acc = v8sf_loadu(&w->b_out[0]);
	float out8 = w->b_out[8], out9 = w->b_out[9];

	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf h = V8SF_SET1(hidden_save[j]);
		out_acc = v8sf_fmadd(v8sf_loadu(&w->w_out[j][0]), h, out_acc);
		out8 += hidden_save[j] * w->w_out[j][8];
		out9 += hidden_save[j] * w->w_out[j][9];
	}

	v8sf_storeu(&output[0], out_acc);
	output[8] = out8;
	output[9] = out9;
}

/* ymm clamp: max(min(v, hi), lo) */
static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi)
{
	return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo);
}

/* zmm clamp: max(min(v, hi), lo) */
static inline v16sf v16sf_clamp(v16sf v, v16sf lo, v16sf hi)
{
	return v16sf_max(v16sf_min(v, hi), lo);
}

/*
 * Online learning (backpropagation) — AVX-512F
 *
 * Output layer (10 neurons): 1×ymm + 2 scalars, with FMA
 * Hidden layer (16 neurons): 1×zmm, with FMA
 *
 * SGD with per-element gradient clamping to ±max_grad_norm; the output
 * error is (nn_output - one_hot(ideal)).
 *
 * NOTE(review): d_hid below is computed from the *already-updated*
 * output weights; textbook backprop uses the pre-update weights. Minor
 * at small learning rates — confirm intentional (mirrors sse2/avx2).
 */
void nap_nn_learn_avx512(struct nap_cpu_data *d, int ideal)
{
	int i, j;
	float d_out[NAP_OUTPUT_SIZE] __aligned(16);
	float d_hid[NAP_HIDDEN_SIZE] __aligned(64);
	/* Tunables arrive as integer thousandths (FPU-free sysfs side). */
	float lr = (float)d->learning_rate_millths / 1000.0f;
	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
	v8sf v8_neg_lr = V8SF_SET1(-lr);
	v8sf v8_cl_hi = V8SF_SET1(clamp_val);
	v8sf v8_cl_lo = V8SF_SET1(-clamp_val);
	v16sf v16_neg_lr = V16SF_SET1(-lr);
	v16sf v16_cl_hi = V16SF_SET1(clamp_val);
	v16sf v16_cl_lo = V16SF_SET1(-clamp_val);
	v8sf vd8;
	float do8, do9;

	/* Output error: nn_output - one_hot(ideal) */
	__builtin_memcpy(d_out, d->nn_output, sizeof(d_out));
	d_out[ideal] -= 1.0f;

	vd8 = v8sf_loadu(&d_out[0]);
	do8 = d_out[8];
	do9 = d_out[9];

	/* Output weight update: w_out[j][i] -= lr * clamp(h[j] * d_out[i]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf vh = V8SF_SET1(d->hidden_out[j]);
		v8sf grad = v8sf_clamp(vh * vd8, v8_cl_lo, v8_cl_hi);

		v8sf_storeu(&d->weights.w_out[j][0],
			    v8sf_fmadd(v8_neg_lr, grad,
				       v8sf_loadu(&d->weights.w_out[j][0])));
		d->weights.w_out[j][8] -= lr * fclampf(
			d->hidden_out[j] * do8, -clamp_val, clamp_val);
		d->weights.w_out[j][9] -= lr * fclampf(
			d->hidden_out[j] * do9, -clamp_val, clamp_val);
	}

	/* Output bias update */
	v8sf_storeu(&d->weights.b_out[0],
		    v8sf_fmadd(v8_neg_lr, v8sf_clamp(vd8, v8_cl_lo, v8_cl_hi),
			       v8sf_loadu(&d->weights.b_out[0])));
	d->weights.b_out[8] -= lr * fclampf(do8, -clamp_val, clamp_val);
	d->weights.b_out[9] -= lr * fclampf(do9, -clamp_val, clamp_val);

	/* Hidden gradient: d_hid[j] = relu'(h[j]) * dot(w_out[j][:], d_out[:]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v8sf s = v8sf_loadu(&d->weights.w_out[j][0]) * vd8;
		/* hsum ymm: extract hi128, add to lo128, then hsum xmm */
		v4sf lo = __builtin_ia32_vextractf128_ps256(s, 0);
		v4sf hi = __builtin_ia32_vextractf128_ps256(s, 1);
		v4sf s4 = lo + hi;
		float sum = s4[0] + s4[1] + s4[2] + s4[3]
			    + d->weights.w_out[j][8] * do8
			    + d->weights.w_out[j][9] * do9;
		/* relu'(h) is 1 for h > 0, else 0. */
		d_hid[j] = (d->hidden_out[j] > 0) ? sum : 0;
	}

	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j])
	 * 16 hidden neurons = 1×zmm per input row */
	{
		v16sf dh = *(const v16sf *)&d_hid[0];

		for (i = 0; i < NAP_INPUT_SIZE; i++) {
			v16sf vf = V16SF_SET1(d->features_f32[i]);
			v16sf *w = (v16sf *)&d->weights.w_h1[i][0];

			*w = v16sf_fmadd(v16_neg_lr,
					 v16sf_clamp(vf * dh,
						     v16_cl_lo, v16_cl_hi),
					 *w);
		}

		/* Hidden bias update */
		{
			v16sf *b = (v16sf *)&d->weights.b_h1[0];

			*b = v16sf_fmadd(v16_neg_lr,
					 v16sf_clamp(dh, v16_cl_lo, v16_cl_hi),
					 *b);
		}
	}
}
diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
new file mode 100644
index 0000000000..8993c8dd9d
--- /dev/null
+++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
@@ -0,0 +1,169 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP
 *
 * Baseline implementation using SSE2, which is always available on x86_64.
 * No FMA — uses separate mul + add (2 instructions per MAC).
 *
 * Must be called within kernel_fpu_begin/end.
 * Compiled with: CFLAGS += -msse2
 */

#include "nap.h"

/* Aligned load/store */
static inline v4sf v4sf_load(const float *p) { return *(const v4sf *)p; }
static inline void v4sf_store(float *p, v4sf v) { *(v4sf *)p = v; }

/* ReLU helper */
static inline v4sf v4sf_max(v4sf a, v4sf b)
{
	return __builtin_ia32_maxps(a, b);
}

/*
 * Forward pass: input[16] → hidden[16] (ReLU, 4×xmm) → output[10].
 * hidden_save receives the post-ReLU activations for later backprop.
 * v4sf_loadu/v4sf_storeu/V4SF_SET1/fclampf/v4sf_clamp are presumably
 * supplied by nap.h (not visible in this file).
 */
void nap_nn_forward_sse2(const float *input,
			 float *output,
			 float *hidden_save,
			 const struct nap_weights *w)
{
	int j;

	/* === Hidden layer: 16 outputs = 4×xmm === */
	v4sf acc0 = v4sf_load(&w->b_h1[0]);
	v4sf acc1 = v4sf_load(&w->b_h1[4]);
	v4sf acc2 = v4sf_load(&w->b_h1[8]);
	v4sf acc3 = v4sf_load(&w->b_h1[12]);

	/* Broadcast each input lane against its 16-wide weight row. */
	for (j = 0; j < NAP_INPUT_SIZE; j++) {
		v4sf x = V4SF_SET1(input[j]);
		acc0 += v4sf_load(&w->w_h1[j][0]) * x;
		acc1 += v4sf_load(&w->w_h1[j][4]) * x;
		acc2 += v4sf_load(&w->w_h1[j][8]) * x;
		acc3 += v4sf_load(&w->w_h1[j][12]) * x;
	}

	/* ReLU */
	v4sf zero = V4SF_SET1(0.0f);
	acc0 = v4sf_max(acc0, zero);
	acc1 = v4sf_max(acc1, zero);
	acc2 = v4sf_max(acc2, zero);
	acc3 = v4sf_max(acc3, zero);
	v4sf_store(&hidden_save[0], acc0);
	v4sf_store(&hidden_save[4], acc1);
	v4sf_store(&hidden_save[8], acc2);
	v4sf_store(&hidden_save[12], acc3);

	/*
	 * Output layer: 10 outputs = 2×xmm (8) + scalar (2)
	 * w_out[16][10] rows are 40 bytes apart — use unaligned loads.
	 */
	v4sf out0 = v4sf_loadu(&w->b_out[0]);
	v4sf out1 = v4sf_loadu(&w->b_out[4]);
	float out8 = w->b_out[8], out9 = w->b_out[9];

	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v4sf h = V4SF_SET1(hidden_save[j]);
		out0 += v4sf_loadu(&w->w_out[j][0]) * h;
		out1 += v4sf_loadu(&w->w_out[j][4]) * h;
		out8 += hidden_save[j] * w->w_out[j][8];
		out9 += hidden_save[j] * w->w_out[j][9];
	}

	v4sf_storeu(&output[0], out0);
	v4sf_storeu(&output[4], out1);
	output[8] = out8;
	output[9] = out9;
}

/*
 * Online learning (backpropagation) — SSE2
 *
 * Output layer (10 neurons): 2×xmm + 2 scalars
 * Hidden layer (16 neurons): 4×xmm
 *
 * SGD with per-element gradient clamping to ±max_grad_norm; the output
 * error is (nn_output - one_hot(ideal)).
 *
 * NOTE(review): d_hid below is computed from the *already-updated*
 * output weights; textbook backprop uses the pre-update weights. Minor
 * at small learning rates — confirm intentional (mirrors avx2/avx512).
 */
void nap_nn_learn_sse2(struct nap_cpu_data *d, int ideal)
{
	int i, j;
	float d_out[NAP_OUTPUT_SIZE] __aligned(16);
	float d_hid[NAP_HIDDEN_SIZE] __aligned(16);
	/* Tunables arrive as integer thousandths (FPU-free sysfs side). */
	float lr = (float)d->learning_rate_millths / 1000.0f;
	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
	v4sf v_lr = V4SF_SET1(lr);
	v4sf v_cl_hi = V4SF_SET1(clamp_val);
	v4sf v_cl_lo = V4SF_SET1(-clamp_val);
	v4sf vd0, vd1;
	float do8, do9;

	/* Output error: nn_output - one_hot(ideal) */
	__builtin_memcpy(d_out, d->nn_output, sizeof(d_out));
	d_out[ideal] -= 1.0f;

	vd0 = *(const v4sf *)&d_out[0];
	vd1 = *(const v4sf *)&d_out[4];
	do8 = d_out[8];
	do9 = d_out[9];

	/* Output weight update: w_out[j][i] -= lr * clamp(h[j] * d_out[i]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v4sf vh = V4SF_SET1(d->hidden_out[j]);

		v4sf_storeu(&d->weights.w_out[j][0],
			    v4sf_loadu(&d->weights.w_out[j][0]) -
			    v_lr * v4sf_clamp(vh * vd0, v_cl_lo, v_cl_hi));
		v4sf_storeu(&d->weights.w_out[j][4],
			    v4sf_loadu(&d->weights.w_out[j][4]) -
			    v_lr * v4sf_clamp(vh * vd1, v_cl_lo, v_cl_hi));
		d->weights.w_out[j][8] -= lr * fclampf(
			d->hidden_out[j] * do8, -clamp_val, clamp_val);
		d->weights.w_out[j][9] -= lr * fclampf(
			d->hidden_out[j] * do9, -clamp_val, clamp_val);
	}

	/* Output bias update: b_out[i] -= lr * clamp(d_out[i]) */
	v4sf_storeu(&d->weights.b_out[0],
		    v4sf_loadu(&d->weights.b_out[0]) -
		    v_lr * v4sf_clamp(vd0, v_cl_lo, v_cl_hi));
	v4sf_storeu(&d->weights.b_out[4],
		    v4sf_loadu(&d->weights.b_out[4]) -
		    v_lr * v4sf_clamp(vd1, v_cl_lo, v_cl_hi));
	d->weights.b_out[8] -= lr * fclampf(do8, -clamp_val, clamp_val);
	d->weights.b_out[9] -= lr * fclampf(do9, -clamp_val, clamp_val);

	/* Hidden gradient: d_hid[j] = relu'(h[j]) * dot(w_out[j][:], d_out[:]) */
	for (j = 0; j < NAP_HIDDEN_SIZE; j++) {
		v4sf s = v4sf_loadu(&d->weights.w_out[j][0]) * vd0
			 + v4sf_loadu(&d->weights.w_out[j][4]) * vd1;
		float sum = s[0] + s[1] + s[2] + s[3]
			    + d->weights.w_out[j][8] * do8
			    + d->weights.w_out[j][9] * do9;
		/* relu'(h) is 1 for h > 0, else 0. */
		d_hid[j] = (d->hidden_out[j] > 0) ? sum : 0;
	}

	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
	{
		v4sf dh0 = *(const v4sf *)&d_hid[0];
		v4sf dh1 = *(const v4sf *)&d_hid[4];
		v4sf dh2 = *(const v4sf *)&d_hid[8];
		v4sf dh3 = *(const v4sf *)&d_hid[12];

		for (i = 0; i < NAP_INPUT_SIZE; i++) {
			v4sf vf = V4SF_SET1(d->features_f32[i]);
			v4sf *w = (v4sf *)&d->weights.w_h1[i][0];

			w[0] -= v_lr * v4sf_clamp(vf * dh0, v_cl_lo, v_cl_hi);
			w[1] -= v_lr * v4sf_clamp(vf * dh1, v_cl_lo, v_cl_hi);
			w[2] -= v_lr * v4sf_clamp(vf * dh2, v_cl_lo, v_cl_hi);
			w[3] -= v_lr * v4sf_clamp(vf * dh3, v_cl_lo, v_cl_hi);
		}

		/* Hidden bias update: b_h1[j] -= lr * clamp(d_hid[j]) */
		{
			v4sf *b = (v4sf *)&d->weights.b_h1[0];

			b[0] -= v_lr * v4sf_clamp(dh0, v_cl_lo, v_cl_hi);
			b[1] -= v_lr * v4sf_clamp(dh1, v_cl_lo, v_cl_hi);
			b[2] -= v_lr * v4sf_clamp(dh2, v_cl_lo, v_cl_hi);
			b[3] -= v_lr * v4sf_clamp(dh3, v_cl_lo, v_cl_hi);
		}
	}
}
-- 
2.34.1