From 6078869bd7dbdacb076d36a8fc8e3ee592131413 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Thu, 11 Jun 2026 09:08:40 +0900 Subject: [PATCH] linux7.1-rc5-lru_marie-0.3.5 --- include/linux/lru_marie.h | 494 ++++++ include/linux/mm_inline.h | 104 +- include/linux/mmzone.h | 30 + include/linux/swap.h | 18 + mm/Kconfig | 30 + mm/Makefile | 1 + mm/huge_memory.c | 26 +- mm/internal.h | 55 + mm/lru_marie/Makefile | 19 + mm/lru_marie/account.h | 176 ++ mm/lru_marie/bitmap.c | 166 ++ mm/lru_marie/bitmap.h | 228 +++ mm/lru_marie/core.c | 1982 +++++++++++++++++++++++ mm/lru_marie/drain_scope.h | 87 + mm/lru_marie/pfn_install.h | 92 ++ mm/lru_marie/prefetch.h | 111 ++ mm/lru_marie/simd.h | 99 ++ mm/lru_marie/simd_generic.c | 46 + mm/lru_marie/simd_x86.c | 167 ++ mm/lru_marie/simd_x86_avx2.S | 214 +++ mm/lru_marie/simd_x86_avx512.S | 199 +++ mm/lru_marie/simd_x86_sse2.S | 215 +++ mm/lru_marie/state.c | 2745 ++++++++++++++++++++++++++++++++ mm/lru_marie/state.h | 1335 ++++++++++++++++ mm/lru_marie/version.h | 22 + mm/lru_marie/walker.c | 961 +++++++++++ mm/memcontrol-v1.c | 13 + mm/memcontrol.c | 35 + mm/mm_init.c | 3 + mm/page_alloc.c | 76 + mm/page_io.c | 187 +++ mm/rmap.c | 6 + mm/swap.c | 248 ++- mm/swap.h | 4 + mm/vmscan.c | 372 ++++- 35 files changed, 10546 insertions(+), 20 deletions(-) create mode 100644 include/linux/lru_marie.h create mode 100644 mm/lru_marie/Makefile create mode 100644 mm/lru_marie/account.h create mode 100644 mm/lru_marie/bitmap.c create mode 100644 mm/lru_marie/bitmap.h create mode 100644 mm/lru_marie/core.c create mode 100644 mm/lru_marie/drain_scope.h create mode 100644 mm/lru_marie/pfn_install.h create mode 100644 mm/lru_marie/prefetch.h create mode 100644 mm/lru_marie/simd.h create mode 100644 mm/lru_marie/simd_generic.c create mode 100644 mm/lru_marie/simd_x86.c create mode 100644 mm/lru_marie/simd_x86_avx2.S create mode 100644 mm/lru_marie/simd_x86_avx512.S create mode 100644 mm/lru_marie/simd_x86_sse2.S create mode 100644 
mm/lru_marie/state.c create mode 100644 mm/lru_marie/state.h create mode 100644 mm/lru_marie/version.h create mode 100644 mm/lru_marie/walker.c diff --git a/include/linux/lru_marie.h b/include/linux/lru_marie.h new file mode 100644 index 0000000000..7ef003b89f --- /dev/null +++ b/include/linux/lru_marie.h @@ -0,0 +1,494 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LRU_MARIE_H +#define _LINUX_LRU_MARIE_H + +/* + * Marie LRU — public API. + * + * Marie represents each folio's reclaim state as a single byte in a + * flat per-PFN array allocated once at boot (see mm/lru_marie/state.h + * for the byte layout). install / delete / aging are single byte + * writes — no allocation in any fault-path operation, no per-folio + * linked-list linkage. isolate batches the array with a persistent + * cursor and SIMD scan. + * + * The vmscan core invokes Marie only when lru_marie_enabled() is true; + * otherwise the in-tree reclaim paths run unchanged. + * + * Enabled by default; opt out at boot with `lru_marie=0` on the kernel + * command line, or at runtime via /sys/kernel/mm/lru_marie/enabled + * (echo 1 / echo 0). + * + * This header exposes only the thin dispatch surface that mm/vmscan.c, + * mm/swap.c, mm/memcontrol.c, etc. need to know about. Everything + * else lives inside mm/lru_marie/. + */ + +#include +#include +#include +#include +#include + +struct folio; +struct lruvec; +struct mem_cgroup; +struct page_vma_mapped_walk; +struct pglist_data; +struct scan_control; + +#ifdef CONFIG_LRU_MARIE + +DECLARE_STATIC_KEY_TRUE(lru_marie_enabled_key); + +/** + * lru_marie_enabled - is the Marie reclaim path currently active? + * + * Inlined static-branch check. Default is enabled; the static-key + * compiles into a single unconditional jump that the predictor + * resolves in zero cycles. The MGLRU/Legacy paths cost nil when Marie + * is enabled (the common case) since the branch falls through to the + * Marie-side code without any conditional dispatch overhead. 
+ */ +static inline bool lru_marie_enabled(void) +{ + return static_branch_likely(&lru_marie_enabled_key); +} + +DECLARE_STATIC_KEY_FALSE(marie_state_ready_key); + +/** + * marie_state_ready - has the per-PFN marie_state[] array been allocated? + * + * Distinct from lru_marie_enabled(): the enable key is the runtime + * reclaim-policy toggle (flipped by the sysfs knob and during the + * disable/enable transition), whereas this key latches true once + * marie_state[] is allocated at first-enable and NEVER flips back -- + * the array is never freed for the kernel's lifetime. + * + * The page-free hook (lru_marie_free_page_hook) must gate on THIS key, + * not on lru_marie_enabled(): during a disable transition the enable + * key is already false while marie_drain walks the bitmaps, but freed + * pages still carry stale TRACKED bits that must be wiped at the buddy + * handoff so the drain walk never dereferences a re-allocated folio's + * poisoned list head. Gating the hook on the enable key would skip + * exactly that window. + */ +static inline bool marie_state_ready(void) +{ + return static_branch_unlikely(&marie_state_ready_key); +} + +/** + * lru_marie_mark_accessed - Marie's hot-signal entry point for folio_mark_accessed. + * + * Bumps @folio's Marie tier in the per-PFN marie_state[] byte. Tier is + * the canonical hotness signal in Marie: the walker bumps tier on + * young-bit hits, and this helper lets external "user just touched" + * callers (folio_mark_accessed) feed the same channel. When tier is at + * MARIE_TIER_MAX the helper triggers a synchronous in-place promote + * (marie_state_move_to_gen to head_gen at tier 0) inside + * marie_state_inc_tier. Calling this from the user access hot path + * therefore costs at most one byte write plus a possible single CAS; + * no slab alloc, no enqueue. + * + * Why not folio_set_referenced(): Marie's tier-based gen rotation already + * encodes "recently accessed". 
Setting PG_referenced in addition produced + * a double-counting hot signal that the reclaim path had to reconcile, + * and the reconciliation rule (any of {PG_referenced, PG_active} treated + * as promote-in-place during reclaim) starved kswapd reclaim under + * fault-burst workloads. + */ +void lru_marie_mark_accessed(struct folio *folio); + +/** + * folio_marie_get_tier - return @folio's Marie hotness tier (0..3). + * + * Reads the per-PFN marie_state[folio_pfn(folio)] byte's MARIE_PFN_TIER + * field. Returns 0 if Marie is disabled, the PFN is out of range, or + * the folio is not Marie-tracked. + */ +unsigned int folio_marie_get_tier(const struct folio *folio); + +/** + * lru_marie_test_tracked - is @folio currently tracked by Marie? + * + * Reads the per-PFN marie_state[folio_pfn(folio)] byte's TRACKED bit. + * Returns false if Marie is disabled, the PFN is out of range, or the + * folio is not Marie-tracked. + * + * Used by mm/swap.c per-cpu folio_batch entry points (rotate / activate + * / deactivate / lazyfree) to skip queueing Marie folios: those paths + * do legacy lruvec_del_folio + lruvec_add_folio, whose list_del/list_add + * assume the folio is on a legacy lruvec list. Marie folios sit on a + * self-loop (folio->lru points at itself), not on a legacy list, so a + * legacy del/add would corrupt the list. (mz->lru_zone_size is balanced + * for Marie folios now -- marie_update_lru_size credits it at install -- + * so the hazard is list corruption, not count underflow.) + */ +bool lru_marie_test_tracked(const struct folio *folio); + +/* + * Per-cpu folio_batch LRU-op interface. 
mm/swap.c's folio_activate / + * folio_deactivate / deactivate_file_folio / folio_rotate_reclaimable / + * folio_mark_lazyfree each call the matching hook below; a true return + * means Marie owns the folio and has applied the operation directly on + * its per-PFN state, so the caller must NOT queue the folio onto the + * legacy per-cpu folio_batch (which assumes legacy-LRU list/mz invariants + * Marie folios break). A false return (Marie disabled or folio untracked) + * lets the caller fall through to the legacy folio_batch path unchanged. + * + * This mirrors lru_marie_add_folio / lru_marie_del_folio's bool contract: + * the Marie-specific semantics live here in mm/lru_marie/, not as inline + * gates scattered across mm/swap.c. + * + * Marie-state equivalents: + * deactivate / _file -> demote (move to oldest gen, tier 0) [MADV_COLD] + * lazyfree -> clear swapbacked + demote [MADV_FREE] + * activate / rotate -> no-op (skip the batch only) + * + * activate / rotate are reclaim-internal hints: Marie already decides + * hotness via its tier vote in folio_check_references and orders reclaim + * by gen aging, so promoting/rotating here would only fight reclaim (an + * activate-promote starves it under all-hot workloads). Only the explicit + * user madvise paths (deactivate=MADV_COLD, lazyfree=MADV_FREE) map to a + * real Marie-state change. + */ +bool lru_marie_activate(struct folio *folio); +bool lru_marie_deactivate(struct folio *folio); +bool lru_marie_rotate(struct folio *folio); +bool lru_marie_lazyfree(struct folio *folio); + +/** + * lru_marie_free_page_hook - canonical per-PFN state teardown at buddy + * handoff. + * + * Invoked from mm/page_alloc.c::free_pages_prepare for every page about + * to enter the buddy allocator. 
When marie_state[pfn] still carries + * TRACKED -- which happens whenever the reclaim isolate path + * (marie_evict_counters_only) decremented counters but intentionally + * preserved the state byte so install_local's TRACKED early-out kept + * blocking concurrent installs during shrink_folio_list -- this wipes + * the byte, the global (type, gen, tier) bitmap bit, and the + * gen_occupied slot in one lock-free pass. + * + * After this hook the next install at the same PFN starts from a clean + * state byte regardless of how quickly the page is re-allocated; no + * deferred drop pass is needed at the reclaim caller side. + * + * Static-branch gated at the call site by marie_state_ready(), not by + * lru_marie_enabled(), so the wipe still runs during a disable transition. + */ +void lru_marie_free_page_hook(unsigned long pfn); + +/* + * Return value of lru_marie_shrink_lruvec(): a mask of the LRU type(s) the + * Marie pick driver actually scanned this call. shrink_lruvec's legacy orphan + * drain reclaims ONLY these types (it zeroes the nr[] of any unset type), so + * it never cuts a type Marie's swappiness / clean_min_ratio / ANON_STRICT + * policy protected -- unlike stock get_scan_count, which the legacy drain + * would otherwise follow blindly (SCAN_EQUAL at priority 0, etc.). + */ +#define MARIE_DRAIN_ANON 0x1u +#define MARIE_DRAIN_FILE 0x2u + +/** + * lru_marie_shrink_lruvec - Marie's replacement for shrink_lruvec(). + * + * Called from mm/vmscan.c shrink_lruvec() when lru_marie_enabled() is true. + * Updates sc->nr_reclaimed in place. Returns a MARIE_DRAIN_* mask of the + * type(s) it scanned so the caller's legacy orphan drain can mirror the pick + * policy instead of running stock get_scan_count's. + */ +unsigned int lru_marie_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc); + +/** + * lru_marie_exit_memcg - drop any per-lruvec Marie state held for @memcg. 
+ * + * Called from mm/memcontrol.c during memcg teardown, alongside the + * MGLRU lru_gen_exit_memcg() hook. 
Walks the memcg's lruvecs and + * removes them from Marie's side-channel xarray. Always safe to call, + * even when lru_marie_enabled() is false (a memcg may have been allocated + * while Marie was on, then Marie was disabled, then the memcg dies). + */ +void lru_marie_exit_memcg(struct mem_cgroup *memcg); + +/** + * lru_marie_offline_memcg - drain all lruvecs for @memcg at css_offline time. + * + * Called before css_rstat_exit() frees rstat_cpu, so lru_gen_fill_lruvec() + * inside the drain can safely call mod_memcg_lruvec_state(). Also sets + * mlv->offline on each lruvec so subsequent lru_marie_add_folio calls + * bail out and legacy lists stay empty through css_free. + */ +void lru_marie_offline_memcg(struct mem_cgroup *memcg); + +/** + * lru_marie_memcg_alloc - allocate per-memcg occupancy bitmap for @memcg. + * + * Cmdline-gated (lru_marie.memcg_bitmap=1). When the gate is off this + * is a no-op; the scan path falls back to per-candidate folio_memcg() + * lookup. When the gate is on, allocates max_pfn bits keyed by memcg + * pointer in an xarray so the scan can AND the gen bitmap with the + * memcg bitmap, eliding the per-candidate folio cacheline touch on + * cgroup-targeted reclaim. + * + * Returns 0 on success or no-op, -ENOMEM on alloc failure (caller + * may continue -- scan falls back to folio_memcg lookup). + * + * Called from mm/memcontrol.c mem_cgroup_alloc, GFP_KERNEL context. + */ +int lru_marie_memcg_alloc(struct mem_cgroup *memcg); + +/** + * lru_marie_reparent_lruvec - migrate every Marie-tracked folio from @child_lv + * into @parent_lv at memcg reparenting. + * @child_lv: child memcg's lruvec (memcg being torn down) + * @parent_lv: parent memcg's lruvec (recipient) + * + * NOTE: currently has NO in-tree caller. Marie handles memcg offline by + * draining (lru_marie_offline_memcg -> marie_drain_one_lruvec), not by + * reparenting. 
This is the reparent counterpart, kept for a future + * memcg-offline path that hands child folios to @parent instead of + * draining them. + * + * Lock contract for a future caller: it MUST hold both lruvecs' lru_lock + * with IRQs disabled, AND must acquire the two lru_locks in a + * deterministic global order (e.g. by lruvec address) so two concurrent + * reparents cannot deadlock A-B/B-A. This function itself only takes + * @child_mlv's per-type locks via the marie_both_mlv guard; the merge is + * a per-memcg bitmap OR + atomic counter transfer, with no per-folio + * iteration. + * + * Safe to call when lru_marie_enabled() is false or @child has no mlv yet — + * both cases short-circuit cleanly. + */ +void lru_marie_reparent_lruvec(struct lruvec *child_lv, struct lruvec *parent_lv); + +/** + * lru_marie_add_folio - try to register @folio with Marie. + * @lruvec: the lruvec @folio is being added to + * @folio: the folio + * @reclaiming: caller hint, unused at this stage + * + * Returns true if Marie took ownership of the folio (and the caller must + * skip the legacy lruvec list_add) — false if Marie declined (gate off, + * lruvec state unavailable, or allocation failed) and the caller should + * fall through to the existing MGLRU / Legacy path. + */ +bool lru_marie_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming); + +/** + * lru_marie_orphan_add - pure legacy LRU add for an untracked orphan. + * @lruvec: the lruvec @folio is being added to + * @folio: an untracked (non-Marie) folio + * @tail: add to the list tail rather than the head + * + * A del+add move_fn (swap.c: lru_activate / lru_deactivate{,_file} / + * lru_lazyfree) and the legacy reclaim putback (vmscan.c: + * move_folios_to_lru) run lruvec_del_folio() -- legacy del, mz -nr for an + * untracked folio -- and then add the folio back. 
Routing that add through + * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT the orphan into + * Marie; the install credits Marie's own bucket but the original -nr was a + * legacy debit, so the legacy mz->lru_zone_size drifts and a later del + * underflows ("marie underflow-del"). This does the +nr legacy leg only, + * never adopting. Callers MUST first bail on lru_marie_test_tracked() + * folios -- a tracked folio is Marie-owned and must never touch a legacy + * list. + */ +void lru_marie_orphan_add(struct lruvec *lruvec, struct folio *folio, bool tail); + + +/** + * lru_marie_del_folio - try to remove @folio from Marie. + * @lruvec: the lruvec @folio is being removed from + * @folio: the folio + * @reclaiming: caller hint, unused at this stage + * + * Returns true iff @folio was tracked by Marie and has now been removed. + * Returns false if @folio was on the legacy LRU instead, in which case + * the caller continues with the legacy delete path. + */ +bool lru_marie_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming); + +/** + * lru_marie_release_folio - outer-level release for a TRACKED folio + * reaching refcount 0 (called from __page_cache_release). + * @folio: the folio being released + * @lruvecp: caller's lruvec batch pointer (may be NULL or hold a lock) + * @flagsp: caller's irqsave flags slot + * + * The dispatch contract: when Marie is enabled, upstream callers MUST + * gate by folio_marie_test_tracked() and call this helper for TRACKED + * folios INSTEAD OF the legacy folio_test_lru / lruvec_del_folio path. + * A TRACKED folio sits on a Marie self-loop, not on a legacy lruvec + * list, so legacy lruvec_del_folio's list_del would corrupt it; this + * helper unlinks the self-loop and debits mz instead. TRACKED is the + * single source of truth. + * + * Internally: relocks @lruvecp to @folio's lruvec with IRQs disabled, + * re-tests TRACKED under the lock, runs Marie's del (which leaves mz + * untouched), and clears PG_lru. 
If TRACKED was cleared between the + * caller's outer test and our lock acquisition (race with drain or + * evict), falls back to the legacy del so accounting stays coherent. + * Leaves the lock held in *@lruvecp for the caller's batch context. + */ +void lru_marie_release_folio(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp); + +/** + * lru_marie_split_folio - install a freshly-split tail folio under Marie. + * @lruvec: head folio's lruvec (caller holds lru_lock) + * @head: THP head folio currently tracked by Marie + * @new_folio: tail folio created by __split_huge_page + * + * Mirror of mm/huge_memory.c::lru_add_split_folio's + * "list_add_tail(&new_folio->lru, &folio->lru)" for the case where + * @head is Marie-tracked. Publishes the per-PFN state byte for the new + * folio so the dispatcher routes its eventual del through Marie; + * otherwise the new tail would be untracked, dispatcher del would fall + * to legacy update_lru_size, mlv->types[].nr_pages would not decrement, + * and reclaim heuristics would drift. + * + * Caller MUST verify lru_marie_enabled() && folio_marie_test_tracked(head) + * before calling. Caller holds @lruvec->lru_lock; the per-type lock is + * taken internally. + * + * Caller is responsible for folio_set_lru(@new_folio) AFTER this + * returns — the "state byte published before PG_lru" rule is preserved + * by the call ordering. + */ +void lru_marie_split_folio(struct lruvec *lruvec, struct folio *head, + struct folio *new_folio); + +/** + * lru_marie_look_around - opportunistic PMD scan during rmap reference check. + * @pvmw: page-vma-mapped walk supplied by folio_referenced_one() + * @nr: number of consecutive PTEs of the target folio at pvmw->address + * + * Called from rmap.c::folio_referenced_one() in the Marie branch with + * pvmw->ptl already held. 
Clears the target folio's young bit (returning + * its previous state) and, while the PTL is hot, scans up to + * MARIE_LOOK_AROUND_BATCH PTEs of the surrounding PMD, clearing young bits + * found there too. This batches what would otherwise be one rmap walk + * per neighbouring folio and improves the accuracy of subsequent + * folio_referenced() calls. Returns true iff the target's own PTE(s) + * were young. + */ +bool lru_marie_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr); + +/** + * lru_marie_age_node - kswapd pre-reclaim aging hook for Marie. + * @pgdat: kswapd's pgdat + * @sc: kswapd's scan_control + * + * Called from kswapd_age_node() when Marie owns the LRU. Drives the + * proactive PTE walker (marie_walk_pgdat internally) so per-PFN tier + * encoding has accurate hot/cold ordering by the time direct reclaim + * picks the oldest gen. Internally rate-limited; safe to call from any + * kswapd cycle. + */ +void lru_marie_age_node(struct pglist_data *pgdat, struct scan_control *sc); + +/** + * lru_marie_swappiness_changed - notify Marie that a swappiness value + * has been written via sysctl or memcg. + * + * Resets every per-lruvec swap_bias counter to zero so the next + * reclaim cycle starts from a neutral state under the new swappiness. + * Stale bias accumulated under the previous value would otherwise + * steer the first several picks in the wrong direction, especially + * across transitions into or out of the special-value range + * {0, 1, MAX_SWAPPINESS}. + * + * Walks every lruvec via mem_cgroup_iter unconditionally (no per-memcg + * filtering): the extra resets on unaffected lruvecs are harmless under + * a controller whose only state is the bias counter, and the + * alternative -- per memcg / per cgroup-version classification -- adds + * cost without changing observable behaviour. Sysctl writes are + * human-rate and the walk is O(N_lruvecs * one atomic64_set). 
+ * + * Safe to call from sysctl proc_handler context (BKL-free, may sleep + * on xa_lock contention but never under the writer's caller locks). + */ +void lru_marie_swappiness_changed(void); + +/* + * Runtime-tunable knobs exposed via /sys/kernel/mm/lru_marie/. + * Read with READ_ONCE; sysfs store writes with WRITE_ONCE. Hot-path + * snapshots are taken at the top of each loop iteration so a concurrent + * write only takes effect on the next pass. + */ +extern unsigned long marie_gen_growth_threshold; +extern unsigned long marie_walker_interval_critical; /* jiffies */ +extern unsigned long marie_walker_interval_low; /* jiffies */ +extern unsigned long marie_walker_interval_normal; /* jiffies */ +extern unsigned long marie_walker_interval_idle; /* jiffies */ + +#ifdef CONFIG_SWAP +/* + * kcompmari mode (sysfs /sys/kernel/mm/lru_marie/kcompmari): + * signed -100..+100, default +24. + * + * 0 — disabled (kthread fan-out off, swap_writeout inline) + * +1..+100 — Marie-gated. |value| is the queue depth at which the + * producer treats the kfifo as full and falls back to + * synchronous writeout. + * -1..-100 — force mode. Same queue-length semantics; runs even + * when Marie is off. + * + * Default +24 mirrors the queue length kcompressd-unofficial proved + * sound under sustained anon pressure. The producer reads vm_kcompmari + * directly to derive the queue depth; the on/off and Marie/force gates + * are encoded as a pair of static branches so the hot path in + * mm/page_io.c::kcompmari_store costs a single jump in the common case: + * + * kcompmari_enabled_key — true when vm_kcompmari != 0 (default TRUE) + * kcompmari_force_key — true when vm_kcompmari < 0 (default FALSE) + * + * The Marie-gated branch (positive value) reuses lru_marie_enabled_key + * directly, so no extra branch is paid when Marie is on. 
+ */ +DECLARE_STATIC_KEY_TRUE(kcompmari_enabled_key); +DECLARE_STATIC_KEY_FALSE(kcompmari_force_key); + +extern int vm_kcompmari; + +/** + * kcompmari_active - should kswapd off-load this swap-out to kcompmari? + * + * Default-on: the enabled_key starts TRUE for the +24 default. Setting + * vm_kcompmari to 0 flips it off; negative values force-on regardless + * of Marie; positive values gate on lru_marie_enabled_key. + */ +static inline bool kcompmari_active(void) +{ + if (!static_branch_likely(&kcompmari_enabled_key)) + return false; + if (static_branch_unlikely(&kcompmari_force_key)) + return true; + return lru_marie_enabled(); +} +#endif /* CONFIG_SWAP */ + +/* + * Marie's per-folio state lives entirely in the per-PFN byte + * marie_state[pfn] (declared in mm/lru_marie/state.h). Public callers + * reach Marie state via the dispatch surface above + * (lru_marie_add_folio / lru_marie_del_folio / lru_marie_shrink_lruvec / + * lru_marie_exit_memcg / lru_marie_look_around / lru_marie_age_node). + * folio->flags carries no Marie bits. + */ + +#endif /* CONFIG_LRU_MARIE */ + +/* + * CONFIG_LRU_MARIE=n: this header intentionally exposes NO inline + * shims. Every call site in mm/ is wrapped in #ifdef CONFIG_LRU_MARIE, + * so when Marie is off the kernel image contains no Marie symbols and + * no Marie calls at all. Refusing to provide no-ops here makes any + * stray, un-gated reference fail to compile loudly rather than silently + * disappearing into a return-false stub. 
+ */ + +#endif /* _LINUX_LRU_MARIE_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a171070e15..06b86e03a2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,22 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + /* + * Marie's reclaim isolate path (marie_evict_counters_only) and + * deferred post-reclaim teardown (marie_state_drop_pfn_after_reclaim + * via marie_state_shrink_lruvec) intentionally run this without + * lru_lock: install/evict serialise via marie_state[pfn]'s TRACKED + * bit and folio_test_clear_lru, and the per-CPU vmstat helpers + * called below are preempt-off-safe on their own. Skip the lockdep + * assertion in that mode. Legacy / MGLRU paths still get full + * coverage when lru_marie_enabled() is false. + */ +#ifdef CONFIG_LRU_MARIE + if (!lru_marie_enabled()) + lockdep_assert_held(&lruvec->lru_lock); +#else lockdep_assert_held(&lruvec->lru_lock); +#endif WARN_ON_ONCE(nr_pages != (int)nr_pages); mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); @@ -104,14 +120,14 @@ static inline bool lru_gen_switching(void) return static_branch_unlikely(&lru_switch); } #ifdef CONFIG_LRU_GEN_ENABLED -static inline bool lru_gen_enabled(void) +static inline bool lru_gen_core_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else -static inline bool lru_gen_enabled(void) +static inline bool lru_gen_core_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); @@ -119,11 +135,58 @@ static inline bool lru_gen_enabled(void) } #endif +/* + * lru_gen_core_enabled() reports the raw MGLRU core static key. 
Almost no + * caller wants that directly -- they want lru_gen_enabled() below, which is + * additionally masked off whenever Marie is the active LRU manager. + * + * Marie and MGLRU are mutually exclusive at runtime. When Marie owns aging, + * every MGLRU code path must be inert: folio_mark_accessed() -> + * lru_gen_inc_refs(), the reclaim/aging dispatch, workingset refault, the + * rmap look-around, and so on. Reporting MGLRU as disabled here makes "both + * managers touch the same folio" structurally unrepresentable for every + * lru_gen_enabled() reader, so an MGLRU writer that forgets a + * !lru_marie_enabled() guard can no longer stamp LRU_GEN / LRU_REFS state + * onto a Marie-owned folio -- residue that would otherwise leak into + * PAGE_FLAGS_CHECK_AT_FREE (LRU_GEN_MASK) at the buddy handoff. + * + * The Marie<->MGLRU ownership handoff (mm/lru_marie enable/disable + * transition) must still observe the real key; it calls + * lru_gen_core_enabled() directly. + */ +static inline bool lru_gen_enabled(void) +{ +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + return false; +#endif + return lru_gen_core_enabled(); +} + static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } +/* + * Move lruvec contents between legacy lruvec->lists[lru] and + * lrugen->folios[gen][type][zone] using MGLRU's canonical add/del + * helpers. Exported for external LRU drivers (mm/lru_marie) that + * need to keep MGLRU's state_is_valid invariant intact across their + * own enable/disable transitions. + * + * lru_gen_fill_lruvec -- legacy lists -> lrugen (MGLRU's normal + * enable-time fill, made callable) + * lru_gen_drain_lruvec -- lrugen -> lruvec_add_folio path + * (when another driver's gate is on the + * folios route into that driver directly) + * + * Caller holds lruvec->lru_lock irqsave; helper releases+reacquires + * across cond_resched. 
+ */ +void lru_gen_fill_lruvec(struct lruvec *lruvec); +void lru_gen_drain_lruvec(struct lruvec *lruvec); + static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -312,6 +375,11 @@ static inline void folio_migrate_refs(struct folio *new, const struct folio *old } #else /* !CONFIG_LRU_GEN */ +static inline bool lru_gen_core_enabled(void) +{ + return false; +} + static inline bool lru_gen_enabled(void) { return false; @@ -322,6 +390,9 @@ static inline bool lru_gen_switching(void) return false; } +static inline void lru_gen_fill_lruvec(struct lruvec *lruvec) { } +static inline void lru_gen_drain_lruvec(struct lruvec *lruvec) { } + static inline bool lru_gen_in_fault(void) { return false; @@ -350,8 +421,23 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_add_folio(lruvec, folio, false)) + return; + + /* + * If Marie is enabled, lru_marie_add_folio failed only due to allocation + * exhaustion (slab + mempool reserve both depleted). Skip MGLRU + * and fall directly to the legacy LRU lists: shrink_lruvec runs + * legacy reclaim alongside Marie specifically to drain these + * orphans, but MGLRU is bypassed entirely when lru_marie_enabled(). + */ + if (!lru_marie_enabled() && lru_gen_add_folio(lruvec, folio, false)) + return; +#else if (lru_gen_add_folio(lruvec, folio, false)) return; +#endif update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); @@ -366,8 +452,17 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_add_folio(lruvec, folio, true)) + return; + + /* See lruvec_add_folio() — Marie alloc failure falls to legacy, not MGLRU. 
*/ + if (!lru_marie_enabled() && lru_gen_add_folio(lruvec, folio, true)) + return; +#else if (lru_gen_add_folio(lruvec, folio, true)) return; +#endif update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); @@ -382,6 +477,11 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_del_folio(lruvec, folio, false)) + return; +#endif + if (lru_gen_del_folio(lruvec, folio, false)) return; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9adb2ad21d..f2c4a1cd80 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -24,6 +24,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -783,6 +784,21 @@ struct lruvec { struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; +#ifdef CONFIG_LRU_MARIE + /* + * Pointer to this lruvec's struct marie_lruvec — the single source + * of truth (no side xarray). Lazily allocated on first use by + * marie_get_lruvec() via cmpxchg, and cleared then freed by + * marie_drop_lruvec() under lv->lru_lock when the lruvec's memcg is + * torn down. Fault-path dispatchers (lru_marie_add_folio, + * lru_marie_del_folio, etc.) read it with a single READ_ONCE. + * + * void * because struct marie_lruvec is internal to + * mm/lru_marie/state.h. Its lifetime is tied to the lruvec/memcg, so + * any caller holding a valid lruvec observes a live mlv without RCU. + */ + void *marie_mlv; +#endif }; /* Isolate for asynchronous migration */ @@ -1529,6 +1545,20 @@ typedef struct pglist_data { atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) +/* + * kfifo backing storage capacity (in folio* slots). The sysfs knob + * vm_kcompmari sets the effective queue length in [-100, +100]; this + * matches the upper bound. 
~800 bytes per pgdat regardless of the + * currently configured depth. + */ +#define KCOMPMARI_FIFO_SIZE 100 + wait_queue_head_t kcompmari_wait; + struct task_struct *kcompmari; + struct kfifo kcompmari_fifo; + spinlock_t kcompmari_fifo_lock; +#endif + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977..cb9d9acecb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -421,6 +421,24 @@ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; +#ifdef CONFIG_LRU_MARIE +/* + * linux/mm/page_io.c: monotonic counter incremented on every failed swap-out + * bio completion (bio->bi_status != 0). The early-OOM gate in + * mm/page_alloc.c:should_reclaim_retry consults the delta from + * alloc_context.initial_swap_write_failed to detect "swap backend has free + * entries but cannot actually write" — primarily ZRAM/zswap zs_malloc + * failures under combined RAM + swap pressure, but also disk swap I/O + * errors. Sustained delta > MAX_SWAP_WRITE_FAIL_RETRIES skips the standard + * MAX_RECLAIM_RETRIES wait and triggers OOM directly. + * + * Marie-only: only the Marie-gated path in should_reclaim_retry consumes + * this counter, so it is omitted entirely under CONFIG_LRU_MARIE=n to + * keep vanilla MGLRU/Legacy builds byte-identical. + */ +extern atomic_long_t nr_swap_write_failed; +#endif + /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { diff --git a/mm/Kconfig b/mm/Kconfig index e8bf1e9e6a..2914498cad 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1396,6 +1396,36 @@ config LRU_GEN_WALKS_MMU depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG # } +# Marie LRU { +config LRU_MARIE + bool "Marie LRU" + def_bool y + depends on MMU + help + Marie LRU represents each folio's reclaim state as a single + byte in a flat per-PFN array allocated once at boot. 
install, + delete, and aging are all single byte writes — there is no + allocation in any fault-path Marie operation, and no linked + lists for the reclaim path to chase. isolate batches the array + with SIMD plus a persistent cursor, so a 32-folio batch + typically costs a few hundred PFNs of sequential read. + + The PTE young-bit walker is SIMD-accelerated and auto-dispatched + at boot (AVX-512F > AVX2 > SSE2 on x86; scalar elsewhere); a + bloom-filter forward feedback from rmap keeps walker cost on + hot PMDs. + + Memory cost: ~1 byte per RAM PFN (4 MiB on 16 GiB, 16 MiB on + 64 GiB, capped at 4 GiB by the 32-bit PFN limit Marie requires). + + Enabled by default; disable at boot with lru_marie=0 on the + kernel cmdline, or at runtime via /sys/kernel/mm/lru_marie/enabled. + When disabled, vmscan falls through to the in-tree LRU paths + unchanged. + + Say Y unless you understand what this is. +# } + config ARCH_SUPPORTS_PER_VMA_LOCK def_bool n diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab0824..a2162965c3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -75,6 +75,7 @@ ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif +obj-$(CONFIG_LRU_MARIE) += lru_marie/ obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 970e077019..9b6315ceef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -3562,10 +3563,31 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!folio_test_lru(folio)); - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio)) { new_folio->mlock_count = 0; - else + } else { +#ifdef CONFIG_LRU_MARIE + /* + * If Marie owns @folio (the head), the legacy + * list_add_tail below would put the new tail on 
the + * legacy LRU without a TRACKED state byte, leaving it + * invisible to Marie's per-mlv bookkeeping + * (mlv->types[].nr_pages, marie_nr_folios). Route + * through Marie's split helper which sets TRACKED, + * publishes the per-PFN state at the same gen as + * @folio, and increments the folio counter. The + * helper falls back to plain list_add_tail when + * @folio is not Marie-tracked, so the static branch + * is the only gate the !lru_marie_enabled() case sees. + */ + if (lru_marie_enabled()) + lru_marie_split_folio(lruvec, folio, new_folio); + else + list_add_tail(&new_folio->lru, &folio->lru); +#else list_add_tail(&new_folio->lru, &folio->lru); +#endif + } folio_set_lru(new_folio); } } diff --git a/mm/internal.h b/mm/internal.h index 5a2ddcf68e..9307a24661 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -625,12 +625,55 @@ extern unsigned long highest_memmap_pfn; */ #define MAX_RECLAIM_RETRIES 16 +#ifdef CONFIG_LRU_MARIE +/* + * Maximum number of swap-write failures (incremented by mm/page_io.c + * __end_swap_bio_write on bio->bi_status != 0) tolerated within a single + * __alloc_pages_slowpath attempt before the early-OOM gate gives up. Lets + * a handful of transient failures (concurrent ZRAM ops, brief retry + * windows) recover, but trips OOM well before MAX_RECLAIM_RETRIES on + * sustained backend rejection. Marie-only; omitted under + * CONFIG_LRU_MARIE=n. + */ +#define MAX_SWAP_WRITE_FAIL_RETRIES 16 + +#endif + /* * in mm/vmscan.c: + * + * struct scan_control is private to vmscan.c. Out-of-tree LRU + * experiments (mm/lru_marie) read/update individual fields via the + * sc_* accessors declared below; the struct itself is opaque to + * everything outside vmscan.c. 
*/ +struct scan_control; + bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); +struct reclaim_stat; +unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +int vmscan_reclaimer_offset(struct scan_control *sc); +bool vmscan_can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, + struct scan_control *sc); + +/* + * scan_control accessors -- read/update the few fields out-of-tree + * readers need without exposing the struct layout. All defined in + * vmscan.c next to the struct definition; trivial enough that the + * compiler routinely inlines the body across LTO. Non-LTO builds + * pay one extra call per use, which lands only on cold paths + * (entry of marie_state_shrink_lruvec and the inner tier loop). + */ +int sc_priority(const struct scan_control *sc); +int sc_reclaim_idx(const struct scan_control *sc); +bool sc_reclaim_target_reached(const struct scan_control *sc); +void sc_add_reclaimed(struct scan_control *sc, unsigned long nr); +bool sc_cgroup_reclaim(const struct scan_control *sc); int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat); @@ -693,6 +736,18 @@ struct alloc_context { */ enum zone_type highest_zoneidx; bool spread_dirty_pages; + +#ifdef CONFIG_LRU_MARIE + /* + * Snapshot of nr_swap_write_failed at the entry to + * __alloc_pages_slowpath. should_reclaim_retry takes the delta to + * decide whether the swap backend has rejected enough writes during + * THIS allocation attempt to skip the rest of the reclaim retry + * budget and OOM directly. See include/linux/swap.h for the + * counter's contract. Marie-only; omitted under CONFIG_LRU_MARIE=n. 
+ */ + long initial_swap_write_failed; +#endif }; /* diff --git a/mm/lru_marie/Makefile b/mm/lru_marie/Makefile new file mode 100644 index 0000000000..c1e9c9becf --- /dev/null +++ b/mm/lru_marie/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Marie LRU. Reached from mm/Makefile via obj-$(CONFIG_LRU_MARIE) += lru_marie/. +# Public dispatch surface for the rest of the kernel is in +# include/linux/lru_marie.h; everything below is Marie-private. + +obj-y += bitmap.o +obj-y += core.o +obj-y += state.o +obj-y += walker.o + +ifdef CONFIG_X86 +obj-y += simd_x86.o +obj-y += simd_x86_sse2.o +obj-y += simd_x86_avx2.o +obj-y += simd_x86_avx512.o +else +obj-y += simd_generic.o +endif diff --git a/mm/lru_marie/account.h b/mm/lru_marie/account.h new file mode 100644 index 0000000000..ef1b7b7695 --- /dev/null +++ b/mm/lru_marie/account.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_ACCOUNT_H +#define _MM_LRU_MARIE_ACCOUNT_H + +#include +#include +#include + +#include "state.h" + +/* + * Marie's four counter+vmstat updates that move together on every + * install/evict. Until Step 3 of the abstraction plan these were + * hand-written at five sites (install, evict_locked, evict_counters_only, + * survivor install, survivor evict-on-free). The drift hazard that + * cost us 9c6a93782 was: each site picked its own IRQ-state discipline + * because no helper enforced it. + * + * marie_pc_add(&mlv->types[type].nr_pages, +-nr) + * marie_pc_add(&marie_nr_folios, +-1 ) + * marie_update_lru_size(lv, lru, zone, +-nr) + * marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], +-nr) + * + * marie_update_lru_size credits/debits the per-memcg mz->lru_zone_size + * (unified with legacy update_lru_size), so a Marie folio is counted in + * mz from install to evict exactly like a legacy/MGLRU folio. + * marie_lru_zone_size is Marie's own per-bucket tally, kept only for the + * reparent transfer; it no longer feeds lruvec_lru_size(). 
+ * + * Two contexts: + * + * LOCKED - caller holds lv->lru_lock with IRQs off. Used by the + * install / evict_locked / del_folio_locked / + * fill / drain hot paths. Helpers assert both held + * conditions via lockdep. mlv is non-NULL. + * + * ISOLATE - caller holds NOTHING (no lru_lock, IRQs on). Used by + * the reclaim isolate path and the survivor putback. + * Helpers own local_irq_save/restore so the marie_pc_add + * fast path and __mod_zone_page_state inside + * marie_update_lru_size are safe against same-CPU + * softirq reentrancy (the very property 9c6a93782 + * introduced). mlv may be NULL: under reclaim a brand- + * new lruvec may fail GFP_ATOMIC at marie_get_lruvec, and + * teardown can race with the isolate. In that case only + * the global counters move; the per-mlv counters are + * gone with the missing carrier (no leak -- mirrors the + * pre-helper code in marie_evict_counters_only). + * + * The helpers do NOT touch folio flags, the per-PFN state byte, the + * scan bitmap, or memcg L1. Those belong to pfn_install.h + * (marie_pfn_publish_inherit) and to marie_state_publish_at_gen / + * marie_state_drop_pfn. Each layer keeps its own invariant. + */ + +/* + * Drain any deferred legacy-mz delta the lock-free isolate paths accumulated + * for (@lru, @zone) into mz->lru_zone_size. MUST be called with @mlv's + * lru_lock held (the locked funnels below do), so the non-atomic mz RMW inside + * marie_update_lru_size is serialised. atomic_long_xchg claims the pending + * delta in one shot against concurrent lock-free accumulators. + */ +static inline void marie_mz_drain_locked(struct marie_lruvec *mlv, + enum lru_list lru, int zone) +{ +#ifdef CONFIG_MEMCG + long d = atomic_long_xchg(&mlv->mz_pending[lru][zone], 0); + + /* + * Apply ONLY the deferred mz->lru_zone_size delta -- the vmstat halves + * (NR_LRU_BASE / NR_ZONE_LRU_BASE) were updated immediately and lock-free + * in the isolate path (they are per-CPU safe). 
Re-running the full + * marie_update_lru_size here would double-count vmstat. + */ + if (d) + mem_cgroup_update_lru_size(mlv->lruvec, lru, zone, d); +#endif +} + +static inline void marie_account_install(struct marie_lruvec *mlv, + struct folio *f, + enum lru_list lru, int zone) +{ + int type = folio_is_file_lru(f); + long nr = folio_nr_pages(f); + + lockdep_assert_held(&mlv->lruvec->lru_lock); + lockdep_assert_irqs_disabled(); + + marie_pc_add(&mlv->types[type].nr_pages, nr); + marie_pc_add(&marie_nr_folios, 1); + /* Fold in any lock-free isolate deltas first, then our own, under lock. */ + marie_mz_drain_locked(mlv, lru, zone); + marie_update_lru_size(mlv->lruvec, lru, zone, nr); + marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], nr); +} + +static inline void marie_account_evict(struct marie_lruvec *mlv, + struct folio *f, + enum lru_list lru, int zone) +{ + int type = folio_is_file_lru(f); + long nr = folio_nr_pages(f); + + lockdep_assert_held(&mlv->lruvec->lru_lock); + lockdep_assert_irqs_disabled(); + + marie_pc_add(&mlv->types[type].nr_pages, -nr); + marie_pc_add(&marie_nr_folios, -1); + marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], -nr); + /* Fold in any lock-free isolate deltas first, then our own, under lock. */ + marie_mz_drain_locked(mlv, lru, zone); + marie_update_lru_size(mlv->lruvec, lru, zone, -nr); +} + +static inline void marie_account_install_isolate(struct lruvec *lv, + struct marie_lruvec *mlv, + struct folio *f, + enum lru_list lru, int zone) +{ + int type = folio_is_file_lru(f); + long nr = folio_nr_pages(f); + unsigned long flags; + + WARN_ON_ONCE(irqs_disabled()); + + local_irq_save(flags); + marie_pc_add(&marie_nr_folios, 1); + /* vmstat halves are per-CPU safe lock-free; do them now. 
*/ + __update_lru_size(lv, lru, zone, nr); + if (likely(mlv)) { + marie_pc_add(&mlv->types[type].nr_pages, nr); + marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], nr); +#ifdef CONFIG_MEMCG + /* + * No lru_lock here -- DEFER only the non-atomic mz->lru_zone_size + * RMW (the race that drifts mz negative) instead of doing it + * lock-free. The next LOCKED op on this bucket drains it. If mlv + * is NULL (offlining / GFP_ATOMIC fail) there is nowhere to defer + * and no shadow either, so mz is left untouched -- mz and shadow + * stay paired and the dying memcg is reparented. + */ + atomic_long_add(nr, &mlv->mz_pending[lru][zone]); +#endif + } + local_irq_restore(flags); +} + +static inline void marie_account_evict_isolate(struct lruvec *lv, + struct marie_lruvec *mlv, + struct folio *f, + enum lru_list lru, int zone) +{ + int type = folio_is_file_lru(f); + long nr = folio_nr_pages(f); + unsigned long flags; + + WARN_ON_ONCE(irqs_disabled()); + + local_irq_save(flags); + marie_pc_add(&marie_nr_folios, -1); + /* vmstat halves are per-CPU safe lock-free; do them now. */ + __update_lru_size(lv, lru, zone, -nr); + if (likely(mlv)) { + marie_pc_add(&mlv->types[type].nr_pages, -nr); + marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], -nr); +#ifdef CONFIG_MEMCG + /* DEFER only the lock-free mz RMW; drained under lru_lock. See + * marie_account_install_isolate for the mlv==NULL rationale. */ + atomic_long_add(-nr, &mlv->mz_pending[lru][zone]); +#endif + } + local_irq_restore(flags); +} + +#endif /* _MM_LRU_MARIE_ACCOUNT_H */ diff --git a/mm/lru_marie/bitmap.c b/mm/lru_marie/bitmap.c new file mode 100644 index 0000000000..da8f595a6b --- /dev/null +++ b/mm/lru_marie/bitmap.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hierarchical PFN bitmap operations. See bitmap.h for the design + * overview. Used by both the global per-(type, gen, tier) plane and + * the per-memcg plane. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitmap.h" +#include "state.h" /* max_pfn, marie_l2_shift, marie_pfn_to_l2_bit */ + +/* + * 512 cacheline-aligned spinlocks, one per L2 bit. Each lock makes + * one concurrent scanner the exclusive owner of the PFN range + * covered by that L2 bit -- collisions never produce wasted + * candidate scan work, only a single try_lock failure that costs + * one atomic op. ____cacheline_aligned_in_smp prevents false sharing + * between adjacent locks while keeping UP-build footprint flat. + * + * 32 KiB total on SMP (64 B x 512). Shared across every marie_bitmap + * instance: the lock is over the PFN address space, not per-bitmap. + * Two scanners walking different (type, gen, tier) bitmaps in the + * same L2 range still serialise via the same lock, avoiding wasted + * parallel L1 fetches of the same physical cachelines. + * + * Trylock / unlock are static inline in bitmap.h; this file only + * holds the storage and the boot-time init. + */ +struct marie_bm_range_lock marie_bm_range_locks[MARIE_L2_BITS]; + +void marie_bm_range_locks_init(void) +{ + int i; + + for (i = 0; i < MARIE_L2_BITS; i++) + spin_lock_init(&marie_bm_range_locks[i].lock); +} + +int marie_bm_init(struct marie_bitmap *bm) +{ + unsigned long bytes; + + if (!max_pfn) + return 0; + bytes = BITS_TO_LONGS(max_pfn) * sizeof(unsigned long); + bm->l1 = kvmalloc(bytes, GFP_KERNEL | __GFP_ZERO); + if (!bm->l1) + return -ENOMEM; + return 0; +} + +void marie_bm_free(struct marie_bitmap *bm) +{ + if (!bm) + return; + kvfree(bm->l1); + bm->l1 = NULL; +} + +/* + * marie_bm_set / marie_bm_clear / marie_bm_test are static inline in + * bitmap.h -- they sit on the install / del / promote hot path and + * out-of-lining costs measurable cycles per fault. + */ + +/* + * Half-open [start_word, end_word) covering one L2 bit's worth of L1 words. + * Clipped to the actual l1 storage extent.
+ */ +static void marie_bm_l1_word_range(unsigned int l2bit, + unsigned long *start_word, + unsigned long *end_word) +{ + unsigned long pfns_per_l2 = 1UL << marie_l2_shift; + unsigned long start_pfn = (unsigned long)l2bit << marie_l2_shift; + unsigned long end_pfn = start_pfn + pfns_per_l2; + unsigned long max_words = BITS_TO_LONGS(max_pfn); + + *start_word = start_pfn / BITS_PER_LONG; + *end_word = DIV_ROUND_UP(end_pfn, BITS_PER_LONG); + if (*end_word > max_words) + *end_word = max_words; +} + +void marie_bm_drop_l2_range(struct marie_bitmap *bm, unsigned int l2bit) +{ + unsigned long start_word, end_word, wi; + + if (!bm->l1) + return; + marie_bm_l1_word_range(l2bit, &start_word, &end_word); + for (wi = start_word; wi < end_word; wi++) + bm->l1[wi] = 0; + atomic_set(&bm->l2_count[l2bit], 0); + clear_bit(l2bit, bm->l2); +} + +void marie_bm_reset(struct marie_bitmap *bm) +{ + int i; + + if (!bm->l1) + return; + if (max_pfn) + bitmap_zero(bm->l1, max_pfn); + bitmap_zero(bm->l2, MARIE_L2_BITS); + for (i = 0; i < MARIE_L2_BITS; i++) + atomic_set(&bm->l2_count[i], 0); +} + +void marie_bm_merge(struct marie_bitmap *dst, struct marie_bitmap *src) +{ + const int l2_words = BITS_TO_LONGS(MARIE_L2_BITS); + int lw; + + if (!src || !src->l1) + return; + + for (lw = 0; lw < l2_words; lw++) { + unsigned long w = src->l2[lw]; + + while (w) { + unsigned int b = __ffs(w); + unsigned int l2bit = lw * BITS_PER_LONG + b; + unsigned long start_word, end_word, wi; + int child_count, parent_new; + + w &= w - 1; + + marie_bm_l1_word_range(l2bit, &start_word, &end_word); + + if (dst && dst->l1) { + for (wi = start_word; wi < end_word; wi++) { + unsigned long cw = src->l1[wi]; + + if (!cw) + continue; + dst->l1[wi] |= cw; + src->l1[wi] = 0; + } + child_count = atomic_xchg(&src->l2_count[l2bit], 0); + if (child_count <= 0) + continue; + parent_new = atomic_add_return(child_count, + &dst->l2_count[l2bit]); + if (parent_new == child_count) + set_bit(l2bit, dst->l2); + } else { + for (wi = 
start_word; wi < end_word; wi++) + src->l1[wi] = 0; + atomic_set(&src->l2_count[l2bit], 0); + } + } + src->l2[lw] = 0; + } +} diff --git a/mm/lru_marie/bitmap.h b/mm/lru_marie/bitmap.h new file mode 100644 index 0000000000..1439771849 --- /dev/null +++ b/mm/lru_marie/bitmap.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hierarchical PFN bitmap shared by Marie's global per-(type, gen, tier) + * planes and per-memcg plane. + * + * Two layers held by one struct: + * L1: per-PFN bit, sized BITS_TO_LONGS(max_pfn). set_bit()/clear_bit() + * (atomic). One word covers 64 PFNs. + * + * L2: 512-bit summary, each bit covers (max_pfn / 512) PFNs (one + * "L2 range", typically 32 MiB on an 8 GiB system). A companion + * per-cell atomic_t refcount tracks how many L1 bits are set in + * that range. The L2 bit transitions on the 0 <-> 1 counter + * boundary, performed inside the same atomic_*_return path, + * so concurrent set/clear cannot desynchronise the bit from + * the counter. + * + * Two consumers: + * - Global plane: one struct per (type, gen, tier), 16 instances + * total (marie_track_bm[type][gen][tier]). + * - Memcg plane: one struct per non-root memcg, wrapped by + * struct marie_memcg_bm. + * + * The same struct + the same set/clear/merge/iter operations are + * used by both consumers. No internal lock; producers serialise via + * the existing Marie lock hierarchy (lru_lock on the install/del + * side, reparent_locks on the merge side, marie_bm_range_locks[bit] + * trylock on the scanner side). + */ +#ifndef _MM_LRU_MARIE_BITMAP_H +#define _MM_LRU_MARIE_BITMAP_H + +#include +#include +#include +#include +#include /* max_pfn */ +#include +#include + +/* + * MARIE_L2_BITS sizes the L2 summary plane. Placed here so the + * struct can lay out its inline arrays without pulling in state.h.
+ */ +#define MARIE_L2_BITS 512 + +/* + * PFN -> L2 bit shift, set at marie_state_init time so that + * (1 << marie_l2_shift) PFNs map to one L2 bit and 512 L2 bits cover + * the full max_pfn range. shift = ceil(log2(max_pfn / 512)). + */ +extern unsigned int marie_l2_shift; + +static inline unsigned int marie_pfn_to_l2_bit(unsigned long pfn) +{ + unsigned int b = pfn >> marie_l2_shift; + + return b < MARIE_L2_BITS ? b : MARIE_L2_BITS - 1; +} + +static inline unsigned long marie_l2_bit_pfn_start(unsigned int bit) +{ + return (unsigned long)bit << marie_l2_shift; +} + +static inline unsigned long marie_l2_bit_pfn_end(unsigned int bit) +{ + return ((unsigned long)bit + 1) << marie_l2_shift; +} + +struct marie_bitmap { + unsigned long *l1; /* BITS_TO_LONGS(max_pfn) words */ + unsigned long l2[BITS_TO_LONGS(MARIE_L2_BITS)]; /* 64 B inline */ + atomic_t l2_count[MARIE_L2_BITS]; /* 2 KiB inline */ +}; + +/* + * marie_bm_init - allocate @bm->l1 sized for the system max_pfn. + * @bm->l2 and @bm->l2_count are zero-initialised by the caller (the + * struct itself is typically zero-allocated). Returns 0 on success, + * -ENOMEM on allocation failure. + */ +int marie_bm_init(struct marie_bitmap *bm); + +/* marie_bm_free - release @bm->l1 (no-op when never initialised). */ +void marie_bm_free(struct marie_bitmap *bm); + +/* + * marie_bm_set - mark @pfn tracked. + * + * Atomically sets the L1 bit at @pfn. The per-cell refcount is then + * atomic_inc_return'd; on the 0 -> 1 transition the L2 summary bit + * for @pfn's range is set. Idempotent w.r.t. already-set L1 bit if + * the same PFN is set twice (cell_count overcounts; balanced by a + * matching number of clears). + * + * static inline because this is a hot-path operation invoked at + * every install / promote / move; out-of-lining would add a function + * call + bound-check overhead per call. 
+ */ +static inline void marie_bm_set(struct marie_bitmap *bm, unsigned long pfn) +{ + unsigned int l2bit; + + if (!bm->l1 || pfn >= max_pfn) + return; + set_bit(pfn, bm->l1); + l2bit = marie_pfn_to_l2_bit(pfn); + if (atomic_inc_return(&bm->l2_count[l2bit]) == 1) + set_bit(l2bit, bm->l2); +} + +/* + * marie_bm_clear - mark @pfn untracked. + * + * Atomically clears the L1 bit at @pfn. The per-cell refcount is + * atomic_dec_return'd; on the 1 -> 0 transition the L2 summary bit + * for @pfn's range is cleared. + */ +static inline void marie_bm_clear(struct marie_bitmap *bm, unsigned long pfn) +{ + unsigned int l2bit; + + if (!bm->l1 || pfn >= max_pfn) + return; + clear_bit(pfn, bm->l1); + l2bit = marie_pfn_to_l2_bit(pfn); + if (atomic_dec_return(&bm->l2_count[l2bit]) == 0) + clear_bit(l2bit, bm->l2); +} + +/* + * marie_bm_test - is @pfn tracked? Lock-free single-word read. + * Returns false when @bm->l1 is unallocated. + */ +static inline bool marie_bm_test(const struct marie_bitmap *bm, + unsigned long pfn) +{ + if (!bm->l1 || pfn >= max_pfn) + return false; + return test_bit(pfn, bm->l1); +} + +/* + * marie_bm_drop_l2_range - bulk-clear all L1 / L2 / counter state + * for the L2 range identified by @l2bit. Used when recycling one + * range of a bitmap (precise, touches the L1 words covered by the + * range as well). + * + * Caller must guarantee no concurrent set/clear on @bm for the + * affected PFN range (try_advance_head fences via head_gen cmpxchg). + */ +void marie_bm_drop_l2_range(struct marie_bitmap *bm, unsigned int l2bit); + +/* + * marie_bm_reset - reset @bm to fully empty: L1 cleared, L2 cleared, + * all l2_count cells zeroed. + * + * L1 must be cleared too: leaving stale L1 bits and resetting only + * L2 + l2_count would let a subsequent marie_bm_set(@pfn) on a + * different PFN in a stale-set L1 word leave that stale bit visible + * to the scanner (which now sees the just-set L2 bit and enters the + * range). 
Worse, a later marie_bm_clear() on the stale PFN would + * dec the l2_count below zero, corrupting the refcount invariant. + * + * Used by try_advance_head when recycling a (type, gen, tier) slot + * for the next ring cycle. Caller must fence subsequent installs + * (head_gen cmpxchg in try_advance_head's case) so no install can + * target @bm until the reset is visible. + */ +void marie_bm_reset(struct marie_bitmap *bm); + +/* + * marie_bm_merge - L2-pruned word-wise OR of @src into @dst. + * + * Walks @src->l2 sparsely (only set bits via __ffs unset-loop). For + * each populated L2 range, OR's the matching L1 word range into + * @dst and atomic_xchg's @src's l2_count contribution into @dst's + * (set_bit on @dst's L2 fires at the 0 -> N transition). The + * processed words / counters in @src are zeroed. + * + * @dst == NULL: @src is simply drained (l1 cleared, l2_count zeroed, + * l2 cleared) -- used by reparent when no parent bitmap exists. + * + * Cost scales with @src's populated L2 range count, not max_pfn. + * + * Caller must serialise against concurrent set/clear on either + * bitmap (reparent_locks on the memcg offline path is sufficient). + */ +void marie_bm_merge(struct marie_bitmap *dst, struct marie_bitmap *src); + +/* + * L2 range coordination locks: 512 spinlocks (one per L2 bit, ~32 KiB + * total), used by scanners to claim exclusive ownership of a PFN + * range for the duration of their L1 walk in that range. + * + * Shared by ALL marie_bitmap instances: the lock is over the PFN + * address space, not the bitmap instance. Two scanners walking + * different (type, gen, tier) bitmaps in the same L2 range still + * serialise via the same lock, avoiding wasted parallel L1 fetches + * of the same physical cachelines. 
+ * + * The storage is exposed (rather than wrapped in opaque accessors) + * so the per-bit trylock / unlock can be static inline in this + * header -- scanners take them once per processed L2 bit, which is + * hot enough that a function-call wrapper costs measurable cycles. + * Callers must guarantee @l2bit < MARIE_L2_BITS (always true for + * indices produced by __ffs on an L2 word). + */ +struct marie_bm_range_lock { + spinlock_t lock; +} ____cacheline_aligned_in_smp; + +extern struct marie_bm_range_lock marie_bm_range_locks[MARIE_L2_BITS]; + +void marie_bm_range_locks_init(void); + +static inline bool marie_bm_range_trylock(unsigned int l2bit) +{ + return spin_trylock(&marie_bm_range_locks[l2bit].lock); +} + +static inline void marie_bm_range_unlock(unsigned int l2bit) +{ + spin_unlock(&marie_bm_range_locks[l2bit].lock); +} + +#endif /* _MM_LRU_MARIE_BITMAP_H */ diff --git a/mm/lru_marie/core.c b/mm/lru_marie/core.c new file mode 100644 index 0000000000..f376a63990 --- /dev/null +++ b/mm/lru_marie/core.c @@ -0,0 +1,1982 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lru_marie/core.c — Marie LRU. 
+ * + * Multi-graded Adaptive Reclaim & Independent Eviction (Marie) + * + * Architecture in one paragraph: + * - per-PFN state byte (marie_state[pfn]) as the single source of + * truth for every folio's (TRACKED, type, zone, gen, tier) tuple; + * install / del / aging are single byte writes with no allocation + * in any fault-path operation + * - per-lruvec mlv carrying per-type locks and per-(lru, zone) + * percpu counters; per-type counter writes hit the local CPU diff + * so install/del do not bounce a shared cache line + * - per-pgdat walker driven from kswapd, with rmap-fed bloom + * feedback so PMD scans concentrate on hot regions + * - cycling per-type gen ring (MARIE_PFN_NR_GENS = 4) encoded in + * the per-PFN byte, advanced by install cadence + * (marie_install_advance_hook) and by the reclaim-driven trigger + * (occupied gen count < 2 at shrink_lruvec entry) + * - SIMD PTE young-bit batch scan with boot-time AVX-512F / AVX2 / + * SSE2 dispatch on x86; scalar fallback on arm64 and elsewhere + * + * Core types: + * + * marie_state[] — global per-PFN byte array (state.{h,c}) + * struct marie_type — per-type slot inside marie_lruvec (anon / + * file): per-type lock + nr_pages counter + * struct marie_lruvec — Marie's per-lruvec state (anon + file types, + * per-(lru, zone) page count, swap_bias) + * + * lv->marie_mlv — per-lruvec pointer to struct marie_lruvec, + * the single source of truth (no side xarray; + * lifetime tied to the lruvec/memcg, no RCU) + * + * marie_get_lruvec() — lazy lookup/allocation (cmpxchg publish) + * marie_drop_lruvec() — remove and free one entry + * lru_marie_exit_memcg() — drop all entries belonging to a dying memcg + * + * Recommended userspace configuration: + * - vm.swappiness = 1 + * swappiness historically encoded the relative IO cost of swap + * vs. filesystem paging, on the assumption that file cache and + * anon working set carry comparable "hotness" distributions and + * comparable refault costs. 
That assumption was authored against + * spinning-disk-era hardware and no longer matches modern + * systems: + * + * Storage type File cache cost Recommended + * ------------------- ---------------- ------------- + * SSD+ZRAM (Modern) Low 1 (Marie default) + * HDD (Slow,Unresponsive) High Higher (60+) + * + * On modern desktops with NVMe-class file storage, lost file + * cache refaults in microseconds and is largely transparent to + * the user. ZRAM-backed swap, by contrast, is "free in RAM" only + * on the surface: every swapout/swapin pays compression CPU, + * L1/L2/L3 cache pollution from the codec working set, and + * blocks the calling context -- costs that are systematically + * hidden in IO accounting but ergonomically very visible as UI + * stutter and jank. + * + * Worse, on a ZRAM-equipped system in normal steady state the + * pagecache typically fills physical memory. Any proportional + * anon eviction at that point disturbs the anon working set just + * to make room for what is mostly cold pagecache anyway -- + * trading a transparent SSD refault on the file side for a + * visible ZRAM hit on the anon side. The cart goes before the + * horse. + * + * swappiness = 1 captures the resulting policy precisely: anon + * is fully protected until the file pagecache falls below the + * clean_min_ratio floor, at which point swap engages as a true + * last resort. Marie's per-PFN reclaim driver maps this onto + * MARIE_PICK_FILE_THEN_ANON -- FILE scanned first, ANON engaged + * ONLY when skip_file is set inside marie_state_shrink_lruvec + * (i.e. the floor has been breached). Per-call transient FILE + * failures (empty oldest gen, all shrink_folio_list rejects, + * etc.) do NOT leak into ANON -- the clean_min_ratio floor is + * the single depletion signal. The bias controller stays at + * zero throughout, because swappiness=1 short-circuits the + * proportional update path. 
+ * + * Higher values (s = 2..199) remain useful on slower-storage + * systems where the file refault cost is no longer negligible; + * Marie honours them via the stubborn proportional controller + * in marie_swap_bias_update. s = 0 is a hard "never swap" -- + * reach OOM rather than touch anon. s = 200 is the symmetric + * "anon only" override. Both are intentional user policy + * overrides; clean_min_ratio does not punch through them. + * - systemd-oomd OFF + * systemd-oomd reacts to PSI before Marie's clean_min_ratio + * floor + no-progress OOM path has a chance to stabilise + * reclaim. With Marie engaged the kernel-side OOM gate is more + * accurate, and userspace OOMD ends up killing tasks Marie + * would have rescued. Disable it (or leave its swap thresholds + * at 100%) for predictable behaviour. + */ + +#define pr_fmt(fmt) "lru_marie: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" /* struct scan_control, shrink_folio_list */ +#include "drain_scope.h" +#include "pfn_install.h" +#include "simd.h" +#include "state.h" +#include "version.h" + +DEFINE_STATIC_KEY_TRUE(lru_marie_enabled_key); +EXPORT_SYMBOL_GPL(lru_marie_enabled_key); + +/* + * Marie indexes its per-PFN state array by raw PFN. The implementation + * caps max_pfn at 2^32 (= 4 KiB pages × 2^32 = 16 TiB of physical + * address space, holes included): the per-PFN byte array would be at + * most 4 GiB under that cap, and several other internal helpers + * assume the PFN fits in 32 bits. On a box that violates this Marie + * refuses to enable; Legacy / MGLRU continue to run unchanged. 
+ * + * marie_pfn_unsupported is latched at subsys_initcall once max_pfn is + * stable (set during setup_arch / memblock init) and read-only after + * that, so the runtime cost is a single __read_mostly load. + */ +#define MARIE_MAX_SUPPORTED_PFN (1UL << 32) +static bool marie_pfn_unsupported __read_mostly; + +/* + * --------------------------------------------------------------------- + * install path -- fully synchronous, no staging or pending queues + * --------------------------------------------------------------------- + * + * lru_marie_add_folio dispatches into marie_folio_install (under the + * per-type lock for THP, lock-free for small folios). That helper + * publishes the per-PFN state byte, sets the tracking bitmaps, bumps + * the per-mlv / + * global percpu_counters, and sets PG_lru -- all under the caller's + * lru_lock irqsave. No per-CPU staging, no session-end flush hook, no + * wrapper allocation, no async drain, no kworker dispatch. + * + * Walker tier promotion is similarly synchronous: when a young PTE + * references a folio whose tier is already saturated, the walker calls + * marie_state_move_to_gen() directly on the per-PFN byte (no queueing). + */ + +/* + * --------------------------------------------------------------------- + * data structures (struct definitions live in mm/lru_marie/state.h) + * --------------------------------------------------------------------- + * + * struct marie_type / marie_lruvec, the MARIE_PFN_NR_GENS / + * MARIE_NR_TIERS / MARIE_TIER_MAX / MARIE_ISOLATE_BATCH constants, and + * the per-mlv alloc/free helpers (marie_alloc_lruvec, + * marie_free_lruvec) live in state.{h,c}. Runtime-tunable knobs + * (marie_gen_growth_threshold, etc.) live in the sysfs section at the + * bottom of this file. Lifecycle (lazy lv->marie_mlv lookup, memcg + * teardown) stays here. + */ + +/* + * Per-lruvec Marie state is reached directly through lv->marie_mlv (the + * single source of truth; no side xarray, no RCU).
marie_nr_lruvecs + * counts live mlvs for stats only. + */ +static atomic_long_t marie_nr_lruvecs; + +/* + * Exported via mm/lru_marie/state.h for the install / evict / drain + * helpers to update during TRACKED 0<->1 transitions. percpu_counter + * so per-folio writes hit the local CPU's diff and only flush to the + * global on every percpu_counter_batch ops; reads use + * percpu_counter_sum (accurate, slower) in stats_show and + * percpu_counter_read_positive (approximate, fast) where a hot + * heuristic is good enough. + */ +struct percpu_counter marie_nr_folios; + +/** + * marie_get_lruvec - lookup or lazily allocate the Marie state for @lv. + * + * Returns NULL if Marie is disabled (so callers can skip cheaply) or if + * an atomic-context allocation fails. Otherwise returns the per-lruvec + * state, which lives until marie_drop_lruvec() or lru_marie_exit_memcg(). + * + * Hot path uses the cached @lv->marie_mlv pointer set after a successful + * xa_cmpxchg in this very function (or refreshed on a cache miss). + * Profile showed xas_load at ~1.5 % of cycles before this cache landed — + * the dispatcher's xarray lookup ran on every fault even though the + * answer was stable for the lruvec's lifetime. + */ +struct marie_lruvec *marie_get_lruvec(struct lruvec *lv) +{ + struct marie_lruvec *mlv, *old; + gfp_t gfp; + + if (!lru_marie_enabled()) + return NULL; + + mlv = READ_ONCE(lv->marie_mlv); + if (likely(mlv)) + return mlv; + +#ifdef CONFIG_MEMCG + { + struct mem_cgroup *memcg = lruvec_memcg(lv); + + if (memcg && css_is_dying(&memcg->css)) + return NULL; + } +#endif + + /* Use GFP_ATOMIC to be safe from any context. The first hit for + * any given lruvec pays this cost; subsequent reads hit the cached + * pointer above. */ + gfp = (in_task() && !irqs_disabled()) ? 
GFP_KERNEL : GFP_ATOMIC; + mlv = marie_alloc_lruvec(lv, gfp); + if (!mlv) + return NULL; + + /* + * Publish authoritatively into lv->marie_mlv -- the single source + * of truth for this lruvec's Marie state (no side xarray). cmpxchg + * resolves the lazy-alloc race: the loser frees its allocation and + * adopts the winner's. The pointer lives until the lruvec's memcg is + * freed (marie_drop_lruvec from mem_cgroup_free), so any caller + * holding a valid lruvec sees a live mlv without RCU. + */ + old = cmpxchg((struct marie_lruvec **)&lv->marie_mlv, NULL, mlv); + if (old) { + marie_free_lruvec(mlv); + return old; + } + + atomic_long_inc(&marie_nr_lruvecs); + return mlv; +} + +/* Forward declaration: drain implementation lives further down with + * marie_fill_one_lruvec near the change_state machinery. */ +static void marie_drain_one_lruvec(struct lruvec *lruvec, + struct list_head *to_free); + +static void marie_drop_lruvec(struct lruvec *lv) +{ + struct marie_lruvec *mlv; + MARIE_DRAIN_DEFER(to_free); + + /* + * Atomic "drain + xa_erase" under @lv->lru_lock. + * + * The invariant: every TRACKED bit owned by @lv must be cleared + * before the xa_erase publishes "@lv has no mlv". Otherwise a + * concurrent folio_put on a TRACKED folio could reach + * lru_marie_del_folio, observe xa_load(@lv) == NULL, fall into + * the cleanup branch and decrement counters on a torn-down mlv. + * + * Draining (wiping per-PFN state for every still-tracked folio + * and handing folios back to legacy lruvec lists) under the same + * lru_lock that brackets xa_erase closes the race: by the time + * xa_erase is visible, no folio under @lv carries MARIE_TRACKED. + * lru_marie_add_folio holds lru_lock too, so it cannot install + * new TRACKED folios during this critical section. + * + * Drain itself does not sleep -- it operates entirely on the + * per-(type, gen, tier) bitmaps and counters -- so it is safe + * inside spin_lock_irq. 
drop_lruvec is rare (toggle / memcg + * teardown); the extra drain work paid here is not on any hot + * path. spin_lock_irq already disables preemption and IRQs (the + * former migrate_disable() only existed to give the now-removed + * synchronize_rcu() a stable CPU context). + */ + scoped_guard(spinlock_irq, &lv->lru_lock) { + marie_drain_one_lruvec(lv, &to_free); + /* + * Capture and clear the authoritative lv->marie_mlv under + * lru_lock. A concurrent dispatcher (lru_marie_add_folio / + * lru_marie_del_folio) holds the same lru_lock, and + * marie_drain_one_lruvec above cleared MARIE_TRACKED on + * every still-tracked folio, so once this lock is dropped + * no path can route a del back into @mlv. + */ + mlv = READ_ONCE(lv->marie_mlv); + WRITE_ONCE(lv->marie_mlv, NULL); + } + /* to_free is auto-flushed at function return via __cleanup. */ + + if (!mlv) + return; + atomic_long_dec(&marie_nr_lruvecs); + + /* + * No synchronize_rcu() needed. With the side xarray gone there is no + * ref-free RCU reader of @mlv: swappiness_changed enumerates via + * mem_cgroup_iter (ref-pinned), the walker's pass-end housekeeping no + * longer walks per-mlv, and its per-PTE lookups deref @mlv only for + * folios charged to a live memcg. Every other accessor is serialised + * by lv->lru_lock above. drop_lruvec runs at mem_cgroup_free + * (refcount 0, no charges) or runtime disable, so @mlv is unreachable + * by the time it is freed. + */ + marie_free_lruvec(mlv); +} + +/* + * --------------------------------------------------------------------- + * generation lifecycle (helpers in mm/lru_marie/state.c) + * --------------------------------------------------------------------- + * + * Marie keeps a cycling per-type gen ring of MARIE_PFN_NR_GENS (= 4) + * slots, encoded directly in the per-PFN state byte. Aging is driven + * by three signals: + * + * - lru_marie_add_folio always lands on the current head gen + * (atomic_read(&marie_head_gen[type])). 
+ * - marie_state_isolate_scan_l2lock always pulls from the oldest + * occupied gen (marie_find_oldest_occupied). + * - shrink_folio_list classifies each isolated folio: + * FOLIOREF_RECLAIM → freed (this folio truly was cold) + * FOLIOREF_KEEP → returned in folio_list, putback re-routes + * FOLIOREF_ACTIVATE → ditto, with PG_active set + * putback re-installs the survivor at (oldest+1)&3 with + * target_tier = max(prev_tier, w_tier) via + * marie_install_at_gen. + * - head_gen advances per type via marie_try_advance_head, fired by + * install cadence (marie_install_advance_hook checks + * marie_gen_installs against marie_gen_growth_threshold) and by + * the reclaim-driven trigger at shrink_lruvec entry (occupied gen + * count < 2). Advance is drain-wait gated: the next slot must be + * fully empty (marie_gen_occupied[next][type] == 0). + * + * The reclaim cycle alone does not surface every hot folio; a per-pgdat + * SIMD walker (mm/lru_marie/walker.c) clears young PTEs and bumps + * marie_state_inc_tier on tracked folios. When tier saturates, the + * walker calls marie_state_move_to_gen() synchronously to move the + * folio into the head gen at tier 0 (no pending queue). The rmap path + * (lru_marie_look_around) feeds the walker via a per-pgdat bloom + * filter so PMD scans concentrate on regions the rmap recently + * flagged hot. + */ + +/* + * --------------------------------------------------------------------- + * folio add / del + * --------------------------------------------------------------------- + */ + +/* + * lru_marie_add_folio: per-folio synchronous install. + * + * All folios are installed by marie_folio_install under the caller's + * lru_lock irqsave: per-PFN state byte (TRACKED + initial tier + type + + * zone + head_gen), tracking bitmaps, per-mlv / global counters + * (percpu_counter), and PG_lru are all published in one synchronous + * call. 
No per-CPU staging and no session-end flush hook -- every + * install is self-contained, so no carry-over state can leak across + * calls or across lruvecs. + * + * Skipped folio classes: + * + * - Unevictable folios: struct folio overlays folio->lru with + * folio->mlock_count via union. mm/mlock.c writes mlock_count + * directly while the folio is "owned" by an lruvec but NOT on a + * list. Marie keeps unevictable folios on the legacy path so + * mlock_count stays addressable. + * + * THP folios are routed through the per-type lock at the dispatcher + * level so the install is ordered against concurrent operations on + * the THP's lifetime. The per-type lock is purely a caller concern; + * marie_folio_install's body is identical for both branches. + */ +bool lru_marie_add_folio(struct lruvec *lv, struct folio *folio, bool reclaiming) +{ + struct marie_lruvec *mlv; + + lockdep_assert_held(&lv->lru_lock); + lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(in_hardirq()); + + if (!lru_marie_enabled()) + return false; + if (folio_test_unevictable(folio)) + return false; + + mlv = marie_get_lruvec(lv); + if (!mlv) + return false; + if (unlikely(READ_ONCE(mlv->offline))) + return false; + + /* + * Large folios (THP) take the per-type lock on the way in so the + * install is ordered against drain / reparent on the same type; + * small folios run lock-free with only lru_lock. Both branches + * route to marie_folio_install; the per-type lock is purely a + * caller concern. + */ + if (folio_test_large(folio)) { + bool ok; + int type = folio_is_file_lru(folio); + + scoped_guard(marie_type_lock, &mlv->types[type]) + ok = marie_folio_install(folio, mlv); + return ok; + } + + return marie_folio_install(folio, mlv); +} +EXPORT_SYMBOL_GPL(lru_marie_add_folio); + +/* + * Non-adopting legacy LRU add for an untracked orphan inside a del+add + * move_fn (swap.c: lru_activate / lru_deactivate{,_file} / lru_lazyfree) or + * the legacy reclaim putback (vmscan.c: move_folios_to_lru). 
+ * + * Those paths run lruvec_del_folio() (legacy del, mz -nr for an untracked + * folio) and then add the folio back. Routing that add through + * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT the folio into + * Marie: the install credits Marie's own accounting, but the original -nr was + * a legacy debit, so mz->lru_zone_size drifts and a later legacy/Marie del + * underflows ("marie underflow-del" / mem_cgroup_update_lru_size lru_size -1). + * Do a pure legacy add (the +nr leg) instead. + * + * Callers MUST first bail on lru_marie_test_tracked() folios -- a tracked + * folio is Marie-owned and must never touch a legacy list. Shared by swap.c's + * move_fns and vmscan.c's putback; see the header doc in lru_marie.h. + */ +void lru_marie_orphan_add(struct lruvec *lruvec, struct folio *folio, bool tail) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + if (tail) + list_add_tail(&folio->lru, &lruvec->lists[lru]); + else + list_add(&folio->lru, &lruvec->lists[lru]); +} +EXPORT_SYMBOL_GPL(lru_marie_orphan_add); + +/** + * lru_marie_split_folio - install a freshly-split tail folio under Marie. 
 * @lv:        head folio's lruvec (caller holds lru_lock)
 * @head:      THP head folio currently RESIDENT in Marie
 * @new_folio: tail folio created by __split_huge_page
 *
 * Marie-side counterpart of mm/huge_memory.c::lru_add_split_folio's
 * "list_add_tail(&new_folio->lru, &folio->lru)", so that @new_folio:
 *
 *   - inherits @head's tier 0 install at the current head_gen
 *     (a freshly-split tail page has no independent hotness signal
 *     yet -- subsequent walker passes promote it on young hits)
 *   - has its TRACKED bit set in marie_state[pfn] so dispatcher del
 *     routes through Marie (without TRACKED, dispatcher del would
 *     fall through to legacy update_lru_size and bypass
 *     mlv->types[].nr_pages bookkeeping)
 *
 * Accounting note: lru_size and mlv->types[].nr_pages are NOT
 * incremented for @new_folio.  The original head install +N covered the
 * full pre-split compound, and each sub-folio's eventual del decrements
 * by its own folio_nr_pages; the sum balances.  marie_nr_folios IS
 * incremented because it is a folio count, not a page count, and the
 * post-split state has 1 + ntails folios where there was 1 before.
 *
 * Caller MUST hold @lv->lru_lock.  @head is expected to be Marie-tracked
 * (folio_marie_test_tracked); an untracked @head is still tolerated and
 * takes the legacy list_add_tail fallback below.  The helper takes the
 * per-type lock internally via scoped_guard(marie_type_lock, ...).
 *
 * Returns without touching @new_folio if @head is tracked but
 * unevictable.  NOTE(review): that check runs only after the TRACKED
 * test, so an untracked unevictable @head takes the list_add_tail
 * fallback instead; the original contract says legacy
 * lru_add_split_folio handles the unevictable case before calling here
 * -- confirm against the caller.
 */
void lru_marie_split_folio(struct lruvec *lv, struct folio *head,
			   struct folio *new_folio)
{
	struct marie_lruvec *mlv;

	lockdep_assert_held(&lv->lru_lock);
	lockdep_assert_irqs_disabled();
	WARN_ON_ONCE(in_hardirq());

	/*
	 * Caller already checked lru_marie_enabled() via the static branch,
	 * but @head may not be Marie-tracked (e.g. THP added to legacy LRU
	 * because Marie alloc failed at the original add).  Fall back to
	 * plain list_add_tail in that case so @new_folio joins @head's
	 * neighbour link on the legacy LRU list as it would have without
	 * Marie.
	 */
	if (!folio_marie_test_tracked(head)) {
		list_add_tail(&new_folio->lru, &head->lru);
		return;
	}

	if (folio_test_unevictable(head))
		return;

	mlv = marie_get_lruvec(lv);
	if (!mlv) {
		/*
		 * @head is Marie-tracked but marie_get_lruvec returned NULL --
		 * its memcg is offlining (marie_drop_lruvec already NULLed
		 * lv->marie_mlv), the gate is off, or a rare GFP_ATOMIC alloc
		 * failed.
		 *
		 * Do NOT fall back to a bare list_add_tail: that would leave
		 * @new_folio !TRACKED while the caller stamps PG_lru on it,
		 * and list_add_tail onto @head's Marie self-loop would corrupt
		 * the list at del.
		 *
		 * NOTE(review): the original rationale here also stated that
		 * Marie's install never credits mz->lru_zone_size, while the
		 * comments in lru_marie_del_folio and the drain path say
		 * marie_update_lru_size does credit mz at install.  Confirm
		 * which is current; the code below does not depend on it.
		 *
		 * Instead publish the tail's TRACKED state (inheriting head's
		 * type/gen at tier 0) so its del routes through
		 * lru_marie_del_folio's orphan path, and keep folio->lru a
		 * self-loop (INIT_LIST_HEAD) like every other Marie folio --
		 * the orphan path asserts list_empty(&folio->lru).  Skip the
		 * per-type lock and the mlv counters (there is no mlv); the
		 * per-PFN publish primitives are atomic/lock-free and the
		 * buddy-handoff hook (marie_state_drop_pfn_at_free) clears the
		 * byte regardless.
		 *
		 * NOTE(review): marie_nr_folios is incremented here, but the
		 * orphan branch of lru_marie_del_folio has no visible matching
		 * decrement (marie_drain_pfn_locked decrements explicitly
		 * after marie_state_drop_pfn).  Confirm the counter balances
		 * for tails published on this path.
		 */
		int type = folio_is_file_lru(head);

		if (folio_test_active(new_folio))
			folio_clear_active(new_folio);
		INIT_LIST_HEAD(&new_folio->lru);
		marie_pfn_publish_inherit(new_folio, type,
					  (u8)atomic_read(&marie_head_gen[type]),
					  0, folio_zonenum(new_folio));
		marie_pc_add(&marie_nr_folios, 1);
		return;
	}

	/* Marie's invariant: clear PG_active before publishing TRACKED. */
	if (folio_test_active(new_folio))
		folio_clear_active(new_folio);

	/* Head and new_folio share the same Marie type (folio split does
	 * not change LRU category), so head's per-type lock guards both. */
	scoped_guard(marie_type_lock, &mlv->types[folio_is_file_lru(head)]) {
		int type = folio_is_file_lru(head);
		int zone = folio_zonenum(new_folio);
		u8 head_gen = (u8)atomic_read(&marie_head_gen[type]);

		/*
		 * Inherit head's tier-0 install at the current head_gen
		 * (a freshly-split tail page has no independent hotness
		 * signal).  marie_pfn_publish_inherit writes the state byte,
		 * the (type, gen, tier) bitmap, the per-memcg L1 bitmap, and
		 * gen_occupied++; it deliberately skips gen_installs because
		 * the tail inherits the parent's install budget.
		 *
		 * folio->lru is initialised to a self-loop, exactly as
		 * marie_folio_install() does for a fresh install: every Marie
		 * folio is OFF the legacy lruvec lists and tracked purely by
		 * the per-PFN state + bitmap.  The old list_add_tail onto
		 * @head's link instead built a multi-element ring from @head
		 * and all of its split tails; the reclaim isolate path
		 * (marie_evict_counters_only + list_add, state.c) and every
		 * other self-loop-assuming site then corrupted that ring,
		 * abandoning neighbours that still pointed at the moved folio
		 * -- folios orphaned off mz accounting (the mz->lru_zone_size
		 * underflow) and list_del corruption / use-after-free of a
		 * reused page (the userspace SEGV).  PG_lru is set by the
		 * caller (lru_add_split_folio) after this returns, after the
		 * per-PFN state is published, so a concurrent
		 * __page_cache_release observing PG_lru=1 also observes
		 * marie_state[pfn] & MARIE_PFN_TRACKED.
		 */
		INIT_LIST_HEAD(&new_folio->lru);
		marie_pfn_publish_inherit(new_folio, type, head_gen, 0, zone);
		marie_pc_add(&marie_nr_folios, 1);
	}
}
EXPORT_SYMBOL_GPL(lru_marie_split_folio);

/*
 * lru_marie_del_folio - remove @folio from Marie if Marie owns it.
 *
 * Caller holds @lv->lru_lock with IRQs disabled.  Returns false when
 * @folio is not TRACKED (the caller must run the legacy del), true when
 * Marie handled the removal.  @reclaiming is not read in this function.
 */
bool lru_marie_del_folio(struct lruvec *lv, struct folio *folio, bool reclaiming)
{
	struct marie_lruvec *mlv;

	lockdep_assert_held(&lv->lru_lock);
	lockdep_assert_irqs_disabled();
	WARN_ON_ONCE(in_hardirq());

	/*
	 * TRACKED takes priority over the lru_marie_enabled() gate.  A
	 * folio with TRACKED=1 may still be Marie-owned after the gate
	 * flips false during a disable transition: marie_change_state
	 * iterates lruvecs serially, and between the gate flip and the
	 * per-lruvec drain_one_lruvec there is a window where
	 * not-yet-drained lruvecs hold TRACKED folios.  If we bailed at
	 * the gate here, the dispatcher (mm_inline.h::lruvec_del_folio)
	 * would fall through to legacy lruvec_del_folio, which does
	 * list_del(&folio->lru) on a folio that is on a Marie self-loop,
	 * not on the lruvec list -- corrupting the folio.  (mz->lru_zone_size
	 * stays balanced either way now that marie_update_lru_size credits
	 * mz at install, but the list must still be handled by Marie.)
	 *
	 * Trusting TRACKED is safe regardless of the gate: TRACKED is
	 * only ever set under Marie's install helpers (marie_folio_install
	 * on every install path, marie_pfn_publish_inherit on split's tail,
	 * marie_state_publish_at_gen on the reclaim survivor putback, and
	 * marie_fill_one_lruvec for the enable-time legacy sweep) and
	 * only ever cleared by Marie's evict paths or by
	 * marie_drain_pfn_locked.
	 */
	if (!folio_marie_test_tracked(folio))
		return false;

	/*
	 * Direct lv->marie_mlv read (rather than marie_get_lruvec) so we
	 * work even when the gate is off mid-transition.  Race against
	 * marie_drop_lruvec is closed by lru_lock serialisation:
	 * marie_drop_lruvec clears lv->marie_mlv under the same lv->lru_lock
	 * that our caller (lruvec_del_folio) holds.
	 */
	mlv = READ_ONCE(lv->marie_mlv);
	if (!mlv) {
		/*
		 * @lv has no mlv but @folio still carries MARIE_TRACKED.
		 *
		 * Per-PFN paradigm: Marie's per-folio state lives entirely
		 * in marie_state[pfn]; folio->lru is always a self-loop
		 * (Marie never re-attaches folios onto Marie-owned lists).
		 * marie_drop_lruvec runs marie_drain_one_lruvec (which
		 * clears every TRACKED bit via marie_state_drop_pfn) and
		 * clears lv->marie_mlv under the SAME lru_lock the
		 * dispatcher holds, so by the time lv->marie_mlv reads NULL
		 * no folio drained from that mlv remains TRACKED.  Reaching
		 * here therefore means the folio was published without an
		 * mlv (lru_marie_split_folio's no-mlv path), was tracked by
		 * some other mlv that is also gone, or hit an enable
		 * transient where alloc on this lv has not yet completed.
		 *
		 * Note: alloc-failure at add time (marie_alloc_lruvec
		 * returning NULL) is NOT a path that reaches here -- that
		 * path makes lru_marie_add_folio return false and the folio
		 * goes onto the legacy LRU with TRACKED=0, so the earlier
		 * folio_marie_test_tracked check short-circuits before this.
		 *
		 * Wiping the per-PFN byte and returning true is safe because
		 * folio->lru is a self-loop; no live list neighbour points
		 * at @folio.
		 *
		 * NOTE(review): no marie_nr_folios decrement is visible on
		 * this branch; confirm whether marie_state_drop_pfn covers
		 * it (marie_drain_pfn_locked decrements explicitly after
		 * calling the same helper).
		 */
		VM_WARN_ON_ONCE_FOLIO(!list_empty(&folio->lru), folio);
		/* Wipe per-PFN state so the orphan does not reappear in the bitmap walk. */
		marie_state_drop_pfn(folio);
		if (folio_test_active(folio))
			folio_clear_active(folio);
		return true;
	}

	/*
	 * External-removal entry runs without acquiring the per-type
	 * lock.  The caller (lruvec_del_folio reaching here from
	 * compaction / folio_put -> __page_cache_release) holds
	 * lruvec->lru_lock, which serialises every other path that could
	 * clear MARIE_TRACKED.  The eviction's list_del_init is
	 * unconditional (folio->lru is either a self-loop or on legacy
	 * lruvec->lists[lru], whose mutation is already covered by the
	 * caller's lru_lock).
	 *
	 * marie_del_folio_locked -> marie_evict_locked -> marie_account_evict
	 * owns the full counter wind-down, including the single
	 * marie_nr_folios -1.  Do NOT decrement it again here (the old
	 * caller-side -1 predated the account.h funnel and double-counted
	 * every generic del of a tracked folio).
	 */
	return marie_del_folio_locked(mlv, folio);
}
EXPORT_SYMBOL_GPL(lru_marie_del_folio);

/*
 * Outer-level release entry called from __page_cache_release when the
 * caller has determined that TRACKED is set.  See the contract in
 * include/linux/lru_marie.h.
 *
 * Why a TRACKED outer gate (rather than the legacy folio_test_lru
 * gate) matters: a Marie-installed folio is on a self-loop
 * (folio->lru points at itself), not on a legacy lruvec list.  If the
 * legacy gate let such a folio reach mm_inline.h::lruvec_del_folio,
 * its list_del(&folio->lru) would operate on the self-loop instead of
 * a real list and corrupt Marie's bookkeeping.  With TRACKED as the
 * outer gate, Marie folios are routed here, which unlinks the
 * self-loop and debits mz->lru_zone_size (marie_update_lru_size is
 * unified with legacy update_lru_size, so the +nr at install and the
 * -nr here balance structurally).
 *
 * Acquires (or re-targets) the folio's lruvec lock through @lruvecp /
 * @flagsp and returns with it still held; the caller unlocks.
 */
void lru_marie_release_folio(struct folio *folio, struct lruvec **lruvecp,
			     unsigned long *flagsp)
{
	folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);

	lockdep_assert_held(&(*lruvecp)->lru_lock);
	lockdep_assert_irqs_disabled();

	/*
	 * lru_marie_del_folio re-tests TRACKED under the lock and handles
	 * both the normal Marie folio case and the orphan case (no mlv
	 * but TRACKED still set, see its body).  Returns true on Marie
	 * ownership; false means TRACKED was cleared between our caller's
	 * outer test and our lock acquisition (race with drain).
	 */
	if (lru_marie_del_folio(*lruvecp, folio, false)) {
		__folio_clear_lru_flags(folio);
		return;
	}

	/*
	 * Drain raced us.  The folio is now on a legacy lruvec list with mz
	 * still credited (marie_drain_pfn_locked keeps the install-time
	 * credit when it moves a folio onto the legacy list).  Run the
	 * legacy del to keep PG_lru, the list membership, and mz consistent.
	 */
	if (folio_test_clear_lru(folio))
		lruvec_del_folio(*lruvecp, folio);
	__folio_clear_lru_flags(folio);
}
EXPORT_SYMBOL_GPL(lru_marie_release_folio);

/*
 * ---------------------------------------------------------------------
 * memcg lifecycle hook
 * ---------------------------------------------------------------------
 */

/*
 * Tear down Marie state for every node lruvec of @memcg, then release
 * the per-memcg bitmap.  May sleep (might_sleep()); marie_drop_lruvec
 * takes each lruvec's lru_lock itself.
 */
void lru_marie_exit_memcg(struct mem_cgroup *memcg)
{
	int nid;

	might_sleep();

	for_each_node(nid) {
		struct lruvec *lv = mem_cgroup_lruvec(memcg, NODE_DATA(nid));

		marie_drop_lruvec(lv);
	}

	/* Release per-memcg bitmap if allocated (no-op when gate is off). */
	marie_memcg_bitmap_free(memcg);
}
EXPORT_SYMBOL_GPL(lru_marie_exit_memcg);

/*
 * Drain one lruvec at css_offline time, while rstat_cpu is still valid.
 * Setting mlv->offline under lru_lock before the drain ensures that any
 * concurrent lru_marie_add_folio sees the flag and bails out, so no new
 * folio is installed into this mlv once the drain has run.  The mlv
 * itself is left published; it is unpublished and freed later by
 * marie_drop_lruvec.
 */
static void marie_offline_lruvec(struct lruvec *lv)
{
	struct marie_lruvec *mlv;
	MARIE_DRAIN_DEFER(to_free);

	scoped_guard(spinlock_irq, &lv->lru_lock) {
		mlv = READ_ONCE(lv->marie_mlv);
		if (mlv)
			WRITE_ONCE(mlv->offline, true);
		marie_drain_one_lruvec(lv, &to_free);
	}
	/* to_free is auto-flushed at function return via __cleanup. */
}

/*
 * Offline hook: drain every node lruvec of @memcg back to the legacy
 * lists.  Skipped entirely when Marie is disabled.  May sleep.
 */
void lru_marie_offline_memcg(struct mem_cgroup *memcg)
{
	int nid;

	might_sleep();

	if (!lru_marie_enabled())
		return;

	for_each_node(nid) {
		struct lruvec *lv = mem_cgroup_lruvec(memcg, NODE_DATA(nid));

		marie_offline_lruvec(lv);
	}
}
EXPORT_SYMBOL_GPL(lru_marie_offline_memcg);

/*
 * NOTE: currently unused -- no in-tree caller.  Marie's memcg offline path
 * (lru_marie_offline_memcg) drains rather than reparents.
Kept as the + * reparent counterpart for a future offline path. A future caller MUST + * hold both lruvecs' lru_lock (IRQs off) AND acquire them in a + * deterministic global order to avoid A-B/B-A between concurrent reparents; + * this function only takes @child_mlv's per-type locks (marie_both_mlv). + */ +void lru_marie_reparent_lruvec(struct lruvec *child_lv, struct lruvec *parent_lv) +{ + struct marie_lruvec *child_mlv, *parent_mlv; + + lockdep_assert_held(&child_lv->lru_lock); + lockdep_assert_held(&parent_lv->lru_lock); + lockdep_assert_irqs_disabled(); + + if (!lru_marie_enabled()) + return; + + child_mlv = READ_ONCE(child_lv->marie_mlv); + if (!child_mlv) + return; + + /* + * Best-effort parent_mlv materialisation. A caller is required to + * hold both lruvecs' lru_lock with IRQs disabled, so marie_get_lruvec + * falls back to GFP_ATOMIC. On allocation failure parent_mlv stays NULL and + * reparent zeroes child's per-memcg bitmap without merging -- + * folios then rely on global tracking only (no per-memcg filter). + */ + parent_mlv = marie_get_lruvec(parent_lv); + + /* + * Reparent touches child_mlv's per-type counters; take both + * per-type locks in canonical order via the marie_both_mlv guard. + * Caller has IRQs disabled (objcg_lock pinned), matching the + * guard's spin_lock_irqsave contract. + * + * Reparent is per-folio iteration free: it merges the per-memcg + * L1/L2 bitmap from child into parent (L2-pruned word OR) and + * transfers per-type / per-(lru, zone) percpu_counters. The + * per-PFN state bytes are memcg-agnostic and stay in place, so + * marie_nr_folios does not change here -- reparent returns 0 and + * there is no global counter adjustment. + */ + scoped_guard(marie_both_mlv, child_mlv) + marie_reparent_locked(child_mlv, parent_mlv); +} +EXPORT_SYMBOL_GPL(lru_marie_reparent_lruvec); + +/* + * Invoked from the vm.swappiness sysctl handler and memcg's + * memory.swappiness writer when a swappiness value has changed. 
+ * Walks the marie_lruvec xarray once and resets every swap_bias to + * zero so the proportional controller restarts from neutral under + * the new weight ratio. See lru_marie.h for the rationale. + * + * Resets unconditionally rather than filtering by memcg / cgroup + * version: extra resets on lruvecs whose effective swappiness did + * not actually change are harmless under a controller whose only + * state is the bias counter, and the filtering would add code + * without changing observable behaviour. Sysctl writes are + * human-rate so the xa walk's cost is negligible. + */ +void lru_marie_swappiness_changed(void) +{ + struct mem_cgroup *memcg; + + might_sleep(); + + if (!lru_marie_enabled()) + return; + + /* + * Reset every lruvec's swap_bias to neutral. Enumerate via + * mem_cgroup_iter (which ref-pins each memcg across the step, + * keeping its mlv alive) rather than a ref-free xarray walk -- a + * human-rate sysctl path, so the per-memcg ref is irrelevant. + * + * Also covers the !memcg / mem_cgroup_disabled node lruvec: + * mem_cgroup_iter then yields NULL, the body runs once with + * memcg==NULL, and mem_cgroup_lruvec(NULL, pgdat) resolves to + * pgdat->__lruvec. Best-effort -- a missed lruvec simply keeps its + * bias and reconverges under the controller. + */ + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lv = + mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + struct marie_lruvec *mlv = READ_ONCE(lv->marie_mlv); + + if (mlv) + atomic64_set(&mlv->swap_bias, 0); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +} +EXPORT_SYMBOL_GPL(lru_marie_swappiness_changed); + +/* + * --------------------------------------------------------------------- + * enable / disable + * --------------------------------------------------------------------- + */ + +/* + * Per-lruvec migration helpers used by marie_change_state(). 
Caller + * holds @lruvec->lru_lock with IRQs disabled, which orders correctly + * against the inner per-type locks taken by the scoped_guard + * (marie_both_mlv) below. + * + * marie_drain_one_lruvec(): walk every (type, gen, tier) bitmap, wipe + * the per-PFN state for each tracked folio under this lv, and hand the + * folio back to legacy lruvec->lists[lru] (or MGLRU's lrugen if MGLRU + * is the fallback). After return, the lruvec is purely legacy LRU / + * MGLRU. + * + * marie_fill_one_lruvec(): force-allocate @mlv for @lruvec and pull + * every evictable folio off @lruvec->lists[lru] into Marie via the + * canonical lruvec_del_folio + marie_folio_install pair. After + * return, the lruvec's standard lru lists hold only unevictable folios; + * everything else is tracked via marie_state[]. + */ + +/* Non-allocating lookup; mirror of marie_get_lruvec without the + * lru_marie_enabled() gate (state-change runs while the gate is + * mid-flip). Same lv->marie_mlv cache as the gated path. */ +static struct marie_lruvec *marie_lookup_lruvec(struct lruvec *lv) +{ + /* lv->marie_mlv is authoritative; NULL means no Marie state. */ + return READ_ONCE(lv->marie_mlv); +} + +/* As marie_get_lruvec() but bypasses the lru_marie_enabled() gate. */ +static struct marie_lruvec *marie_force_alloc_lruvec(struct lruvec *lv, + gfp_t gfp) +{ + struct marie_lruvec *mlv, *old; + + mlv = READ_ONCE(lv->marie_mlv); + if (mlv) + return mlv; + + mlv = marie_alloc_lruvec(lv, gfp); + if (!mlv) + return NULL; + + old = cmpxchg((struct marie_lruvec **)&lv->marie_mlv, NULL, mlv); + if (old) { + marie_free_lruvec(mlv); + return old; + } + atomic_long_inc(&marie_nr_lruvecs); + return mlv; +} + +/* + * Drain one tracked PFN found by a bitmap walk: wipe the per-PFN state + * artifacts and hand the folio back to legacy lruvec->lists[lru] + * (or detach for unevictable), updating counters. 
 *
 * folio->lru is a self-loop at this point (install/flush both leave it
 * that way), so list_move just inserts into the legacy list head
 * without disturbing any prior list.  Caller-held lruvec->lru_lock
 * serialises the legacy-list mutation.
 *
 * Caller must hold mlv's marie_both_mlv scope (type_lock for both
 * types).
 *
 * @mlv:     Marie state whose per-type and per-(lru, zone) counters are
 *           debited
 * @lruvec:  lruvec receiving the folio; its lru_lock is held, IRQs off
 * @pfn:     PFN reported by the bitmap walk (may be stale; re-validated)
 * @type:    index into mlv->types[] for the nr_pages debit
 * @to_free: list collecting folios whose last reference was dropped
 *           here; the caller frees them after releasing lru_lock
 */
static void marie_drain_pfn_locked(struct marie_lruvec *mlv,
				   struct lruvec *lruvec,
				   unsigned long pfn, int type,
				   struct list_head *to_free)
{
	struct folio *folio;
	long nr;
	enum lru_list lru;
	int z;
	u8 state_byte;

	lockdep_assert_held(&lruvec->lru_lock);
	lockdep_assert_irqs_disabled();

	if (!pfn_valid(pfn))
		return;
	folio = pfn_folio(pfn);
	/* Only act on the PFN that is the folio's own first PFN. */
	if (!folio || folio_pfn(folio) != pfn)
		return;
	/*
	 * Re-validate against the per-PFN byte: any racing del path
	 * would have cleared TRACKED already.  Walking the bitmap without
	 * a per-PFN lock means we may observe a stale bit whose backing
	 * folio is gone; the byte is the source of truth for the
	 * transition.
	 */
	state_byte = READ_ONCE(marie_state[pfn]);
	if (!(state_byte & MARIE_PFN_TRACKED))
		return;

	/*
	 * Pin + claim before touching folio->lru.  The reclaim isolate
	 * path (marie_evict_counters_only) runs lock-free -- it claims a
	 * folio via folio_test_clear_lru WITHOUT lru_lock and can free a
	 * TRACKED folio concurrently with this drain even though we hold
	 * lruvec->lru_lock.  If we list_move() such a folio after it has
	 * been freed (folio->lru poisoned to LIST_POISON) the list op
	 * dereferences the poison and oopses.
	 *
	 *   1. folio_try_get: fails if the folio is already at refcount 0
	 *      (mid-free).  The page-free hook clears the per-PFN state, so
	 *      nothing to do here.
	 *   2. folio_test_clear_lru: the same atomic claim the isolate
	 *      path uses.  If PG_lru is already clear, an in-flight isolate
	 *      owns the folio (its folio->lru is on a private reclaim list,
	 *      not Marie's self-loop); skip and let that path + the free
	 *      hook clean up.  If we win the claim, the folio is ours: its
	 *      folio->lru is a Marie self-loop, safe to list_move onto the
	 *      legacy list, and we re-publish PG_lru afterwards.
	 */
	if (!folio_try_get(folio))
		return;
	if (!folio_test_clear_lru(folio)) {
		/*
		 * NOTE(review): this is a plain folio_put() under the held
		 * lru_lock, unlike the two folio_put_testzero() + @to_free
		 * sites below.  It presumably relies on the isolating path
		 * still holding its own reference so this put is never the
		 * last one -- confirm.
		 */
		folio_put(folio);
		return;
	}
	/* Re-check TRACKED under our exclusive claim. */
	if (!(READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED)) {
		/*
		 * Same last-reference hazard as the tail below: under the
		 * css_offline drain our pin can be the last one.  Defer the free
		 * to @to_free (released by the caller after lru_lock is dropped);
		 * freeing here would re-enter __page_cache_release under the held
		 * lru_lock.  The folio is a Marie self-loop here (never
		 * list_move'd), so no list/counter unwind is needed.
		 *
		 * NOTE(review): the self-loop claim assumes a folio that had
		 * PG_lru set but is no longer TRACKED cannot be sitting on a
		 * legacy list (e.g. a PFN freed and reused between the first
		 * byte read and the claim) -- confirm, since list_add() to
		 * @to_free without a list_del would corrupt such a list.
		 */
		if (folio_put_testzero(folio)) {
			folio_clear_active(folio);
			list_add(&folio->lru, to_free);
		} else {
			folio_set_lru(folio);
		}
		return;
	}

	/* Sample size and zone before the per-PFN state is wiped. */
	nr = folio_nr_pages(folio);
	z = folio_zonenum(folio);

	/*
	 * Wipe per-PFN state directly via marie_state_drop_pfn
	 * (byte + bitmap + counters + l2_range_count + memcg L1).
	 */
	marie_state_drop_pfn(folio);

	/*
	 * Mirror Marie's install invariant: PG_active=0 before computing
	 * the lru index, so the -nr on marie_lru_zone_size lands on the
	 * same INACTIVE bucket Marie's install +nr'd.
	 */
	if (folio_test_active(folio))
		folio_clear_active(folio);
	lru = folio_lru_list(folio);

	/*
	 * Hand the folio to its legacy lruvec list.  This is mz-NEUTRAL:
	 * Marie already credited mz->lru_zone_size at install
	 * (marie_update_lru_size is unified with legacy update_lru_size),
	 * so the folio is counted in mz the whole time -- the drain only
	 * moves it from its Marie self-loop onto the real lruvec list.  Do
	 * NOT re-credit mz here; that would double-count.  PG_lru is
	 * re-published below, and the PG_lru=1 invariant is "on a real
	 * lruvec list AND counted in mz->lru_zone_size".
	 *
	 * This MUST cover LRU_UNEVICTABLE the same way -- generic
	 * lruvec_add_folio places unevictable folios on
	 * lists[LRU_UNEVICTABLE].  An earlier list_del_init()+skip
	 * special-case orphaned a re-published-PG_lru folio off every list;
	 * when exit_mmap later munlocked and freed it, the generic
	 * lruvec_del_folio walked a folio that was no longer on any list ->
	 * corrupted-rmap Oops in shrink_folio_list.  Keep every drained folio
	 * on a real list.  Only reachable via marie_drop_lruvec (memcg
	 * offline), so swap tests without cgroup churn never exercised it.
	 */
	list_move(&folio->lru, &lruvec->lists[lru]);

	marie_pc_add(&mlv->types[type].nr_pages, -nr);
	marie_pc_add(&marie_nr_folios, -1);
	/* Marie's internal per-bucket tally drops; mz keeps the +nr. */
	marie_pc_add(&mlv->marie_lru_zone_size[lru][z], -nr);

	/*
	 * Drop our transient pin and re-publish on the legacy LRU.
	 *
	 * folio_put_testzero, not a plain folio_put: the css_offline drain
	 * (marie_offline_lruvec) runs while exit_mmap is concurrently freeing
	 * the dying memcg's folios, so our pin can be the LAST reference.  A
	 * plain folio_put would then enter __page_cache_release, which -- with
	 * the PG_lru we set below -- re-acquires lruvec->lru_lock, the very
	 * lock the caller already holds: a recursive self-deadlock (IRQs off
	 * -> hard lockup).  The change-state drain never hit this because its
	 * folios are still mapped (pin never reaches zero).
	 *
	 * When the put frees the folio: leave PG_lru clear, undo the
	 * list_move, and debit mz->lru_zone_size.  The folio is leaving the
	 * LRU for the buddy allocator, so this -nr is the del-debit that
	 * settles the +nr Marie credited at install (no legacy
	 * lruvec_del_folio will run for it -- PG_lru stays clear and the
	 * free goes straight to @to_free).  DEFER the actual free to
	 * @to_free: the caller calls marie_drain_release() after dropping
	 * lru_lock, so __folio_put() (mem_cgroup_uncharge, deferred-split
	 * unqueue, buddy free) never runs under lru_lock -- matching the
	 * release_pages() / survivor-putback discipline.
	 *
	 * NOTE(review): only the memcg half of the LRU size accounting is
	 * debited here (and nothing at all without CONFIG_MEMCG), whereas
	 * lru_marie_orphan_add uses the full update_lru_size().  Confirm
	 * this matches what marie_update_lru_size credited at install.
	 */
	if (folio_put_testzero(folio)) {
		list_del(&folio->lru);
#ifdef CONFIG_MEMCG
		mem_cgroup_update_lru_size(lruvec, lru, z, -nr);
#endif
		folio_clear_active(folio);
		list_add(&folio->lru, to_free);
	} else {
		folio_set_lru(folio);
	}
}

/*
 * Walk marie_gen_bitmap[type][gen][tier] AND (when memcg-targeted)
 * the per-memcg L1, restricted to mlv's pgdat PFN range.  For each set
 * bit, drain the underlying folio onto its legacy lruvec list.
 *
 * Outer loop is L2-pruned via the per-(type, gen, tier) L2 bitmap;
 * the L2 bit is refcount-maintained against marie_l2_cell_count so
 * an L2 bit set implies at least one L1 bit set in the same cell.
 *
 * Inner L1 iteration uses a local word copy (__ffs/blsr extraction),
 * so the marie_state_drop_pfn calls that clear the global bitmap
 * behind our back do not perturb forward progress.
 *
 * Caller holds mlv's marie_both_mlv scope (drain runs with the gate off
 * or during memcg teardown; concurrent install/del on this mlv's
 * memcg-bound PFNs is quiescent).
+ */
+static void marie_drain_bitmap_walk_one(struct marie_lruvec *mlv,
+					struct lruvec *lruvec,
+					int type, int gen, int tier,
+					struct list_head *to_free)
+{
+	/*
+	 * NOTE(review): the comment above this function calls the array
+	 * marie_gen_bitmap; the code indexes marie_track_bm.
+	 */
+	struct marie_bitmap *bm = &marie_track_bm[type][gen][tier];
+	unsigned long *l1 = bm->l1;
+	unsigned long *l2 = bm->l2;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	unsigned long *memcg_l1 = NULL, *memcg_l2 = NULL;
+	unsigned long start_pfn, end_pfn;
+	unsigned int start_l2, end_l2, l2_word, l2_word_end;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!l1 || !marie_state)
+		return;
+	/* Walk window: this node's PFN span, clamped to marie_state_size. */
+	start_pfn = pgdat->node_start_pfn;
+	end_pfn = pgdat_end_pfn(pgdat);
+	if (end_pfn > marie_state_size)
+		end_pfn = marie_state_size;
+	if (start_pfn >= end_pfn)
+		return;
+
+	/*
+	 * Only a non-root memcg narrows the walk with its own L1/L2 words.
+	 * NOTE(review): if the getters can return NULL for a non-root
+	 * memcg, the loops below run unmasked exactly like the root case
+	 * -- confirm that is intended.
+	 */
+	if (memcg && !mem_cgroup_is_root(memcg)) {
+		memcg_l1 = marie_memcg_bitmap_get(memcg);
+		memcg_l2 = marie_memcg_bitmap_get_l2(memcg);
+	}
+
+	/* Half-open L2 bit range [start_l2, end_l2) covering the window. */
+	start_l2 = marie_pfn_to_l2_bit(start_pfn);
+	end_l2 = marie_pfn_to_l2_bit(end_pfn - 1) + 1;
+	if (end_l2 > MARIE_L2_BITS)
+		end_l2 = MARIE_L2_BITS;
+	l2_word = start_l2 / BITS_PER_LONG;
+	l2_word_end = DIV_ROUND_UP(end_l2, BITS_PER_LONG);
+
+	for (; l2_word < l2_word_end; l2_word++) {
+		unsigned long l2w = l2[l2_word];
+
+		if (memcg_l2)
+			l2w &= memcg_l2[l2_word];
+		/* First word: drop L2 bits below start_l2. */
+		if (l2_word == start_l2 / BITS_PER_LONG &&
+		    (start_l2 % BITS_PER_LONG))
+			l2w &= ~((1UL << (start_l2 % BITS_PER_LONG)) - 1);
+		/* Last word: drop L2 bits at or above end_l2. */
+		if (l2_word + 1 == l2_word_end &&
+		    (end_l2 % BITS_PER_LONG))
+			l2w &= (1UL << (end_l2 % BITS_PER_LONG)) - 1;
+
+		while (l2w) {
+			unsigned int bit = l2_word * BITS_PER_LONG + __ffs(l2w);
+			unsigned long lo, hi;
+			unsigned long word_i, end_word;
+
+			/* Clear the lowest set bit of the local copy. */
+			l2w &= l2w - 1;
+
+			/* PFN span of this L2 cell, clipped to the walk window. */
+			lo = marie_l2_bit_pfn_start(bit);
+			hi = marie_l2_bit_pfn_end(bit);
+			if (lo < start_pfn)
+				lo = start_pfn;
+			if (hi > end_pfn)
+				hi = end_pfn;
+
+			word_i = lo / BITS_PER_LONG;
+			end_word = BITS_TO_LONGS(hi);
+
+			for (; word_i < end_word; word_i++) {
+				unsigned long w = l1[word_i];
+
+				if (memcg_l1)
+					w &= memcg_l1[word_i];
+
+				/* Same edge masking as the L2 level, for [lo, hi). */
+				if (word_i == lo / BITS_PER_LONG &&
+				    (lo % BITS_PER_LONG))
+					w &= ~((1UL << (lo % BITS_PER_LONG)) - 1);
+				if (word_i + 1 == end_word &&
+				    (hi % BITS_PER_LONG))
+					w &= (1UL << (hi % BITS_PER_LONG)) - 1;
+
+				while (w) {
+					unsigned int b = __ffs(w);
+					unsigned long pfn = word_i * BITS_PER_LONG + b;
+
+					w &= w - 1;
+					marie_drain_pfn_locked(mlv, lruvec,
+							       pfn, type,
+							       to_free);
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Drain every (type, gen, tier) tracking bitmap for @lruvec through
+ * marie_drain_bitmap_walk_one, threading @to_free down to the per-PFN
+ * drain. Afterwards, unless the mlv is marked offline, call
+ * lru_gen_fill_lruvec() when the raw MGLRU key is enabled.
+ *
+ * Caller holds lruvec->lru_lock with IRQs off (asserted below). Returns
+ * without doing anything when marie_lookup_lruvec() finds no mlv.
+ */
+static void marie_drain_one_lruvec(struct lruvec *lruvec,
+				   struct list_head *to_free)
+{
+	struct marie_lruvec *mlv = marie_lookup_lruvec(lruvec);
+	int t, g, tier;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!mlv)
+		return;
+
+	scoped_guard(marie_both_mlv, mlv) {
+		/*
+		 * No pending-queue drain needed: install / evict / tier
+		 * saturate are all synchronous in the per-PFN paradigm
+		 * (marie_state_inc_tier handles saturation in-place via
+		 * marie_state_move_to_gen), so no out-of-band state
+		 * remains after the gate flip.
+		 */
+
+		/*
+		 * Iterate the global L1 bitmap per (type, gen, tier) AND'd
+		 * with the memcg L1 (when targeted), restricted to this
+		 * pgdat's PFN range. Each set bit is drained via
+		 * marie_drain_pfn_locked: state byte / bitmap / counters
+		 * are wiped and the folio is handed back to legacy
+		 * lruvec->lists[lru] under the caller's lru_lock.
+		 */
+		for (t = 0; t < ANON_AND_FILE; t++)
+			for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+				for (tier = 0; tier < MARIE_PFN_NR_TIERS; tier++)
+					marie_drain_bitmap_walk_one(mlv, lruvec,
+								    t, g, tier,
+								    to_free);
+	}
+
+	/*
+	 * Hand off legacy residue to MGLRU if MGLRU is the fallback.
+	 *
+	 * Marie's gate is off here (the caller flipped it before
+	 * invoking drain). The folios just deposited on
+	 * lruvec->lists[lru] need to migrate onto lrugen so MGLRU's
+	 * state_is_valid invariant ("lrugen enabled => legacy lists
+	 * empty") holds, and so MGLRU's own reclaim path can see and
+	 * evict them while Marie is dormant.
+	 *
+	 * lru_gen_fill_lruvec calls fill_evictable internally, which
+	 * uses lruvec_del_folio (skips Marie because the gate is off,
+	 * falls through to the plain legacy list_del) + lru_gen_add_folio
+	 * to land each folio on lrugen with correct accounting.
+	 *
+	 * Invoked OUTSIDE the marie_both_mlv scope above: the helper
+	 * does not need (and must not be entangled with) Marie's
+	 * per-type locks once the drain proper is complete.
+	 *
+	 * Once mlv->offline is set the call is skipped outright by the
+	 * check below. The original rationale: offline was set at
+	 * css_offline time and prevented all new installs, so at css_free
+	 * time lruvec->lists[lru] is empty and there is nothing to hand
+	 * over (and no reason to touch rstat_cpu).
+	 *
+	 * Use lru_gen_core_enabled(), not lru_gen_enabled(): this is the
+	 * ownership handoff and must see the raw MGLRU key. lru_gen_enabled()
+	 * is masked to false whenever Marie is enabled, but here Marie's gate
+	 * is already off (the caller flipped it) so the two agree -- we use
+	 * the raw key for intent and symmetry with marie_fill_one_lruvec.
+	 */
+	if (!READ_ONCE(mlv->offline) && lru_gen_core_enabled())
+		lru_gen_fill_lruvec(lruvec);
+}
+
+
+/*
+ * Enable-side counterpart of marie_drain_one_lruvec: make sure @lruvec
+ * has an mlv (atomic allocation; silently gives up on failure), drain
+ * MGLRU's lrugen lists if the raw MGLRU key is on, then move every
+ * folio left on the evictable legacy lists into Marie.
+ *
+ * Caller holds lruvec->lru_lock with IRQs off (asserted below).
+ */
+static void marie_fill_one_lruvec(struct lruvec *lruvec)
+{
+	struct marie_lruvec *mlv;
+	enum lru_list lru;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/* GFP_ATOMIC: we are under a spinlock with IRQs disabled. */
+	mlv = marie_force_alloc_lruvec(lruvec, GFP_ATOMIC);
+	if (!mlv)
+		return;
+
+	/*
+	 * If MGLRU was the fallback while Marie was off, its lrugen
+	 * lists hold every folio added during that window. Drain them
+	 * via MGLRU's canonical lru_gen_del_folio + lruvec_add_folio
+	 * sequence: with Marie's gate already on at this point in
+	 * marie_change_state, lruvec_add_folio routes each folio
+	 * straight into lru_marie_add_folio, which sets per-PFN state
+	 * via marie_folio_install. No MGLRU internals are
+	 * touched on the Marie side; accounting drift is impossible
+	 * because the drain uses MGLRU's own helpers throughout.
+	 *
+	 * Use lru_gen_core_enabled(), not lru_gen_enabled(): Marie's gate is
+	 * already ON at this point, so lru_gen_enabled() is masked to false
+	 * and would skip this drain, stranding MGLRU-tagged folios on lrugen
+	 * (gen bits intact) -- exactly the residue this whole pass exists to
+	 * clear. The handoff must see the raw MGLRU key.
+	 */
+	if (lru_gen_core_enabled())
+		lru_gen_drain_lruvec(lruvec);
+
+	scoped_guard(marie_both_mlv, mlv) {
+		/*
+		 * Walk legacy lruvec->lists[lru]. By the time we reach
+		 * here, MGLRU lrugen contents have already been drained
+		 * by the lru_gen_drain_lruvec call just above (with Marie's
+		 * gate on, those folios route through lru_marie_add_folio
+		 * directly into Marie's normal install path, so the
+		 * per-PFN state is set correctly without any work here).
+		 *
+		 * What remains on the legacy lists is the residue from a
+		 * previous marie_drain_one_lruvec under
+		 * MGLRU=n -- folios Marie itself evicted back to legacy
+		 * lists during a prior disable transition. Import each
+		 * via lruvec_del_folio + marie_folio_install.
+		 */
+		for (lru = 0; lru < NR_LRU_LISTS; lru++) {
+			struct folio *f, *next_f;
+
+			/* The unevictable list is left on the legacy side. */
+			if (lru == LRU_UNEVICTABLE)
+				continue;
+
+			list_for_each_entry_safe(f, next_f,
+						 &lruvec->lists[lru], lru) {
+				/*
+				 * Transfer ownership from the legacy LRU to
+				 * Marie via the canonical del/add pair.
+				 *
+				 * lruvec_del_folio: PG_active is still set
+				 * at this point so folio_lru_list() returns
+				 * the correct old bucket (ACTIVE or INACTIVE)
+				 * for the mz and vmstat debits; the matching
+				 * credit lands in INACTIVE after install
+				 * because marie_folio_install clears
+				 * PG_active before computing inst_lru.
+				 *
+				 * marie_folio_install: clears PG_active (via
+				 * set_mask_bits), sets PG_lru, writes the
+				 * per-PFN state byte and bitmaps, and credits
+				 * all Marie counters -- exactly the same path
+				 * the fault-install uses. The per-type lock
+				 * is already held by marie_both_mlv at this
+				 * scope; routing through lru_marie_add_folio
+				 * would deadlock by trying to re-acquire it.
+				 *
+				 * NOTE(review): marie_folio_install's return
+				 * value is ignored here.
+				 */
+				lruvec_del_folio(lruvec, f);
+				marie_folio_install(f, mlv);
+			}
+		}
+
+	}
+}
+
+/*
+ * Per-lruvec body of marie_change_state. Acquires lru_lock with IRQs
+ * off (drain/fill require it), invokes the appropriate transition,
+ * releases, then runs marie_drop_lruvec outside lru_lock (it frees the
+ * mlv after dropping the lock).
+ *
+ * marie_drop_lruvec is idempotent (lv->marie_mlv reads NULL on a second
+ * visit), so this helper is safe to call on lruvecs that never carried
+ * an mlv.
+ */
+static void marie_change_state_lruvec(struct lruvec *lruvec, bool enable)
+{
+	MARIE_DRAIN_DEFER(to_free);
+
+	/*
+	 * spin_lock_irq disables preemption and IRQs; the earlier
+	 * migrate_disable() was only needed to give a now-removed
+	 * synchronize_rcu() a stable CPU context and is no longer
+	 * required (marie_drop_lruvec carries the same note).
+	 */
+	scoped_guard(spinlock_irq, &lruvec->lru_lock) {
+		if (enable)
+			marie_fill_one_lruvec(lruvec);
+		else
+			marie_drain_one_lruvec(lruvec, &to_free);
+	}
+	/* to_free is auto-flushed at function return (no-op on enable). */
+
+	if (!enable)
+		marie_drop_lruvec(lruvec);
+
+	/* One lruvec per call: yield between them on long walks. */
+	cond_resched();
+}
+
+/*
+ * Flip Marie on or off at runtime and migrate every lruvec to match.
+ *
+ * Returns 0 on success or when the requested state is already current,
+ * and -EOPNOTSUPP when asked to enable while marie_pfn_unsupported is
+ * set. Takes cgroup_lock, cpus_read_lock, the memory-hotplug read side
+ * and a private mutex, in that order, and releases them in reverse.
+ */
+static int marie_change_state(bool enable)
+{
+	static DEFINE_MUTEX(state_mutex);
+	struct mem_cgroup *memcg;
+	int ret = 0;
+
+	/*
+	 * Refuse enable on boxes whose PFN space does not fit in 32 bits;
+	 * Marie's gen lists store folios as packed 32-bit PFN indices and
+	 * would corrupt them silently. Disable is always allowed.
+	 */
+	if (enable && unlikely(marie_pfn_unsupported))
+		return -EOPNOTSUPP;
+
+	cgroup_lock();
+	cpus_read_lock();
+	get_online_mems();
+	mutex_lock(&state_mutex);
+
+	/* Already in the requested state: nothing to migrate. */
+	if (enable == static_branch_likely(&lru_marie_enabled_key))
+		goto unlock;
+
+	/*
+	 * Gate is flipped BEFORE the per-lruvec walk in both directions.
+	 *
+	 * Disable: gate flips off first so no new TRACKED bits get set
+	 * while we drain. lru_marie_add_folio sees gate=off and bails;
+	 * drain_one_lruvec catches in-flight adds via lru_lock
+	 * serialisation. Concurrent lru_marie_del_folio relies on the
+	 * TRACKED-first check (not the gate) to route TRACKED folios
+	 * through Marie on not-yet-drained lruvecs.
+	 *
+	 * Enable: gate flips on first so concurrent lru_marie_add_folio
+	 * routes new faults into Marie's synchronous install path
+	 * while we walk existing legacy LRU lists to install pre-fault
+	 * folios. If the gate flipped after fill (the historical order),
+	 * dels of already-filled folios (TRACKED=1) would race against
+	 * gate=off and the dispatcher would underflow mz->lru_zone_size
+	 * via a legacy update_lru_size that Marie already accounted for.
+	 */
+	if (enable)
+		static_branch_enable_cpuslocked(&lru_marie_enabled_key);
+	else
+		static_branch_disable_cpuslocked(&lru_marie_enabled_key);
+
+	/*
+	 * Disable path iterates non-root memcgs first, then root. The
+	 * global tracking bitmaps (marie_track_bm[type][gen][tier]) are
+	 * the union of every memcg's residency. The per-memcg L1/L2 mask
+	 * applied inside marie_drain_bitmap_walk_one narrows the walk to
+	 * the current memcg's PFNs -- but only for non-root memcgs.
+	 * root_mem_cgroup has NO per-memcg bitmap (marie_memcg_bitmap_set
+	 * short-circuits on root), so a root-lruvec drain runs with
+	 * memcg_l1 = memcg_l2 = NULL and walks every set bit in the
+	 * global bitmap.
+	 *
+	 * If root were drained first (the natural mem_cgroup_iter order),
+	 * the no-mask walk would scoop every tracked PFN -- including
+	 * those owned by non-root memcgs -- onto root's lruvec lists. Each
+	 * such folio's +nr lives in its true owner's mz->lru_zone_size
+	 * (credited at install), but it now sits on root's lruvec. The
+	 * subsequent lru_gen_fill_lruvec on root's lruvec would then issue
+	 * lruvec_del_folio against root's lruvec, decrementing root's
+	 * mz->lru_zone_size for a page the owner memcg was charged for --
+	 * underflowing root while leaving the owner over-counted, and
+	 * MGLRU's reclaim under memory pressure livelocks on the corrupted
+	 * counters.
+	 *
+	 * Draining non-root memcgs first incrementally clears their bits
+	 * from the global bitmap, so by the time root's no-mask pass runs
+	 * only PFNs that genuinely belong to root remain set. No special
+	 * per-PFN folio_memcg() check is needed -- the bitmap arithmetic
+	 * makes the filter implicit.
+	 *
+	 * Enable path is symmetric in iteration but does NOT depend on
+	 * the order: marie_fill_one_lruvec walks @lruvec's legacy / lrugen
+	 * lists, not the global bitmap, so cross-memcg confusion is
+	 * impossible. Kept as a single normal-order pass for simplicity.
+	 */
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		/*
+		 * In a do-while, continue jumps to the loop condition, so
+		 * the iterator still advances past the skipped root.
+		 */
+		if (!enable && mem_cgroup_is_root(memcg))
+			continue;
+
+		for_each_node(nid)
+			marie_change_state_lruvec(mem_cgroup_lruvec(memcg,
+						  NODE_DATA(nid)),
+						  enable);
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	/* Deferred root pass of the disable path (see the comment above). */
+	if (!enable) {
+		int nid;
+
+		for_each_node(nid)
+			marie_change_state_lruvec(mem_cgroup_lruvec(NULL,
+						  NODE_DATA(nid)),
+						  false);
+	}
+
+	pr_info("%s\n", enable ? "enabled" : "disabled");
+
+unlock:
+	mutex_unlock(&state_mutex);
+	put_online_mems();
+	cpus_read_unlock();
+	cgroup_unlock();
+	return ret;
+}
+
+/* boot param: lru_marie=0 / lru_marie=1. At boot the cgroup tree is
+ * not yet populated and no lruvec carries folios, so a plain static-key
+ * toggle is sufficient — marie_change_state's mem_cgroup_iter would
+ * have nothing to migrate anyway.
*/ +static int __init marie_setup(char *str) +{ + int v; + + if (!str || kstrtoint(str, 0, &v)) + return 0; + if (v) + static_branch_enable(&lru_marie_enabled_key); + else + static_branch_disable(&lru_marie_enabled_key); + return 1; +} +__setup("lru_marie=", marie_setup); + + +unsigned int lru_marie_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + WARN_ON_ONCE(!sc); + + /* + * Per-PFN bitmap scan is the sole reclaim driver in Marie. The + * returned MARIE_DRAIN_* mask tells shrink_lruvec which orphan type(s) + * its legacy drain may reclaim (exactly the type(s) Marie scanned). + */ + return marie_state_shrink_lruvec(lruvec, sc); +} + +/* + * --------------------------------------------------------------------- + * /sys/kernel/mm/lru_marie/ + * --------------------------------------------------------------------- + */ + +static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", + static_branch_likely(&lru_marie_enabled_key) ? 1 : 0); +} + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + bool v; + int err = kstrtobool(buf, &v); + + if (err) + return err; + + err = marie_change_state(v); + if (err) + return err; + return count; +} + +static struct kobj_attribute marie_enabled_attr = __ATTR_RW(enabled); + +/* + * /sys/kernel/mm/lru_marie/version + * + * Read-only. Exposes MARIE_VERSION so userspace tooling (benchmark + * scripts, sysadmins, support pastes) can identify which Marie build + * is running without parsing dmesg. 
+ */ +static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", MARIE_VERSION); +} + +static struct kobj_attribute marie_version_attr = __ATTR_RO(version); + +static ssize_t stats_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, + "nr_lruvecs %ld\n" + "nr_folios %lld\n", + atomic_long_read(&marie_nr_lruvecs), + percpu_counter_sum(&marie_nr_folios)); +} + +static struct kobj_attribute marie_stats_attr = __ATTR_RO(stats); + +/* + * clean_min_ratio sysfs knob. + * Range 0..100 (percentage of node_present_pages). + */ +static ssize_t clean_min_ratio_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", READ_ONCE(marie_clean_min_ratio)); +} + +static ssize_t clean_min_ratio_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int v; + int err = kstrtouint(buf, 10, &v); + + if (err) + return err; + if (v > 100) + return -EINVAL; + WRITE_ONCE(marie_clean_min_ratio, v); + return count; +} + +static struct kobj_attribute marie_clean_min_ratio_attr = + __ATTR_RW(clean_min_ratio); + +#ifdef CONFIG_SWAP +/* + * kcompmari sysfs knob: signed -100..+100, default +24. + * + * 0 — disabled. kcompmari_store short-circuits to false + * and swap_writeout falls straight through to inline + * zswap_store / __swap_writepage. + * +1..+100 — Marie-gated. Queue length = |v|. The kfifo backing + * storage is sized at KCOMPMARI_FIFO_SIZE (the max); + * |v| is the soft depth at which the producer treats + * the queue as full and falls back to sync writeout. + * Tracks lru_marie_enabled() so disabling Marie at + * runtime also quiesces kcompmari without a second + * sysfs write. + * -1..-100 — force mode. Queue length = |v|. Runs even when + * Marie is off, for users who want the async-compress + * helper independently of the Marie reclaim path. 
+ * + * Default +24 mirrors the queue length kcompressd-unofficial proved + * sound under sustained anon pressure. Use -24 to force kcompmari on + * even with Marie off; use 0 to disable entirely. + * + * Encoded as two static branches (kcompmari_enabled_key and + * kcompmari_force_key declared in ) so the hot path + * costs a single predicted jump in the common (enabled, Marie-gated) case. + */ +DEFINE_STATIC_KEY_TRUE(kcompmari_enabled_key); +EXPORT_SYMBOL_GPL(kcompmari_enabled_key); +DEFINE_STATIC_KEY_FALSE(kcompmari_force_key); +EXPORT_SYMBOL_GPL(kcompmari_force_key); + +int vm_kcompmari = 24; +EXPORT_SYMBOL_GPL(vm_kcompmari); + +static ssize_t kcompmari_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(vm_kcompmari)); +} + +static ssize_t kcompmari_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int v; + int err = kstrtoint(buf, 10, &v); + + if (err) + return err; + if (v < -100 || v > 100) + return -EINVAL; + WRITE_ONCE(vm_kcompmari, v); + + if (v != 0) + static_branch_enable(&kcompmari_enabled_key); + else + static_branch_disable(&kcompmari_enabled_key); + if (v < 0) + static_branch_enable(&kcompmari_force_key); + else + static_branch_disable(&kcompmari_force_key); + + return count; +} + +static struct kobj_attribute marie_kcompmari_attr = __ATTR_RW(kcompmari); +#endif /* CONFIG_SWAP */ + +#ifdef CONFIG_X86 +/* + * SIMD walker kill-switch: /sys/kernel/mm/lru_marie/simd + * + * Default 1: walker uses the boot-detected SIMD wrapper (AVX-512F / + * AVX2 / SSE2). Writing 0 flips marie_simd_enabled_key so the walker + * falls through to a scalar pte_young loop in mm/lru_marie/simd_x86.c. + */ +static ssize_t simd_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", + static_branch_likely(&marie_simd_enabled_key) ? 
1 : 0); +} + +static ssize_t simd_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + bool v; + int err = kstrtobool(buf, &v); + + if (err) + return err; + if (v) + static_branch_enable(&marie_simd_enabled_key); + else + static_branch_disable(&marie_simd_enabled_key); + return count; +} + +static struct kobj_attribute marie_simd_attr = __ATTR_RW(simd); +#endif /* CONFIG_X86 */ + +/* + * --------------------------------------------------------------------- + * Reclaim / walker tunables (runtime-adjustable via sysfs) + * --------------------------------------------------------------------- + * + * Each variable is read with READ_ONCE on its hot path. Reclaim-loop + * snapshots take the value at the top of each pass, so concurrent + * sysfs writes take effect on the next pass without locking. + */ + +/* + * marie_clean_min_ratio — file-pagecache floor as a percentage of + * node_present_pages. marie_state_shrink_lruvec diverts file reclaim + * to anon when the node's NR_*_FILE total drops below this fraction, + * preserving a working set of clean cache for codepaths that depend + * on it (executable text, mapped data files, etc.) instead of + * letting unbounded anon pressure flush it. 0 disables the floor + * (legacy behaviour); 100 caps every file fault as protected. + * Range 0..100; default 10. + */ +unsigned int marie_clean_min_ratio = 10; + +/* + * marie_gen_growth_threshold — pages installed onto the head gen + * before marie_install_advance_hook triggers marie_try_advance_head. + * Default 8192 pages (= MARIE_ISOLATE_BATCH << 8, i.e. 32 MiB). + * Lower values produce finer gen granularity (more aging churn but + * tighter hot/cold separation); higher values coarsen the ring (less + * churn, broader gens). + * + * marie_install_advance_hook combines this static floor with a + * dynamic total_occupied / 8 leg so heavy workloads scale the + * trigger automatically. 
+ */ +unsigned long marie_gen_growth_threshold = (unsigned long)SWAP_CLUSTER_MAX << 8; + +/* + * marie_walker_interval_* — adaptive walker pass deadline per pgdat, + * stored in jiffies. marie_walker_interval() picks one based on the + * zone's free-page state relative to its watermarks: + * + * free < min -> critical + * free < low -> low + * free < high -> normal + * free >= high -> idle + * + * Defaults mirror the original literal cadence (HZ/30, HZ/10, HZ/4, + * HZ — ~33 ms, 100 ms, 250 ms, 1 s on HZ=1000). Hot writers see the + * value via READ_ONCE inside marie_walker_interval(); the sysfs + * helpers convert to and from ms for user friendliness. + */ +unsigned long marie_walker_interval_critical = HZ / 30; +unsigned long marie_walker_interval_low = HZ / 10; +unsigned long marie_walker_interval_normal = HZ / 4; +unsigned long marie_walker_interval_idle = HZ; + +static ssize_t gen_growth_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%lu\n", + READ_ONCE(marie_gen_growth_threshold)); +} + +static ssize_t gen_growth_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long v; + int err = kstrtoul(buf, 10, &v); + + if (err) + return err; + /* Must hold at least one MARIE_ISOLATE_BATCH-sized pop. */ + if (v < SWAP_CLUSTER_MAX || v > (1UL << 28)) + return -EINVAL; + WRITE_ONCE(marie_gen_growth_threshold, v); + return count; +} + +static struct kobj_attribute marie_gen_growth_threshold_attr = + __ATTR_RW(gen_growth_threshold); + +/* + * Walker-interval knob factory: every stage uses the same show/store + * shape (ms in, jiffies stored, clamped to >= 1 jiffy). Range is + * 1..60000 ms — anything shorter than a jiffy is meaningless on + * commodity HZ, anything longer than a minute defeats the adaptive + * gating. 
+ */ +#define MARIE_WALKER_INTERVAL_KNOB(name, var) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sysfs_emit(buf, "%u\n", \ + jiffies_to_msecs(READ_ONCE(var))); \ +} \ +static ssize_t name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + unsigned int ms; \ + unsigned long j; \ + int err = kstrtouint(buf, 10, &ms); \ + \ + if (err) \ + return err; \ + if (ms < 1 || ms > 60000) \ + return -EINVAL; \ + j = msecs_to_jiffies(ms); \ + if (j < 1) \ + j = 1; \ + WRITE_ONCE(var, j); \ + return count; \ +} \ +static struct kobj_attribute marie_##name##_attr = __ATTR_RW(name) + +MARIE_WALKER_INTERVAL_KNOB(walker_interval_critical_ms, + marie_walker_interval_critical); +MARIE_WALKER_INTERVAL_KNOB(walker_interval_low_ms, + marie_walker_interval_low); +MARIE_WALKER_INTERVAL_KNOB(walker_interval_normal_ms, + marie_walker_interval_normal); +MARIE_WALKER_INTERVAL_KNOB(walker_interval_idle_ms, + marie_walker_interval_idle); + +static struct attribute *marie_attrs[] = { + &marie_enabled_attr.attr, + &marie_version_attr.attr, + &marie_stats_attr.attr, + &marie_clean_min_ratio_attr.attr, +#ifdef CONFIG_SWAP + &marie_kcompmari_attr.attr, +#endif +#ifdef CONFIG_X86 + &marie_simd_attr.attr, +#endif + &marie_gen_growth_threshold_attr.attr, + &marie_walker_interval_critical_ms_attr.attr, + &marie_walker_interval_low_ms_attr.attr, + &marie_walker_interval_normal_ms_attr.attr, + &marie_walker_interval_idle_ms_attr.attr, + NULL, +}; + +static const struct attribute_group marie_attr_group = { + .attrs = marie_attrs, +}; + +static int __init marie_init(void) +{ + struct kobject *marie_kobj; + int err; + + printk(KERN_INFO "%s %s by %s\n", + MARIE_PROGNAME, MARIE_VERSION, MARIE_AUTHOR); + + marie_prefetch_params_init(); + + /* + * Latch the 32-bit PFN gate. 
max_pfn is established by setup_arch / + * memblock init well before subsys_initcall, so this single read is + * authoritative for the lifetime of the system. If the box overflows + * the 32-bit PFN window we disable Marie up front, regardless of + * lru_marie= boot param or the static-key default, and refuse later + * sysfs enables in marie_change_state. + */ + if (max_pfn > MARIE_MAX_SUPPORTED_PFN) { + marie_pfn_unsupported = true; + if (static_branch_likely(&lru_marie_enabled_key)) + static_branch_disable(&lru_marie_enabled_key); + pr_warn("disabled: max_pfn %lu exceeds 32-bit limit (%lu); Marie requires physical address space <= 16 TiB\n", + max_pfn, MARIE_MAX_SUPPORTED_PFN); + } else { + /* + * Allocate the per-PFN state array now that the gate has + * been verified. If this fails we cannot run, so disable + * Marie and continue boot with the in-tree LRU paths. + */ + err = marie_state_init(); + if (err) { + marie_pfn_unsupported = true; + if (static_branch_likely(&lru_marie_enabled_key)) + static_branch_disable(&lru_marie_enabled_key); + pr_warn("disabled: marie_state_init failed (%d)\n", + err); + } + } + + /* + * Initialise the global marie_nr_folios percpu_counter. (Earlier + * revisions also set up slab caches and per-CPU pools here; the + * per-PFN paradigm has none of that to allocate.) + */ + err = marie_counters_init(); + if (err < 0) + return err; + + marie_walker_init(); + + marie_kobj = kobject_create_and_add("lru_marie", mm_kobj); + if (!marie_kobj) { + pr_err("failed to create /sys/kernel/mm/lru_marie\n"); + return -ENOMEM; + } + + err = sysfs_create_group(marie_kobj, &marie_attr_group); + if (err) { + pr_err("failed to create /sys/kernel/mm/lru_marie attributes: %d\n", err); + kobject_put(marie_kobj); + return err; + } + + pr_info("currently %s\n", + static_branch_likely(&lru_marie_enabled_key) ? 
"enabled" : "disabled"); + return 0; +} +subsys_initcall(marie_init); diff --git a/mm/lru_marie/drain_scope.h b/mm/lru_marie/drain_scope.h new file mode 100644 index 0000000000..918d2ce914 --- /dev/null +++ b/mm/lru_marie/drain_scope.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_DRAIN_SCOPE_H +#define _MM_LRU_MARIE_DRAIN_SCOPE_H + +#include +#include +#include +#include + +/* + * Drain-path deferred-free scaffold. + * + * Background: marie_drain_pfn_locked drops the transient pin on every + * still-tracked folio while holding lv->lru_lock with IRQs off. If a + * pin is the last reference (the css_offline drain races exit_mmap), + * a plain folio_put inside the lock would recurse into + * __page_cache_release -> folio_lruvec_lock_irqsave on the same + * lru_lock and self-deadlock. e2eafb4c0 fixed this by collecting + * last-ref folios onto a deferred-free list and freeing them AFTER + * lru_lock is dropped. + * + * Every drain caller now needs the same three-step scaffold: + * + * LIST_HEAD(to_free); + * spin_lock_irq(&lv->lru_lock); + * marie_drain_one_lruvec(lv, &to_free); + * ...maybe other in-lock work... + * spin_unlock_irq(&lv->lru_lock); + * marie_drain_release(&to_free); + * + * Three sites repeat it (marie_drop_lruvec, marie_offline_lruvec, + * marie_change_state_lruvec). A fourth site that forgets the release + * leaks every last-ref folio it collected; a fifth that forgets the + * to_free list outright would deadlock again. + * + * Make the release impossible to forget: MARIE_DRAIN_DEFER declares a + * list_head with the GCC __cleanup attribute, which runs + * marie_drain_release at scope exit unconditionally. The lock half is + * the existing scoped_guard(spinlock_irq, ...). Caller code becomes: + * + * MARIE_DRAIN_DEFER(to_free); + * scoped_guard(spinlock_irq, &lv->lru_lock) { + * marie_drain_one_lruvec(lv, &to_free); + * ... + * } + * // lock released; to_free auto-flushed when its scope ends. 
+ * + * Forgetting MARIE_DRAIN_DEFER is a compile error: marie_drain_one_lruvec + * still requires a struct list_head * argument, and there is nothing to + * pass. + */ + +/* + * Release folios the drain found at refcount 0 -- the transient pin in + * marie_drain_pfn_locked was the last reference. Run after lru_lock has + * been dropped so __folio_put (mem_cgroup_uncharge, + * folio_unqueue_deferred_split's split_queue_lock, the buddy free) + * never runs under lru_lock. Mirrors release_pages()'s deferred free. + */ +static inline void marie_drain_release(struct list_head *to_free) +{ + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, to_free, lru) { + list_del(&folio->lru); + __folio_put(folio); + } +} + +static inline void __marie_drain_release_cleanup(struct list_head *l) +{ + marie_drain_release(l); +} + +/* + * Declare a deferred-free list in the current scope. The list is + * initialised empty; at scope exit, marie_drain_release runs on it + * unconditionally (empty -> no-op). Threading the list into + * marie_drain_one_lruvec / marie_drain_bitmap_walk_one is the caller's + * responsibility -- those helpers' lockdep_assert_held catches the + * "lock not actually held" mistake. + */ +#define MARIE_DRAIN_DEFER(name) \ + struct list_head name __cleanup(__marie_drain_release_cleanup) = \ + LIST_HEAD_INIT(name) + +#endif /* _MM_LRU_MARIE_DRAIN_SCOPE_H */ diff --git a/mm/lru_marie/pfn_install.h b/mm/lru_marie/pfn_install.h new file mode 100644 index 0000000000..77a4b09f68 --- /dev/null +++ b/mm/lru_marie/pfn_install.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_PFN_INSTALL_H +#define _MM_LRU_MARIE_PFN_INSTALL_H + +#include +#include +#include +#include + +#include "bitmap.h" +#include "state.h" + +/* + * Marie's "publish a PFN as TRACKED" primitive, factored out of the + * install/split paths. 
+ * + * What it writes (the single source of truth for "Marie owns this PFN"): + * - marie_state[pfn]: TRACKED | (gen) | (tier) | (type) | (zone) + * - marie_track_bm[type][gen][tier]: scan bit for this PFN + * - per-memcg L1 bitmap (folio_memcg(f)) + * - atomic_long_inc(&marie_gen_occupied[gen][type]) + * + * What it deliberately does NOT touch: + * - folio->flags (PG_active / PG_lru) -- the install path flips these + * in one atomic mask write after publish; the split path's caller + * sets PG_lru later. + * - folio->lru list pointers -- INIT_LIST_HEAD vs list_add_tail differs + * between install and split. + * - mlv counters and vmstat lru_size -- accounted by the caller (or by + * marie_folio_install for the fresh-install path). + * - marie_gen_installs -- this is the install "throttle" counter that + * drives gen advance; split intentionally does NOT bump it because + * the split tail inherits its parent's install budget (the parent + * was already counted at fault-install). + * + * Caller context: lru_lock held with IRQs off. The publish is a plain + * non-atomic byte write because lru_lock serialises every install on the + * same PFN, and the "already TRACKED" early-out in marie_folio_install + * catches concurrent re-install attempts. + */ +static inline void marie_pfn_publish_inherit(struct folio *f, int type, + u8 gen, u8 tier, int zone) +{ + unsigned long pfn = folio_pfn(f); + + marie_state[pfn] = MARIE_PFN_TRACKED | + (gen << MARIE_PFN_GEN_SHIFT) | + (tier << MARIE_PFN_TIER_SHIFT) | + (type ? MARIE_PFN_TYPE_FILE : 0) | + marie_pfn_zone_bits(zone); + marie_bm_set(&marie_track_bm[type][gen][tier], pfn); + marie_memcg_bitmap_set(folio_memcg(f), pfn); + atomic_long_inc(&marie_gen_occupied[gen][type]); +} + +/* + * marie_folio_install - the unified fresh-install path. + * + * Single entry point that replaces the former marie_install_local / + * marie_install_locked pair. 
Both call sites (lru_marie_add_folio for THP + * via per-type lock + small folio direct, and marie_change_state_lruvec + * during gate-on fill) now route here. The per-type lock context that + * used to distinguish "locked" from "local" is the caller's concern, not + * this function's: the body only requires lru_lock + IRQs off and uses + * the same publish + flag flip + account sequence in both cases. + * + * Sequence: + * 1. TRACKED early-out (returns false). Defends against gate-flip race + * and reclaim-survivor re-install (TRACKED is preserved across + * isolate by design; marie_state_publish_at_gen handles the + * survivor putback separately, never this function). + * 2. Capture (PG_active, PG_workingset) -> 2-bit tier signal. + * 3. Clear PG_active early; the final flag write is still a single + * atomic set_mask_bits, but capturing was_active before the clear + * keeps the tier value coherent with the byte we publish below. + * 4. INIT_LIST_HEAD(&f->lru) -- a recycled folio arrives with + * LIST_POISON{1,2} that would later fault list_del_init. + * 5. Publish per-PFN state via marie_pfn_publish_inherit. + * 6. Bump marie_gen_installs and trigger the advance hook. Split path + * skips this bump (publish_inherit only). + * 7. set_mask_bits(PG_active->0, PG_lru->1) -- one atomic flag write. + * Ordered AFTER step 5 so a concurrent __page_cache_release + * observing PG_lru=1 also observes TRACKED=1. + * 8. Account (mlv->types[type].nr_pages, marie_nr_folios, vmstat + * lru_size, mlv->marie_lru_zone_size). Step 3 of the abstraction + * plan will factor these into marie_account_install. + * + * Returns true on success, false on TRACKED early-out. 
+ */ +bool marie_folio_install(struct folio *f, struct marie_lruvec *mlv); + +#endif /* _MM_LRU_MARIE_PFN_INSTALL_H */ diff --git a/mm/lru_marie/prefetch.h b/mm/lru_marie/prefetch.h new file mode 100644 index 0000000000..7f9a2eeb3d --- /dev/null +++ b/mm/lru_marie/prefetch.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_PREFETCH_H +#define _MM_LRU_MARIE_PREFETCH_H + +/* + * Two-stage software prefetch primitives used by Marie's per-PFN + * array scan. The bitmap-driven isolate loop issues + * + * marie_prefetch_l3(target_N_ahead); // pull from DRAM into L3 + * marie_prefetch_l1(target_K_ahead); // pull from L3 into L1 + * + * where N (~marie_l3_ahead) is sized to cover DRAM round-trip + * (~200 cycles) and K (~marie_l1_ahead) is sized to cover the L3-> + * L1 round-trip (~30 cycles). Splitting lets the AGU fire the long- + * haul prefetch as early as the bitmap walk can predict the next + * candidate PFN, without keeping the L1 occupied with all the + * pending lines at once. + * + * The kernel's generic prefetch() expands to PREFETCHNTA on x86 + * (L1 with bypass-LRU semantics), which is wrong for the L3-ahead + * leg — NTA evicts quickly from L1 and never settles in L3, so by + * the time the target should be in L3 it is gone. We therefore + * drop to the bare instructions: + * + * prefetcht0 -- T0 hint, fetched into all cache levels (L1+L2+L3) + * prefetcht2 -- T2 hint, fetched into L2/L3 but not L1 + * + * Non-x86 builds get no-op stubs; the scan still works, just + * without the prefetch acceleration (HW prefetcher alone). 
+ */ + +#ifdef CONFIG_X86 +static __always_inline void marie_prefetch_l1(const void *addr) +{ + asm volatile("prefetcht0 %0" :: "m" (*(const char *)addr)); +} + +static __always_inline void marie_prefetch_l3(const void *addr) +{ + asm volatile("prefetcht2 %0" :: "m" (*(const char *)addr)); +} +#else +static __always_inline void marie_prefetch_l1(const void *addr) { (void)addr; } +static __always_inline void marie_prefetch_l3(const void *addr) { (void)addr; } +#endif + +/* + * Ahead distances for the two-stage prefetch ring. Values are set at + * boot by marie_prefetch_params_init() based on CPUID and stored in + * the file-static variables in state.c. MARIE_L3_AHEAD_MAX is the + * compile-time upper bound used to size the on-stack ring[] array; + * the runtime value (marie_l3_ahead) may be smaller on MSHR-limited + * microarchitectures. + * + * prefetcht2 requests are tracked by L2/L3 MSHRs (independent of L1 + * LFBs); prefetcht0 requests are tracked by L1 LFBs. Tiers chosen by + * marie_prefetch_params_init(): + * + * AVX-512F (Zen 4/5, Sapphire Rapids): L2 MSHR ~32 → l3=32, l1=8 + * AMD Zen 3 (fam 0x19): L2 MSHR ~24 → l3=24, l1=8 + * AMD Zen 1/2 (fam 0x17): L2 MSHR ~20 → l3=20, l1=8 + * AMD Excavator (fam 0x15): L2 MSHR ~12 → l3=16, l1=6 + * Intel Skylake+ (CLFLUSHOPT): L2 MSHR ~24 → l3=24, l1=8 + * Intel Haswell/Broadwell: L2 MSHR ~16 → l3=16, l1=6 + * x86_64-v2 or below / non-x86: L2 MSHR ~8 → l3= 8, l1=2 + * + * marie_l3_mask = marie_l3_ahead - 1 (all values are powers of 2, + * enabling bitwise-AND modulo in the hot path). + * + * NOTE(review): the l3=24 and l3=20 rows in the table above are not + * powers of two, so `marie_l3_ahead - 1` is not a valid AND-modulo + * mask for those tiers. Either the table is stale or the mask + * derivation needs a power-of-two round-down -- confirm against + * marie_prefetch_params_init() in state.c. + * + * These can be promoted to sysfs tunables in a later commit if + * profiling shows different sweet spots per workload. + */ +#define MARIE_L3_AHEAD_MAX 32 /* on-stack ring[] sizing upper bound */ + +void marie_prefetch_params_init(void); + +/* + * Cache-line cursor look-ahead for marie_state[] (1 byte per PFN, 64 PFN + * per cache line).
Unlike the per-PFN struct page prefetch (where 1 PFN + * = 1 cache line already gives ring-depth look-ahead), state[] is dense + * — without an explicit cursor, the producer issues up to 64 prefetches + * for the same cache line and gains zero look-ahead in cache-line space. + * + * Sized for the sparse-bitmap fast-skip case. On OOO x86 (~5 cycles/PFN, + * DRAM ~200 cycles) we need ≥ 40 PFN on top of the runtime ring lag + * (up to 32); on MSHR-limited in-order x86 (~20 cy/PFN, ring lag 8) + * ~18 PFN suffices. 512 PFN (8 cache lines) covers all tiers and also + * absorbs bitmap-density jumps within an L2 range. + * + * L1 distance is the L3→L1 analogue: shorter latency target, smaller + * margin since L1d evicts aggressively. + */ +#define MARIE_STATE_L3_AHEAD_PFN 512 +#define MARIE_STATE_L1_AHEAD_PFN 64 + +/* + * Cache-line cursor look-ahead for the bitmap arrays (l1[], mbm[]) used + * by the isolate producer. The arrays are u64 (8 words per cache line, + * each word covering 64 PFN). The producer reads one word per "word_rem + * exhausted" event; in the sparse-bitmap worst case (1 bit per word) the + * word transition rate hits ~5-30 cycles per consumer iter, so the next + * cache line must be on the way well before the cursor crosses it. + * + * 16 words = 2 cache lines ahead gives margin for the sparse case while + * keeping the prefetch budget modest. Only L3 hint is needed — once a + * bitmap cache line lands in L3, the L3->L1 promote (~30-40 cycles) is + * easily hidden by the per-word consumer drain (64 PFN × 5+ cycles). + */ +#define MARIE_BM_L3_AHEAD_WORDS 16 + +#endif /* _MM_LRU_MARIE_PREFETCH_H */ diff --git a/mm/lru_marie/simd.h b/mm/lru_marie/simd.h new file mode 100644 index 0000000000..146fe5fd3d --- /dev/null +++ b/mm/lru_marie/simd.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_SIMD_H +#define _MM_LRU_MARIE_SIMD_H + +/* + * Marie SIMD-accelerated PTE scan. 
+ * + * Two call modes: + * + * Batched (walker): + * lru_marie_simd_batch_begin(); + * for_each(pmd in batch) + * lru_marie_simd_young_pte_mask_raw(pte_table, bitmap); + * lru_marie_simd_batch_end(); + * + * Single-shot (any non-batched caller): + * lru_marie_simd_young_pte_mask(pte_table, bitmap); + * + * On x86 batch_begin/end map to kernel_fpu_begin/end so the per-call + * FPU save-restore is amortised across the batch. On every other arch + * (including arm64) the generic scalar fallback is used and + * batch_begin/end are no-ops. + */ + +#include +#include +#include + +#ifdef CONFIG_X86 +/* + * Runtime kill-switch for the boot-detected SIMD walker, exposed via + * /sys/kernel/mm/lru_marie/simd. Default true: walker uses the widest + * SIMD kernel that arch_initcall could pick (AVX-512F > AVX2 > SSE2). + * Writing 0 to the sysfs file flips the static branch so the walker + * falls back to a pure scalar pte_young loop in the same translation + * unit. + * + * Other arches use the generic scalar fallback already, so the toggle + * does not need to exist there and the sysfs attribute is hidden. + */ +DECLARE_STATIC_KEY_TRUE(marie_simd_enabled_key); + +static inline bool marie_simd_enabled(void) +{ + return static_branch_likely(&marie_simd_enabled_key); +} +#else +static inline bool marie_simd_enabled(void) { return false; } +#endif + +/* + * Number of unsigned longs needed to hold the young-bit bitmap for one + * PMD's worth of PTEs. The PTE count is hard-coded to 512 (the value + * of PTRS_PER_PTE on x86_64) rather than derived from the arch's + * pgtable headers; it matches PTES_PER_PMD in simd_generic.c and + * simd_x86.c. + */ +#define MARIE_SIMD_PTE_BITMAP_LONGS ((512 + BITS_PER_LONG - 1) / BITS_PER_LONG) + +/** + * lru_marie_simd_batch_begin - open a SIMD batch (FPU bracket on x86). + * + * Holds preempt-disabled until the matching batch_end. The caller is + * responsible for keeping the bracketed region short -- batch a small + * fixed number of _raw scans then close the batch and let + * cond_resched() run before opening the next one.
+ * + * Calling _raw without an enclosing batch_begin is undefined on x86 + * (FPU registers will be corrupted relative to userspace state); on + * scalar arches it is harmless because batch_begin/end are no-ops. + */ +void lru_marie_simd_batch_begin(void); + +/** + * lru_marie_simd_batch_end - close a SIMD batch (kernel_fpu_end on x86). + */ +void lru_marie_simd_batch_end(void); + +/** + * lru_marie_simd_young_pte_mask_raw - scan one PMD without opening a bracket. + * @table: pointer to the first pte_t in the PMD's PTE array (512 entries) + * @bitmap: output, MARIE_SIMD_PTE_BITMAP_LONGS unsigned longs. + * + * Caller MUST hold an enclosing lru_marie_simd_batch_begin/end pair. + * Used in the per-PMD walker hot path to amortise the FPU save/restore + * cost across a batch of consecutive PMD scans. + */ +void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap); + +/** + * lru_marie_simd_young_pte_mask - single-shot scan (begin + raw + end). + * @table: pointer to the first pte_t in the PMD's PTE array (512 entries) + * @bitmap: output, MARIE_SIMD_PTE_BITMAP_LONGS unsigned longs. + * + * Self-contained convenience wrapper for any non-batched caller. The + * walker uses the batched path directly; this wrapper exists for + * completeness and any future single-shot use. + */ +void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap); + +#endif /* _MM_LRU_MARIE_SIMD_H */ diff --git a/mm/lru_marie/simd_generic.c b/mm/lru_marie/simd_generic.c new file mode 100644 index 0000000000..0045098bfa --- /dev/null +++ b/mm/lru_marie/simd_generic.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lru_marie/simd_generic.c -- fallback PTE young-bit scan for arches + * without a SIMD path. + * + * Uses the arch-provided pte_young() helper so we don't need to know + * the accessed-bit name on every architecture. + * + * batch_begin / batch_end are no-ops here because the scan is scalar + * (no FPU state to preserve). 
Every arch other than x86 currently + * lands on this file (including arm64, where a future NEON variant + * could be slotted in once its FPSIMD save/restore cost has been + * profiled against the per-pmd gain). + */ + +#include +#include /* pte_young */ +#include + +#include "simd.h" + +#define PTES_PER_PMD 512 + +void lru_marie_simd_batch_begin(void) { } +EXPORT_SYMBOL_GPL(lru_marie_simd_batch_begin); + +void lru_marie_simd_batch_end(void) { } +EXPORT_SYMBOL_GPL(lru_marie_simd_batch_end); + +/* + * Scalar young-bit scan of one PTE table: bit i of @bitmap is set iff + * pte_young(pte[i]). No FPU state is touched, so no batch bracket is + * needed on this path. + * + * Every output word is built in a local and stored whole, so the + * result does not depend on @bitmap having been cleared by the caller; + * the x86 SIMD kernels store each completed word the same way. + * + * NOTE(review): PTES_PER_PMD is hard-coded to 512; confirm this matches + * PTRS_PER_PTE on every arch that builds this file. + */ +void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap) +{ + const pte_t *pte = (const pte_t *)table; + int w, b; + + for (w = 0; w < PTES_PER_PMD / BITS_PER_LONG; w++) { + unsigned long word = 0; + + for (b = 0; b < BITS_PER_LONG; b++) { + if (pte_young(pte[w * BITS_PER_LONG + b])) + word |= 1UL << b; + } + bitmap[w] = word; + } +} +EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask_raw); + +void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap) +{ + lru_marie_simd_young_pte_mask_raw(table, bitmap); +} +EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask); diff --git a/mm/lru_marie/simd_x86.c b/mm/lru_marie/simd_x86.c new file mode 100644 index 0000000000..b119b6c265 --- /dev/null +++ b/mm/lru_marie/simd_x86.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lru_marie/simd_x86.c -- x86-64 PTE young-bit scan dispatch. + * + * Three SIMD .S kernels are linked in: lru_marie_simd_x86_{sse2,avx2,avx512}.S. + * arch_initcall picks the widest available at boot: + * AVX-512F: 8 PTEs/iter via VPTESTMQ kmask + * AVX2: 4 PTEs/iter via VPCMPEQQ + * SSE2: 4 PTEs/iter via PSHUFD pack + * + * SSE2 is the floor -- x86-64 ABI-mandatory since 2003, always works, + * no cpu_has() check needed. It's the default initial value of the + * static call, so even if arch_initcall runs late the walker never + * falls back to the slower scalar path.
+ * + * FPU bracket lifecycle is caller-driven: + * lru_marie_simd_batch_begin() -> kernel_fpu_begin() + * lru_marie_simd_young_pte_mask_raw() x N + * lru_marie_simd_batch_end() -> kernel_fpu_end() + * + * FPU is batched across N consecutive PMD scans (the walker picks N -- + * see MARIE_FPU_BATCH in mm/lru_marie/walker.c) so the ~100 ns + * kernel_fpu_begin/end overhead is amortised across the batch. Widening + * the bracket to whole-pass scope is avoided: the bitmap iteration + * (folio_marie_inc_tier per young bit) runs inside the bracket, so a + * pass-wide bracket would extend the preempt-disabled window by the + * full iteration (~100 ms). Batching keeps the bitmap iteration inside + * the same PTL window; the preempt window grows by N SIMD scans + * (~100-300 ns each), not N full per-PMD bodies. + * + * The single-shot lru_marie_simd_young_pte_mask() is retained for any + * non-batched caller (currently none in-tree) and just wraps + * batch_begin / _raw / batch_end. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "simd.h" + +#define PTES_PER_PMD 512 + +/* + * Default true: walker uses the boot-detected SIMD wrapper. Flipped + * by writes to /sys/kernel/mm/lru_marie/simd; a write of 0 routes + * lru_marie_simd_young_pte_mask through the scalar pte_young loop + * below for benchmark A/B comparisons. + */ +DEFINE_STATIC_KEY_TRUE(marie_simd_enabled_key); +EXPORT_SYMBOL_GPL(marie_simd_enabled_key); + +/* Defined in mm/lru_marie/simd_x86_{sse2,avx2,avx512}.S. + * Caller must hold kernel_fpu_begin/end. 
*/ +asmlinkage void lru_marie_simd_scan_sse2(const pte_t *pte_table, + unsigned long *bitmap); +asmlinkage void lru_marie_simd_scan_avx2(const pte_t *pte_table, + unsigned long *bitmap); +asmlinkage void lru_marie_simd_scan_avx512(const pte_t *pte_table, + unsigned long *bitmap); + +/* ------------------------------------------------------------------ */ +/* Scalar fallback */ +/* ------------------------------------------------------------------ */ + +/* + * Reference scalar implementation. Used as the SIMD off-path when + * marie_simd_enabled_key is flipped via /sys/kernel/mm/lru_marie/simd + * for A/B-comparing the SIMD walker against a scalar pte_young loop + * without rebuilding the kernel. Also doubles as a correctness oracle + * for future SIMD bug fixes. No FPU state -- safe to call regardless + * of bracket state. + * + * Each output word is accumulated in a local and stored whole, exactly + * as the .S kernels do, so the oracle produces the same bitmap as the + * SIMD path even when @bitmap was not cleared by the caller. + */ +static void marie_simd_scan_scalar(const pte_t *pte, unsigned long *bitmap) +{ + int w, b; + + for (w = 0; w < PTES_PER_PMD / BITS_PER_LONG; w++) { + unsigned long word = 0; + + for (b = 0; b < BITS_PER_LONG; b++) { + if (pte_val(pte[w * BITS_PER_LONG + b]) & _PAGE_ACCESSED) + word |= 1UL << b; + } + bitmap[w] = word; + } +} + +/* ------------------------------------------------------------------ */ +/* Boot-time dispatch */ +/* ------------------------------------------------------------------ */ + +/* + * Boot-patched direct call to the .S kernel. arch_initcall upgrades + * from the SSE2 default to AVX2 / AVX-512F if those feature bits are + * set. Each call site compiles to a single direct CALL instruction + * (text-patched at static_call_update time), avoiding the indirect- + * call retpoline tax in the per-PMD walker hot path. + * + * The static call points DIRECTLY at the .S kernel -- no FPU-bracket + * wrapper. Callers must hold an FPU bracket via + * lru_marie_simd_batch_begin/end. + */ +DEFINE_STATIC_CALL(marie_simd_scan, lru_marie_simd_scan_sse2); + +static int __init marie_simd_x86_init(void) +{ + /* + * Pick the widest SIMD impl available at boot.
Order matters: + * AVX-512F first (8 PTEs/iter, simplest .S via VPTESTMQ kmask), + * then AVX2 (4 PTEs/iter via VPCMPEQQ), then SSE2 (4 PTEs/iter + * via PSHUFD packing trick -- already the default before this + * runs, so we do nothing for the SSE2 case). + */ + if (boot_cpu_has(X86_FEATURE_AVX512F)) { + static_call_update(marie_simd_scan, lru_marie_simd_scan_avx512); + pr_info("SIMD PTE scan: AVX-512F (8 PTEs/iter)\n"); + } else if (boot_cpu_has(X86_FEATURE_AVX2)) { + static_call_update(marie_simd_scan, lru_marie_simd_scan_avx2); + pr_info("SIMD PTE scan: AVX2 (4 PTEs/iter)\n"); + } else { + /* default already = lru_marie_simd_scan_sse2 */ + pr_info("SIMD PTE scan: SSE2 (4 PTEs/iter, x86-64 baseline)\n"); + } + return 0; +} +/* + * arch_initcall fires before subsys_initcall (marie_init), so the + * static call is patched well before the walker first runs. + */ +arch_initcall(marie_simd_x86_init); + +/* ------------------------------------------------------------------ */ +/* Public API */ +/* ------------------------------------------------------------------ */ + +#include <linux/percpu.h> + +/* + * True on a CPU while an FPU bracket opened by + * lru_marie_simd_batch_begin() is live there. batch_end and _raw key + * off this flag instead of re-reading marie_simd_enabled_key alone: + * the sysfs toggle can flip the key in the middle of a batch, and + * re-reading it would then either skip kernel_fpu_end() (preemption + * left disabled), call kernel_fpu_end() without a matching begin, or + * run a SIMD kernel outside any bracket and clobber user FPU state. + * + * The bracket holds preemption off until batch_end, so the task cannot + * migrate while the flag is set and a per-CPU flag is stable for the + * life of the bracket. + */ +static DEFINE_PER_CPU(bool, marie_simd_fpu_held); + +/* Open an FPU bracket if the SIMD path is enabled; otherwise a no-op. */ +void lru_marie_simd_batch_begin(void) +{ + if (!static_branch_likely(&marie_simd_enabled_key)) + return; + + kernel_fpu_begin(); + __this_cpu_write(marie_simd_fpu_held, true); +} +EXPORT_SYMBOL_GPL(lru_marie_simd_batch_begin); + +/* Close the bracket batch_begin opened on this CPU, if it opened one. */ +void lru_marie_simd_batch_end(void) +{ + if (!this_cpu_read(marie_simd_fpu_held)) + return; + + __this_cpu_write(marie_simd_fpu_held, false); + kernel_fpu_end(); +} +EXPORT_SYMBOL_GPL(lru_marie_simd_batch_end); + +/* + * Scan one PTE table into @bitmap. The SIMD kernel runs only inside a + * live bracket; checking the key as well lets a mid-batch kill-switch + * write take effect on the next scan. The scalar loop touches no FPU + * state, so it is safe with or without the bracket. + */ +void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap) +{ + if (static_branch_likely(&marie_simd_enabled_key) && + this_cpu_read(marie_simd_fpu_held)) + static_call(marie_simd_scan)((const pte_t *)table, bitmap); + else + marie_simd_scan_scalar((const pte_t *)table, bitmap); +} +EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask_raw); + +/* Single-shot convenience wrapper: begin + one raw scan + end. */ +void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap) +{ + lru_marie_simd_batch_begin(); + lru_marie_simd_young_pte_mask_raw(table, bitmap); + lru_marie_simd_batch_end(); +} +EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask); diff --git
a/mm/lru_marie/simd_x86_avx2.S b/mm/lru_marie/simd_x86_avx2.S new file mode 100644 index 0000000000..95242d53b0 --- /dev/null +++ b/mm/lru_marie/simd_x86_avx2.S @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * mm/lru_marie/simd_x86_avx2.S -- AVX2 PTE young-bit scan for Marie walker. + * + * Processes 512 PTEs (one full PMD page table) per call, producing an + * 8-ulong (512-bit) young-bit bitmap. 4 PTEs are handled per SIMD + * iteration via a single 256-bit YMM load; 16 iterations fill one + * output ulong; 8 ulongs total. + * + * Variant chosen: counter-based rolled loop (outer=8, inner=16). + * Reason: mirrors the SSE2 kernel structure for auditability; 128 + * fully-unrolled SIMD blocks would exceed 3000 instructions with no + * measurable throughput gain on modern out-of-order cores (loop + * overhead is < 1% of SIMD work). + * + * Why AVX2 is cleaner than SSE2 here: + * AVX2 has VPCMPEQQ (256-bit, 64-bit lane equality). SSE2 only has + * PCMPEQD (32-bit lanes), which required the PSHUFD low-dword packing + * trick to avoid false positives from PFN bits landing in the high 32 + * bits of a PTE. Here each 64-bit PTE is ANDed with the broadcast + * _PAGE_ACCESSED mask (0x20) and the result is compared against that + * same mask in full 64-bit lanes -- no packing. + * + * After VPMOVMSKB, eax holds 32 bits: one 8-bit octet per 64-bit PTE + * lane (uniformly 0xFF or 0x00 because VPCMPEQQ produces all-1s or + * all-0s per 64-bit lane). Extracting bits: + * testl $0x000000FF, eax -> PTE 0 young + * testl $0x0000FF00, eax -> PTE 1 young + * testl $0x00FF0000, eax -> PTE 2 young + * testl $0xFF000000, eax -> PTE 3 young + * + * There is no C wrapper: the static call in mm/lru_marie/simd_x86.c + * points directly at this function, and the caller must hold an FPU + * bracket (lru_marie_simd_batch_begin/end, i.e. kernel_fpu_begin/end) + * around it. Do NOT call directly from C without an FPU context bracket.
+ * + * ABI (System V AMD64): + * rdi = const pte_t *pte_table (512 PTEs, 4096 bytes, page-table base; + * page-aligned by the kernel page-table + * allocator; vmovdqu is used so no stricter + * SIMD alignment is required for the input) + * rsi = unsigned long *bitmap (8 ulongs = 64 bytes, caller pre-cleared) + * + * Register usage: + * ymm0 = young-bit mask broadcast (0x20 x4, 64-bit lanes), constant + * ymm1 = 4-PTE scratch register + * rdi = current PTE pointer (walks forward by 32 bytes per inner iter) + * rsi = bitmap base (constant) + * rdx = outer loop counter (8 -> 0) + * rcx = inner loop counter (16 -> 0) + * rax = vpmovmskb result / bit-extraction scratch + * r8 = per-ulong accumulator + * r9 = bit-position counter within current ulong (0..63) + * r10 = single-bit scratch for OR-into-accumulator + * r11 = bitmap write pointer (rsi + outer*8, updated each outer iter) + * + * AVX2 only (no AVX-512 zmm/kmask/opmask): + * vmovdqa, vpand, vpcmpeqq, vpmovmskb, vzeroupper -- all AVX2. + * All VEX-encoded to avoid SSE/AVX transition penalties. + * VZEROUPPER before RET to clear upper YMM halves for subsequent + * legacy SSE code. 
+ */ + +#include +#include + + /* -------------------------------------------------------------- */ + /* Read-only data: young-bit mask broadcast */ + /* -------------------------------------------------------------- */ + .section .rodata + .align 32 +marie_avx2_young_mask: + /* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to 4x 64-bit lanes */ + .quad 0x0000000000000020 + .quad 0x0000000000000020 + .quad 0x0000000000000020 + .quad 0x0000000000000020 + + /* -------------------------------------------------------------- */ + /* Text */ + /* -------------------------------------------------------------- */ + .text + +/* + * void lru_marie_simd_scan_avx2(const pte_t *pte_table, unsigned long *bitmap) + * + * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base (8-byte aligned) + * bitmap: rsi -- 8 ulongs (64 bytes), caller pre-cleared + */ +SYM_TYPED_FUNC_START(lru_marie_simd_scan_avx2) + + /* Load young-bit mask once; ymm0 is constant for the whole call. */ + vmovdqa marie_avx2_young_mask(%rip), %ymm0 + + movq %rsi, %r11 /* r11 = current bitmap word ptr */ + movl $8, %edx /* outer counter: 8 ulongs */ + + /* + * Outer loop: one iteration per output ulong (64 PTEs / 8 words). + */ +.Louter_avx2: + xorq %r8, %r8 /* accumulator = 0 */ + xorq %r9, %r9 /* bit-position = 0 */ + movl $16, %ecx /* inner counter: 16 SIMD iters */ + + /* + * Inner loop: one iteration processes 4 PTEs => 4 result bits. + * + * Each PTE is 8 bytes; 4 PTEs = 32 bytes = one 256-bit YMM load. + * rdi advances by 32 bytes per inner iteration. + */ +.Linner_avx2: + /* Load 4 PTEs into ymm1 (unaligned; caller guarantees 8-byte align). */ + vmovdqu (%rdi), %ymm1 /* ymm1 = { pte3 | pte2 | pte1 | pte0 } */ + addq $32, %rdi + + /* + * Isolate the young bit (bit 5 = 0x20) in each 64-bit PTE lane. + * VPAND operates on the full 256-bit register. + */ + vpand %ymm0, %ymm1, %ymm1 /* ymm1 &= 0x20 per 64-bit lane */ + + /* + * VPCMPEQQ: compare each 64-bit lane against ymm0 (the mask 0x20). 
+ * If the masked PTE equals 0x20 the young bit was set; that lane + * becomes all-ones (0xFFFFFFFFFFFFFFFF), otherwise all-zeros. + * No PSHUFD trick needed: 64-bit lanes sidestep the PFN false- + * positive that afflicts 32-bit PCMPEQD-based comparisons. + */ + vpcmpeqq %ymm0, %ymm1, %ymm1 /* ymm1[i] = young_i ? ~0 : 0 */ + + /* + * VPMOVMSKB: extracts the MSB of each byte from the 256-bit ymm1 + * into a 32-bit integer. Because VPCMPEQQ produces uniform + * all-ones or all-zeros per 64-bit lane, each 8-byte group in ymm1 + * is either 0xFFFFFFFFFFFFFFFF or 0x0000000000000000. The + * resulting eax octet layout: + * eax[7:0] = 0xFF (young) or 0x00 -- PTE 0 + * eax[15:8] = 0xFF (young) or 0x00 -- PTE 1 + * eax[23:16] = 0xFF (young) or 0x00 -- PTE 2 + * eax[31:24] = 0xFF (young) or 0x00 -- PTE 3 + */ + vpmovmskb %ymm1, %eax + + /* + * Extract one result bit per PTE and shift into the accumulator. + * + * We use the CL-shift idiom (baseline x86-64): save rcx (inner + * loop counter) temporarily, use cl for the variable shift, then + * restore. For each of the 4 PTEs: + * 1. test eax against the octet mask + * 2. setnz -> 1-byte 0 or 1 + * 3. zero-extend to 64 bits + * 4. shift left by current bit-position (r9) using CL + * 5. OR into accumulator + * 6. 
increment bit-position + */ + + /* PTE 0: octet [7:0] */ + pushq %rcx + movq %r9, %rcx + testl $0x000000FF, %eax + setnz %r10b + movzbq %r10b, %r10 + shlq %cl, %r10 + orq %r10, %r8 + popq %rcx + incq %r9 + + /* PTE 1: octet [15:8] */ + pushq %rcx + movq %r9, %rcx + testl $0x0000FF00, %eax + setnz %r10b + movzbq %r10b, %r10 + shlq %cl, %r10 + orq %r10, %r8 + popq %rcx + incq %r9 + + /* PTE 2: octet [23:16] */ + pushq %rcx + movq %r9, %rcx + testl $0x00FF0000, %eax + setnz %r10b + movzbq %r10b, %r10 + shlq %cl, %r10 + orq %r10, %r8 + popq %rcx + incq %r9 + + /* PTE 3: octet [31:24] */ + pushq %rcx + movq %r9, %rcx + testl $0xFF000000, %eax + setnz %r10b + movzbq %r10b, %r10 + shlq %cl, %r10 + orq %r10, %r8 + popq %rcx + incq %r9 + + decl %ecx + jnz .Linner_avx2 + + /* Store the completed ulong into bitmap. */ + movq %r8, (%r11) + addq $8, %r11 /* advance bitmap write pointer */ + + decl %edx + jnz .Louter_avx2 + + /* Clear upper YMM halves to avoid SSE/AVX transition penalties. */ + vzeroupper + RET +SYM_FUNC_END(lru_marie_simd_scan_avx2) diff --git a/mm/lru_marie/simd_x86_avx512.S b/mm/lru_marie/simd_x86_avx512.S new file mode 100644 index 0000000000..18d73dbd8b --- /dev/null +++ b/mm/lru_marie/simd_x86_avx512.S @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * mm/lru_marie/simd_x86_avx512.S -- AVX-512F PTE young-bit scan for Marie walker. + * + * Processes 512 PTEs (one full PMD page table) per call, producing an + * 8-ulong (512-bit) young-bit bitmap. 8 PTEs are handled per SIMD + * iteration via a single 512-bit ZMM load and VPTESTMQ; 8 iterations + * fill one output ulong; 8 ulongs total. + * + * Variant chosen: counter-based rolled loop (outer=8, inner=8). + * Reason: mirrors the SSE2 and AVX2 kernel structure for auditability; + * a fully-unrolled variant of 64 SIMD blocks would be ~320 instructions + * but harder to review and provides no measurable throughput gain on + * modern out-of-order cores (loop overhead is < 1% of SIMD work). 
+ * + * Why AVX-512F is the most efficient of the three .S kernels: + * VPTESTMQ ANDs two ZMM registers and writes an 8-bit opmask register + * (one bit per 64-bit lane = one bit per PTE), directly encoding eight + * young-bit results in a single instruction. KMOVW transfers the opmask + * into a general-purpose register. This eliminates the PMOVMSKB + + * test/setnz dance that SSE2 and AVX2 require. + * + * The inner loop here is 12 instructions per 8 PTEs, versus 43 + * instructions per 4 PTEs in the AVX2 kernel. + * Per-PMD scan is approximately 10 ns of compute on a modern core. + * + * KMOVW vs KMOVB: + * KMOVW (move 16-bit opmask to/from GPR) is part of AVX-512F. + * KMOVB requires AVX-512DQ. VPTESTMQ on an 8-lane ZMM only sets + * k1[7:0]; the upper 8 bits written by KMOVW are zero, so KMOVW + * gives the correct 8-bit result. All instructions in this file are + * strictly AVX-512F (Foundation); no BW/DQ/VL/VBMI sub-extensions. + * + * There is no C wrapper: the static call in mm/lru_marie/simd_x86.c + * points directly at this function, and the caller must hold an FPU + * bracket (lru_marie_simd_batch_begin/end, i.e. kernel_fpu_begin/end) + * around it. Do NOT call directly from C without an FPU context bracket.
+ * + * ABI (System V AMD64): + * rdi = const pte_t *pte_table (512 PTEs, 4096 bytes, page-table base) + * rsi = unsigned long *bitmap (8 ulongs = 64 bytes, caller pre-cleared) + * + * Register usage: + * zmm0 = young-bit mask broadcast (0x20 x8, 64-bit lanes), constant + * zmm1 = 8-PTE scratch register + * k1 = VPTESTMQ result (8-bit opmask, one bit per PTE) + * rdi = current PTE pointer (walks forward by 64 bytes per inner iter) + * rsi = bitmap base (constant) + * rdx = outer loop counter (8 -> 0) + * rcx = inner loop counter (8 -> 0) + * rax = kmovw result / shift scratch + * r8 = per-ulong accumulator + * r9 = bit-position counter within current ulong (0, 8, 16, ..., 56) + * r11 = bitmap write pointer (rsi + outer*8, updated each outer iter) + * + * Opmask register constraints: + * k0 is reserved as the "no mask" predicate (all lanes active); do not + * use k0 as an output register. k1..k7 are used here (k1 only). + * + * AVX-512F only (strict Foundation subset): + * vmovdqa64 (mask), vmovdqu64 (PTE data), vptestmq, kmovw, vzeroupper -- all AVX-512F. + * zmm0..zmm7 only; k1 only. + * VZEROUPPER before RET to avoid SSE/AVX transition penalties. 
+ */
+
+#include
+#include
+
+	/* -------------------------------------------------------------- */
+	/* Read-only data: young-bit mask broadcast */
+	/* -------------------------------------------------------------- */
+	.section .rodata
+	.align 64
+marie_avx512_young_mask:
+	/* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to 8x 64-bit lanes */
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+
+	/* -------------------------------------------------------------- */
+	/* Text */
+	/* -------------------------------------------------------------- */
+	.text
+
+/*
+ * void lru_marie_simd_scan_avx512(const pte_t *pte_table, unsigned long *bitmap)
+ *
+ * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base; page-aligned by
+ *            the kernel page-table allocator; vmovdqu64 is used so no
+ *            stricter SIMD alignment is required for the input
+ * bitmap: rsi -- 8 ulongs (64 bytes), caller pre-cleared
+ */
+SYM_TYPED_FUNC_START(lru_marie_simd_scan_avx512)
+
+	/* Load young-bit mask once; zmm0 is constant for the whole call. */
+	vmovdqa64 marie_avx512_young_mask(%rip), %zmm0
+
+	movq %rsi, %r11 /* r11 = current bitmap word ptr */
+	movl $8, %edx /* outer counter: 8 ulongs */
+
+	/*
+	 * Outer loop: one iteration per output ulong (64 PTEs per word,
+	 * 8 words).
+	 *
+	 * Each outer iteration processes 64 PTEs in 8 inner SIMD steps
+	 * (8 PTEs per step) and writes one 64-bit ulong to the bitmap.
+	 */
+.Louter_avx512:
+	xorq %r8, %r8 /* accumulator = 0 */
+	xorq %r9, %r9 /* bit-position = 0 (steps of 8) */
+	movl $8, %ecx /* inner counter: 8 SIMD iters */
+
+	/*
+	 * Inner loop: one iteration processes 8 PTEs => 8 result bits.
+	 *
+	 * Each PTE is 8 bytes; 8 PTEs = 64 bytes = one 512-bit ZMM load.
+	 * rdi advances by 64 bytes per inner iteration.
+	 *
+	 * Algorithm per iteration:
+	 * 1. vmovdqu64 loads 8 PTEs into zmm1. The unaligned form is used
+	 *    so the code does not depend on the input alignment, although
+	 *    rdi starts at the table base and only advances in 64-byte
+	 *    steps.
+	 * 2. vptestmq ANDs zmm1 with zmm0 (mask) and sets k1[i] = 1 iff
+	 *    the masked value in lane i is nonzero (i.e. young bit set).
+	 * 3. kmovw extracts the 8-bit opmask from k1 into eax[7:0].
+	 * 4. Zero-extend al -> rax (upper bits already zero from kmovw).
+	 * 5. Shift rax left by r9 (0, 8, 16, ..., 56) to place the 8 bits
+	 *    at the correct position in the 64-bit accumulator.
+	 * 6. OR into r8 accumulator; advance bit-position by 8.
+	 *
+	 * The variable-shift idiom (shlq %cl, rax) requires the shift count
+	 * in cl. We temporarily borrow rcx (inner loop counter) by pushing
+	 * and popping around each shift.
+	 */
+.Linner_avx512:
+	/* Load 8 PTEs into zmm1 (64 bytes, EVEX-encoded unaligned load). */
+	vmovdqu64 (%rdi), %zmm1 /* zmm1 = pte[7]..pte[0] */
+	addq $64, %rdi
+
+	/*
+	 * VPTESTMQ: for each 64-bit lane i, compute:
+	 *   k1[i] = (zmm1[i] & zmm0[i]) != 0 ? 1 : 0
+	 *
+	 * zmm0 holds 0x20 in all lanes. A PTE with _PAGE_ACCESSED set has
+	 * bit 5 = 1, so the AND is nonzero => k1[i] = 1.
+	 * The 8-bit opmask k1 encodes young status for all 8 PTEs directly.
+	 *
+	 * No PSHUFD trick or PCMPEQD needed: the opmask result is exact.
+	 * No false positives from PFN high bits: VPTESTMQ tests for ANY
+	 * nonzero bit after AND with the narrow mask, but since the mask is
+	 * exactly 0x20 (single bit), the result is correct for any PFN.
+	 */
+	vptestmq %zmm1, %zmm0, %k1 /* k1[i] = pte[i] has young bit */
+
+	/*
+	 * KMOVW: move the 16-bit opmask register k1 to eax.
+	 * VPTESTMQ on an 8-lane ZMM only sets k1[7:0]; k1[15:8] is zero.
+	 * So eax[7:0] = 8-bit young bitmap; eax[31:8] = 0 (zero-extended).
+	 * rax[63:32] is also zero because a write to a 32-bit register
+	 * zero-extends to 64 bits.
+	 *
+	 * KMOVW is in AVX-512F. KMOVB requires AVX-512DQ -- not used here.
+	 */
+	kmovw %k1, %eax /* eax[7:0] = 8-bit young mask */
+
+	/*
+	 * Shift the 8-bit result to the correct position in the accumulator.
+	 * r9 = 0 for the first inner iter, 8 for the second, ..., 56 for
+	 * the eighth. After shifting, rax holds the 8 result bits at their
+	 * correct positions within the 64-bit ulong.
+	 *
+	 * Use CL-based variable shift; save/restore rcx (inner loop counter).
+	 */
+	pushq %rcx
+	movq %r9, %rcx
+	shlq %cl, %rax /* rax <<= bit-position (0..56) */
+	orq %rax, %r8 /* accumulate into output ulong */
+	popq %rcx
+	addq $8, %r9 /* advance bit-position by 8 */
+
+	decl %ecx
+	jnz .Linner_avx512
+
+	/* Store the completed ulong into bitmap. */
+	movq %r8, (%r11)
+	addq $8, %r11 /* advance bitmap write pointer */
+
+	decl %edx
+	jnz .Louter_avx512
+
+	/*
+	 * VZEROUPPER: zero bits 128 and above of the vector registers to
+	 * avoid SSE/AVX transition penalties in subsequent legacy SSE code.
+	 * Required whenever AVX or AVX-512 code may be followed by SSE code.
+	 */
+	vzeroupper
+	RET
+SYM_FUNC_END(lru_marie_simd_scan_avx512)
diff --git a/mm/lru_marie/simd_x86_sse2.S b/mm/lru_marie/simd_x86_sse2.S
new file mode 100644
index 0000000000..4646ca1b9c
--- /dev/null
+++ b/mm/lru_marie/simd_x86_sse2.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/lru_marie/simd_x86_sse2.S -- SSE2 PTE young-bit scan for the Marie walker.
+ *
+ * Processes 512 PTEs (one full PMD page table) per call, producing an
+ * 8-ulong (512-bit) young-bit bitmap. 4 PTEs are handled per SIMD
+ * iteration; 16 iterations fill one output ulong; 8 ulongs total.
+ *
+ * Variant chosen: counter-based rolled loop (outer=8, inner=16).
+ * Reason: keeps the file short (~215 lines) and fully auditable for SSE2
+ * compliance; a fully-unrolled variant would be ~3000 instructions and
+ * harder to review without meaningfully faster throughput on modern OOO
+ * cores (loop overhead is < 1% of SIMD work).
+ *
+ * Caller (marie_simd_scan_sse2_wrapper in mm/lru_marie/simd_x86.c) holds
+ * kernel_fpu_begin/end around this. Do NOT call directly from C without
+ * an FPU context bracket.
+ *
+ * ABI (System V AMD64):
+ *   rdi = const pte_t *pte_table (512 PTEs, 4096 bytes, page-table base;
+ *                                 page-aligned by the kernel page-table
+ *                                 allocator; movdqu is used so no stricter
+ *                                 SIMD alignment is required for the input)
+ *   rsi = unsigned long *bitmap (8 ulongs = 64 bytes, caller pre-cleared)
+ *
+ * Register usage:
+ *   xmm0 = young-bit mask broadcast (0x00000020 x4), constant
+ *   xmm1 = PTE pair 0-1 scratch
+ *   xmm2 = PTE pair 2-3 scratch
+ *   rdi = current PTE pointer (walks forward)
+ *   rsi = bitmap base (constant)
+ *   rdx = outer loop counter (8 -> 0)
+ *   rcx = inner loop counter (16 -> 0)
+ *   rax = pmovmskb result / bit-extraction scratch
+ *   r8 = per-ulong accumulator
+ *   r9 = bit-position counter within current ulong (0..63)
+ *   r10 = single-bit scratch for OR-into-accumulator
+ *   r11 = bitmap write pointer (rsi + outer*8, updated each outer iter)
+ *
+ * SSE2 only (x86-64 ABI baseline since AMD Opteron 2003):
+ *   movdqa, movdqu, pshufd, punpcklqdq, pand, pcmpeqd, pmovmskb -- all SSE2.
+ *   No SSE3, SSSE3, SSE4.x, AVX, BMI1/BMI2 instructions used.
+ *
+ * PSHUFD low-dword packing: the high 32 bits of an x86-64 PTE carry
+ * physical-address bits 32-51, pkey, and NX. If the PFN has bit 25 set
+ * (physical-address bit 37, i.e. 128 GiB), bit 5 of the high dword is
+ * set; after PAND with the mask that dword equals 0x20 regardless of the
+ * other bits and would produce a false positive against the young-bit
+ * mask. PSHUFD with imm8=0x88 extracts only the low dword of each 64-bit
+ * PTE lane before the comparison, sidestepping this entirely.
+ */
+
+#include
+#include
+
+	/* -------------------------------------------------------------- */
+	/* Read-only data: young-bit mask broadcast */
+	/* -------------------------------------------------------------- */
+	.section .rodata
+	.align 16
+marie_sse2_young_mask:
+	/* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to all four dwords */
+	.long 0x00000020
+	.long 0x00000020
+	.long 0x00000020
+	.long 0x00000020
+
+	/* -------------------------------------------------------------- */
+	/* Text */
+	/* -------------------------------------------------------------- */
+	.text
+
+/*
+ * void lru_marie_simd_scan_sse2(const pte_t *pte_table, unsigned long *bitmap)
+ *
+ * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base (8-byte aligned)
+ * bitmap: rsi -- 8 ulongs (64 bytes), caller pre-cleared
+ */
+SYM_TYPED_FUNC_START(lru_marie_simd_scan_sse2)
+
+	/* Load young-bit mask once; xmm0 is constant for the whole call. */
+	movdqa marie_sse2_young_mask(%rip), %xmm0
+
+	movq %rsi, %r11 /* r11 = current bitmap word ptr */
+	movl $8, %edx /* outer counter: 8 ulongs */
+
+	/*
+	 * Outer loop: one iteration per output ulong (64 PTEs per word,
+	 * 8 words).
+	 */
+.Louter:
+	xorq %r8, %r8 /* accumulator = 0 */
+	xorq %r9, %r9 /* bit-position = 0 */
+	movl $16, %ecx /* inner counter: 16 SIMD iters */
+
+	/*
+	 * Inner loop: one iteration processes 4 PTEs => 4 result bits.
+	 *
+	 * Each PTE is 8 bytes; 4 PTEs = 32 bytes = 2 x 16-byte XMM loads.
+	 * rdi advances by 32 bytes per inner iteration.
+	 */
+.Linner:
+	/* Load PTE[0..1] and PTE[2..3] (unaligned; 8-byte align guaranteed). */
+	movdqu (%rdi), %xmm1 /* xmm1 = { pte1_hi:pte1_lo | pte0_hi:pte0_lo } */
+	movdqu 16(%rdi), %xmm2 /* xmm2 = { pte3_hi:pte3_lo | pte2_hi:pte2_lo } */
+	addq $32, %rdi
+
+	/*
+	 * Pack the low 32-bit dword of each PTE into one XMM register.
+	 *
+	 * pshufd imm8=0x88 = 0b_10_00_10_00:
+	 *   dst[0] = src[0] (dword 0 = PTE low bits)
+	 *   dst[1] = src[2] (dword 2 = next PTE low bits)
+	 *   dst[2] = src[0] (repeated -- don't care)
+	 *   dst[3] = src[2] (repeated -- don't care)
+	 * => xmm1 = { lo1_dup | lo0_dup | lo1 | lo0 }
+	 *    xmm2 = { lo3_dup | lo2_dup | lo3 | lo2 }
+	 *
+	 * punpcklqdq merges the low 64 bits of each:
+	 *   dst = { xmm2[63:0] | xmm1[63:0] }
+	 *       = { lo3 | lo2 | lo1 | lo0 }
+	 */
+	pshufd $0x88, %xmm1, %xmm1
+	pshufd $0x88, %xmm2, %xmm2
+	punpcklqdq %xmm2, %xmm1 /* xmm1 = { lo3, lo2, lo1, lo0 } */
+
+	/* Isolate young bit (bit 5 = 0x20) in each dword. */
+	pand %xmm0, %xmm1 /* xmm1 &= 0x20 */
+
+	/* Compare: dword == 0x20 => all-ones (-1), else 0. */
+	pcmpeqd %xmm0, %xmm1 /* xmm1[i] = ((lo_i & 0x20) == 0x20) ? 0xFFFFFFFF : 0 */
+
+	/*
+	 * pmovmskb: takes the MSB of each byte => 16-bit result in eax.
+	 * Since each dword is either 0x00000000 or 0xFFFFFFFF, the four
+	 * nibbles of eax are uniformly 0x0 or 0xF:
+	 *   eax[3:0] = PTE0 young (0xF) or not (0x0)
+	 *   eax[7:4] = PTE1
+	 *   eax[11:8] = PTE2
+	 *   eax[15:12] = PTE3
+	 */
+	pmovmskb %xmm1, %eax
+
+	/*
+	 * Extract one result bit per PTE and shift into the accumulator.
+	 *
+	 * We use the CL-shift idiom (SSE2/baseline x86):
+	 *   save rcx (inner loop counter) on the stack, load the bit
+	 *   position into rcx, shift, then restore rcx. r10 holds the
+	 *   single result bit being shifted.
+	 *
+	 * For each of the 4 PTEs:
+	 *   1. test eax against the nibble mask (0x000F, 0x00F0, 0x0F00, 0xF000)
+	 *   2. setnz -> 1-byte 0 or 1
+	 *   3. zero-extend to 64 bits
+	 *   4. shift left by current bit-position (r9) using CL
+	 *   5. OR into accumulator
+	 *   6. increment bit-position
+	 *
+	 * We push/pop rcx around each variable-CL shift to preserve the
+	 * inner loop counter.
+	 */
+
+	/* PTE 0: nibble [3:0] */
+	pushq %rcx
+	movq %r9, %rcx
+	testl $0x000F, %eax
+	setnz %r10b
+	movzbq %r10b, %r10
+	shlq %cl, %r10
+	orq %r10, %r8
+	popq %rcx
+	incq %r9
+
+	/* PTE 1: nibble [7:4] */
+	pushq %rcx
+	movq %r9, %rcx
+	testl $0x00F0, %eax
+	setnz %r10b
+	movzbq %r10b, %r10
+	shlq %cl, %r10
+	orq %r10, %r8
+	popq %rcx
+	incq %r9
+
+	/* PTE 2: nibble [11:8] */
+	pushq %rcx
+	movq %r9, %rcx
+	testl $0x0F00, %eax
+	setnz %r10b
+	movzbq %r10b, %r10
+	shlq %cl, %r10
+	orq %r10, %r8
+	popq %rcx
+	incq %r9
+
+	/* PTE 3: nibble [15:12] */
+	pushq %rcx
+	movq %r9, %rcx
+	testl $0xF000, %eax
+	setnz %r10b
+	movzbq %r10b, %r10
+	shlq %cl, %r10
+	orq %r10, %r8
+	popq %rcx
+	incq %r9
+
+	decl %ecx
+	jnz .Linner
+
+	/* Store the completed ulong into bitmap. */
+	movq %r8, (%r11)
+	addq $8, %r11 /* advance bitmap write pointer */
+
+	decl %edx
+	jnz .Louter
+
+	RET
+SYM_FUNC_END(lru_marie_simd_scan_sse2)
diff --git a/mm/lru_marie/state.c b/mm/lru_marie/state.c
new file mode 100644
index 0000000000..8f07247597
--- /dev/null
+++ b/mm/lru_marie/state.c
@@ -0,0 +1,2745 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Marie per-PFN state array — allocation, init, and global counters.
+ *
+ * Implements the public storage declared in state.h: the flat
+ * marie_state[] array indexed by PFN, the cycling head-gen counter,
+ * and the per-(gen, type) install counters that drive aging. All of
+ * these are allocated once at subsys_initcall time and never freed
+ * for the lifetime of the kernel.
+ *
+ * Sizing rule: the array covers PFNs [0, max_pfn). max_pfn is bounded
+ * by MARIE_MAX_SUPPORTED_PFN (the 32-bit PFN gate latched in
+ * marie_init), so worst-case footprint is 4 GiB. Realistic configs
+ * are 4-64 MiB. NUMA holes and reserved regions read as zero
+ * (untracked) and incur only sequential-read cost during scans.
+ */ + +#define pr_fmt(fmt) "marie_state: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86 +#include +#include +#endif + +#include "../internal.h" /* struct scan_control, shrink_folio_list */ +#include "account.h" +#include "pfn_install.h" +#include "prefetch.h" +#include "state.h" + +/* + * Runtime prefetch-ring parameters, set once at boot by + * marie_prefetch_params_init() based on CPUID. All values are + * powers of 2 so the hot path can use & marie_l3_mask instead of + * % marie_l3_ahead. Defaults are conservative (Silvermont / non-x86). + */ +static unsigned int marie_l3_ahead __read_mostly = 8; +static unsigned int marie_l3_mask __read_mostly = 7; +static unsigned int marie_l1_ahead __read_mostly = 2; + +void __init marie_prefetch_params_init(void) +{ + unsigned int l3 = 8, l1 = 2; + +#ifdef CONFIG_X86 + if (!boot_cpu_has(X86_FEATURE_AVX2)) + goto done; + + if (boot_cpu_has(X86_FEATURE_AVX512F)) { + /* Zen 4/5, Sapphire Rapids: L2 MSHR ~32 */ + l3 = 32; l1 = 8; + goto done; + } + + /* AVX2 present but no AVX-512 */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 >= 0x1A) { + /* Zen 5+ mobile without AVX-512 */ + l3 = 32; l1 = 8; + } else if (boot_cpu_data.x86 == 0x19) { + /* Zen 3 (family 0x19): L2 MSHR ~24 */ + l3 = 24; l1 = 8; + } else if (boot_cpu_data.x86 == 0x17) { + /* Zen 1/2 (family 0x17): L2 MSHR ~20 */ + l3 = 20; l1 = 8; + } else { + /* Excavator era (family 0x15): L2 MSHR ~12 */ + l3 = 16; l1 = 6; + } + break; + case X86_VENDOR_INTEL: + /* + * CLFLUSHOPT as a Skylake proxy: Haswell and Broadwell + * (all models) predate it; Skylake introduced it. 
+ */ + if (boot_cpu_has(X86_FEATURE_CLFLUSHOPT)) { + /* Skylake and newer: L2 MSHR ~20-32 */ + l3 = 24; l1 = 8; + } else { + /* Haswell / Broadwell: L2 MSHR ~16 */ + l3 = 16; l1 = 6; + } + break; + default: + /* Unknown vendor with AVX2: conservative v3 baseline */ + l3 = 16; l1 = 6; + } +done: +#endif + marie_l3_ahead = l3; + marie_l3_mask = l3 - 1; + marie_l1_ahead = l1; + pr_info("prefetch ring: l3_ahead=%u l1_ahead=%u\n", l3, l1); +} + +u8 *marie_state; +unsigned long marie_state_size; + +/* + * Latches true once marie_state[] is allocated (first enable) and never + * flips back -- the array lives for the kernel's lifetime. Gates the + * page-free hook so stale TRACKED bits are wiped at the buddy handoff + * even across a Marie disable transition (when lru_marie_enabled() is + * already false but the drain walk is still in flight). See + * marie_state_ready() in . + */ +DEFINE_STATIC_KEY_FALSE(marie_state_ready_key); +EXPORT_SYMBOL_GPL(marie_state_ready_key); + +atomic_t marie_head_gen[2]; + +/* + * Per-(gen, type) install gauge, now PER-CPU instead of one global + * atomic_long. The install hot path bumps it with this_cpu_inc (no + * shared cacheline), so concurrent installs from different lruvecs + * (different lru_locks, not mutually serialised) no longer ping-pong a + * single global line -- the cross-lruvec contention point on the alloc + * path. The count is advisory (drives only the aging-cadence hint), so + * the throttled advance check reading an approximate cross-CPU sum is + * sufficient. marie_aging_tick is the per-CPU throttle for that check. 
+ */
+DEFINE_PER_CPU(long[MARIE_PFN_NR_GENS][ANON_AND_FILE], marie_gen_installs_pc);
+DEFINE_PER_CPU(unsigned int[ANON_AND_FILE], marie_aging_tick);
+atomic_long_t marie_gen_occupied[MARIE_PFN_NR_GENS][2];
+atomic_t marie_gen_walker_visits[MARIE_PFN_NR_GENS][2];
+
+
+struct marie_bitmap marie_track_bm[2][MARIE_PFN_NR_GENS][MARIE_PFN_NR_TIERS];
+unsigned int marie_l2_shift;
+
+/*
+ * Per-CPU shrink scratch buffer, pre-allocated at boot. Reclaim path
+ * cannot kmalloc / kvmalloc on the hot path (allocation under memory
+ * pressure is what we are trying to relieve), so the isolate batch
+ * lives in a fixed per-CPU buffer claimed via an atomic in_use flag.
+ * On contention (preempted reclaimer on the same CPU holds the buf
+ * across a shrink_folio_list sleep) marie_state_shrink_lruvec falls
+ * back to a 160-entry stack array.
+ *
+ * Sizing: 8192 entries = SWAP_CLUSTER_MAX << 8. Doubled from the
+ * MGLRU MAX_LRU_BATCH (4096) reference after boot testing showed
+ * 4096-cap reclaim falling behind tail /dev/zero alloc rate. 32 MiB
+ * per shrink_folio_list flush at peak amortises lock + IPI overhead
+ * twice as well. Per-CPU memory cost:
+ *   batch:  8192 * 8 B = 64 KiB
+ *   in_use: atomic_t   = ~4 B
+ *   ~= 64 KiB / CPU. 16 CPUs = ~1 MiB system-wide static.
+ *
+ * Neither PFN nor prev_tier needs its own array at putback: PFN is
+ * recovered via folio_pfn(batch[i]), and prev_tier is read back from the
+ * per-PFN state byte (counters_only preserves it across isolate).
+ */
+#define MARIE_PFN_SHRINK_BATCH (SWAP_CLUSTER_MAX << 8) /* 8192 */
+#define MARIE_PFN_BATCH_FLOOR (SWAP_CLUSTER_MAX * 8) /* 256, matches
+ * legacy
+ * MARIE_BATCH_FLOOR */
+/*
+ * Fallback batch size when the per-CPU buf is contended. 5 *
+ * SWAP_CLUSTER_MAX = 160 entries occupy 160 * 8 = 1280 B on the
+ * stack; combined with the surrounding ~464 B of non-array locals
+ * in shrink_lruvec the frame lands at ~1744 B, staying under the
+ * gcc -Wframe-larger-than=2048 threshold without restructuring.
+ * Raising the multiplier below invalidates this stack budget.
+ */
+#define MARIE_PFN_FALLBACK_BATCH (SWAP_CLUSTER_MAX * 5) /* 160 */
+
+struct marie_shrink_buf {
+	atomic_t in_use;
+	struct folio *batch[MARIE_PFN_SHRINK_BATCH];
+};
+static DEFINE_PER_CPU(struct marie_shrink_buf, marie_shrink_buf);
+
+/*
+ * Per-PFN adaptive batch threshold.
+ *
+ *   priority = DEF_PRIORITY -> floor (MARIE_PFN_BATCH_FLOOR = 256)
+ *   priority = 0            -> cap   (MARIE_PFN_SHRINK_BATCH = 8192)
+ *
+ * Cap is the per-CPU buffer size; floor is large enough to amortise
+ * the per-call scan setup. Linear interpolation (integer division)
+ * between the two over sc_priority(sc), which is first clamped to
+ * [0, DEF_PRIORITY] so the result always stays within [floor, cap].
+ */
+static unsigned long marie_pfn_batch_threshold(struct scan_control *sc)
+{
+	unsigned long floor = MARIE_PFN_BATCH_FLOOR;
+	unsigned long cap = MARIE_PFN_SHRINK_BATCH;
+	unsigned long pressure;
+
+	pressure = DEF_PRIORITY + 1 -
+		   clamp(sc_priority(sc), 0, DEF_PRIORITY);
+	return floor + (cap - floor) * (pressure - 1) / DEF_PRIORITY;
+}
+
+/*
+ * Per-memcg L1/L2 bitmap pair.
+ *
+ *   L1: 1 bit per PFN, separately allocated (~512 KiB / 16 GiB
+ *       max_pfn). Set on every install for this memcg, cleared
+ *       on every del.
+ *   L2: 1 bit per 32 MiB PFN range, inline (64 B). Maintained
+ *       via the per-bit l2_count[] refcounter so the L2 bit is
+ *       set on the 0->1 transition and cleared on the 1->0
+ *       transition -- precise (no stale bits).
+ *
+ * Scan AND's L1 word-by-word into the (type, gen, tier) inner
+ * producer and L2 word-by-word into the outer 8-word L2 loop, so
+ * memcg-targeted reclaim iterates exactly (type, gen, tier) ∩ memcg
+ * at source.
+ *
+ * Allocated for every non-root memcg at memcg create
+ * (lru_marie_memcg_alloc); freed at exit (marie_memcg_bitmap_free).
+ * Root memcg has no bitmap and the helpers no-op for it; root
+ * reclaim runs without per-memcg filtering anyway.
+ */
+/*
+ * Per-memcg bitmap is just the unified struct marie_bitmap on the
+ * heap, xa-keyed by memcg pointer. No wrapper needed.
+ */ +static DEFINE_XARRAY(marie_memcg_bitmap_xa); + +int lru_marie_memcg_alloc(struct mem_cgroup *memcg) +{ + struct marie_bitmap *bm; + int err; + + might_sleep(); + + if (!memcg || mem_cgroup_is_root(memcg)) + return 0; + if (!max_pfn) + return 0; + + bm = kzalloc(sizeof(*bm), GFP_KERNEL); + if (!bm) + return -ENOMEM; + if (marie_bm_init(bm)) { + kfree(bm); + return -ENOMEM; + } + + err = xa_err(xa_store(&marie_memcg_bitmap_xa, + (unsigned long)memcg, bm, GFP_KERNEL)); + if (err) { + marie_bm_free(bm); + kfree(bm); + return err; + } + return 0; +} +EXPORT_SYMBOL_GPL(lru_marie_memcg_alloc); + +void marie_memcg_bitmap_free(struct mem_cgroup *memcg) +{ + struct marie_bitmap *bm; + + if (!memcg) + return; + bm = xa_erase(&marie_memcg_bitmap_xa, (unsigned long)memcg); + if (bm) { + marie_bm_free(bm); + kfree(bm); + } +} + +void marie_memcg_bitmap_set(struct mem_cgroup *memcg, unsigned long pfn) +{ + struct marie_bitmap *bm; + + if (!memcg || mem_cgroup_is_root(memcg)) + return; + bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg); + if (bm) + marie_bm_set(bm, pfn); +} + +void marie_memcg_bitmap_clear(struct mem_cgroup *memcg, unsigned long pfn) +{ + struct marie_bitmap *bm; + + if (!memcg || mem_cgroup_is_root(memcg)) + return; + bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg); + if (bm) + marie_bm_clear(bm, pfn); +} + +unsigned long *marie_memcg_bitmap_get(struct mem_cgroup *memcg) +{ + struct marie_bitmap *bm; + + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg); + return bm ? bm->l1 : NULL; +} + +unsigned long *marie_memcg_bitmap_get_l2(struct mem_cgroup *memcg) +{ + struct marie_bitmap *bm; + + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg); + return bm ? bm->l2 : NULL; +} + +/* + * marie_memcg_bitmap_merge - hand off every PFN tracked in @child's + * per-memcg bitmap to @parent's. 
Thin wrapper over marie_bm_merge: + * the per-memcg storage is just a struct marie_bitmap so the merge + * logic (L2-pruned word-OR + l2_count transfer + L2 bit sync) lives + * once in bitmap.c and is shared between memcg reparent and any + * future caller. + * + * The per-PFN state byte (gen / tier / type / zone) and the global + * (type, gen, tier) tracking bitmaps are not touched here -- they + * are memcg-agnostic. After the merge, the walker / scanner running + * against @parent finds the reparented PFNs at their existing + * gen / tier positions through the union'd memcg L1. + * + * @parent == NULL or root: @child's bitmap is simply zeroed (folios + * fall back to global tracking equivalent to root_memcg). + * + * Caller must serialise against concurrent set/clear on either + * bitmap. memcontrol's reparent path holds objcg_lock + both lruvecs' + * lru_lock with IRQs off, which is sufficient. + */ +void marie_memcg_bitmap_merge(struct mem_cgroup *parent, + struct mem_cgroup *child) +{ + struct marie_bitmap *cbm, *pbm = NULL; + + if (!child || mem_cgroup_is_root(child)) + return; + cbm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)child); + if (!cbm) + return; + if (parent && !mem_cgroup_is_root(parent)) + pbm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)parent); + + marie_bm_merge(pbm, cbm); +} +EXPORT_SYMBOL_GPL(marie_memcg_bitmap_merge); + +/* + * Allocate the per-PFN state array. Called from marie_init() after + * the 32-bit PFN gate is latched, so max_pfn is guaranteed to fit + * in the supported range. + * + * kvmalloc lets the array fall back to vmalloc on systems where a + * physically contiguous allocation is unavailable; the array is + * accessed strictly by PFN index and does not require contiguity. + * GFP_KERNEL is safe here — initcall context can sleep. 
+ */
+int __init marie_state_init(void)
+{
+	unsigned long bytes;
+	int g, t, ty;
+
+	bytes = max_pfn * sizeof(u8);
+	if (!bytes) {
+		pr_err("max_pfn is zero; refusing to initialise\n");
+		return -EINVAL;
+	}
+
+	/* One zero-initialised byte per PFN; zero reads as "untracked". */
+	marie_state = kvmalloc(bytes, GFP_KERNEL | __GFP_ZERO);
+	if (!marie_state) {
+		pr_err("failed to allocate %lu-byte per-PFN state array\n",
+		       bytes);
+		return -ENOMEM;
+	}
+	marie_state_size = max_pfn;
+
+	/*
+	 * L2 bitmap shift: (1 << shift) PFNs map to one L2 bit so the
+	 * MARIE_L2_BITS L2 bits cover the full max_pfn range. The
+	 * PFNs-per-bit count is rounded UP twice: first by DIV_ROUND_UP
+	 * (a truncating division would leave the top PFNs mapping to bit
+	 * MARIE_L2_BITS, one past the end, whenever the quotient is an
+	 * exact power of two and max_pfn is not a multiple of
+	 * MARIE_L2_BITS), then to the next power of two so the index is a
+	 * simple right shift in the hot path. max_pfn is non-zero here,
+	 * so the count is at least 1 and tiny VMs get shift 0.
+	 * Must be set before any marie_bm_* call so marie_pfn_to_l2_bit
+	 * works correctly.
+	 */
+	marie_l2_shift = order_base_2(DIV_ROUND_UP(max_pfn, MARIE_L2_BITS));
+
+	marie_bm_range_locks_init();
+
+	/* Per-(type, gen, tier) L1 bitmaps: 16 total. */
+	for (ty = 0; ty < 2; ty++) {
+		for (g = 0; g < MARIE_PFN_NR_GENS; g++) {
+			for (t = 0; t < MARIE_PFN_NR_TIERS; t++) {
+				if (marie_bm_init(&marie_track_bm[ty][g][t]))
+					goto bm_oom;
+			}
+		}
+	}
+
+	/*
+	 * Latch the page-free hook on now that marie_state[] exists. Never
+	 * disabled -- the array is never freed, and TRACKED bits can persist
+	 * into a disable transition, so the hook must keep wiping them.
+	 */
+	static_branch_enable(&marie_state_ready_key);
+
+	pr_info("allocated state %lu B + 16 tracking bitmaps (max_pfn=%lu, l2_shift=%u)\n",
+		bytes, max_pfn, marie_l2_shift);
+	return 0;
+
+bm_oom:
+	/*
+	 * Unwind everything, including bitmaps that were never reached
+	 * (marie_bm_free is called on those too, as before).
+	 */
+	for (ty = 0; ty < 2; ty++)
+		for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+			for (t = 0; t < MARIE_PFN_NR_TIERS; t++)
+				marie_bm_free(&marie_track_bm[ty][g][t]);
+	kvfree(marie_state);
+	marie_state = NULL;
+	/* Keep the size consistent with the (now absent) array. */
+	marie_state_size = 0;
+	return -ENOMEM;
+}
+
+/*
+ * marie_state_isolate_scan_l2lock - L2-bitmap pre-filtered scan with
+ * 512-way parallel exclusion via try_lock on per-L2-bit locks.
+ *
+ * Walks the L2 bitmap (1 cacheline) for the oldest (gen, type). For
+ * each set L2 bit it try_locks the matching L2 lock; on success it
+ * holds exclusive ownership of that PFN range and walks the L1
+ * bitmap within it, applying the same (mask, target) byte filter as
+ * the cursor scan. On try_lock failure another scanner already owns
+ * the range -- skip and try the next L2 bit. No wasted candidate
+ * scan work, no per-CPU cursor, no overlap-arbitration via
+ * folio_test_clear_lru collisions.
+ *
+ * Loop exits when batch_size is reached, nr_to_scan is exhausted,
+ * or every L2 bit in the pgdat's PFN range has been visited (locked
+ * or skipped).
+ */
+unsigned long marie_state_isolate_scan_l2lock(struct pglist_data *pgdat,
+					      int type, int max_zone,
+					      unsigned int tier,
+					      struct mem_cgroup *target_memcg,
+					      struct folio **batch,
+					      unsigned long batch_size,
+					      unsigned long nr_to_scan)
+{
+	unsigned long *mbm = target_memcg ?
+			     marie_memcg_bitmap_get(target_memcg) : NULL;
+	unsigned long *memcg_l2 = target_memcg ?
+				  marie_memcg_bitmap_get_l2(target_memcg) :
+				  NULL;
+	unsigned long *l1, *l2;
+	u8 oldest_gen, mask, target;
+	int oldest;
+	unsigned long start_pfn, end_pfn;
+	unsigned int start_l2, end_l2;
+	unsigned int l2_word, l2_word_end;
+	unsigned long n_batch = 0;
+
+	if (!marie_state)
+		return 0;
+
+	oldest = marie_find_oldest_occupied(type);
+	if (oldest < 0)
+		return 0;
+	oldest_gen = (u8)oldest;
+	{
+		struct marie_bitmap *bm =
+			&marie_track_bm[type][oldest_gen][tier & 0x3];
+
+		l1 = bm->l1;
+		l2 = bm->l2;
+	}
+	if (!l1)
+		return 0;
+
+	mask = MARIE_PFN_TRACKED | MARIE_PFN_GEN_MASK |
+	       MARIE_PFN_TIER_MASK | MARIE_PFN_TYPE_MASK;
+	target = MARIE_PFN_TRACKED |
+		 (oldest_gen << MARIE_PFN_GEN_SHIFT) |
+		 ((tier & 0x3) << MARIE_PFN_TIER_SHIFT) |
+		 (type ? MARIE_PFN_TYPE_FILE : 0);
+
+	start_pfn = pgdat->node_start_pfn;
+	end_pfn = pgdat_end_pfn(pgdat);
+	if (end_pfn > marie_state_size)
+		end_pfn = marie_state_size;
+	if (start_pfn >= end_pfn)
+		return 0;
+
+	start_l2 = marie_pfn_to_l2_bit(start_pfn);
+	end_l2 = marie_pfn_to_l2_bit(end_pfn - 1) + 1;
+	if (end_l2 > MARIE_L2_BITS)
+		end_l2 = MARIE_L2_BITS;
+	l2_word = start_l2 / BITS_PER_LONG;
+	l2_word_end = DIV_ROUND_UP(end_l2, BITS_PER_LONG);
+
+	/*
+	 * Outer L2 loop is word-level: AND the global (type, gen, tier)
+	 * L2 with the per-memcg L2 (when memcg-targeted) so the inner
+	 * __ffs/blsr extraction visits only L2 bits where
+	 * (type, gen, tier) ∩ memcg is non-empty. 512 L2 bits collapse
+	 * to 8 u64 word iterations; empty AND results skip the entire
+	 * word at one cycle each.
+	 */
+	for (; l2_word < l2_word_end; l2_word++) {
+		unsigned long l2w = l2[l2_word];
+
+		if (memcg_l2)
+			l2w &= memcg_l2[l2_word];
+		/* Mask off pre-start_l2 / post-end_l2 bits in edge words. */
+		if (l2_word == start_l2 / BITS_PER_LONG &&
+		    (start_l2 % BITS_PER_LONG))
+			l2w &= ~((1UL << (start_l2 % BITS_PER_LONG)) - 1);
+		if (l2_word + 1 == l2_word_end &&
+		    (end_l2 % BITS_PER_LONG))
+			l2w &= (1UL << (end_l2 % BITS_PER_LONG)) - 1;
+
+		while (l2w && n_batch < batch_size && nr_to_scan > 0) {
+			unsigned int bit = l2_word * BITS_PER_LONG + __ffs(l2w);
+			unsigned long lo, hi;
+			unsigned long ring[MARIE_L3_AHEAD_MAX];
+			int rh = 0, rt = 0, rc = 0;
+			unsigned long word_rem;
+			unsigned long word_base;
+			unsigned long word_i, end_word;
+			bool producer_done = false;
+			int i, n;
+			/*
+			 * Local copies of the runtime ring parameters. Declaring them
+			 * here as loop-scope constants lets the compiler see them as
+			 * truly invariant within this L2 lock window and allocate
+			 * registers for them, rather than spilling the file-static
+			 * globals to the stack under register pressure.
+			 *
+			 * r_l3_ahead is the ring capacity (rc is filled up to it) and
+			 * r_l3_mask is the index wrap applied to rh / rt below. The
+			 * "& r_l3_mask" wrap is a valid modulo only when
+			 * marie_l3_ahead is a power of two and marie_l3_mask equals
+			 * marie_l3_ahead - 1; r_l3_ahead must also not exceed
+			 * MARIE_L3_AHEAD_MAX, the size of ring[].
+			 */
+			const unsigned int r_l3_ahead = marie_l3_ahead;
+			const unsigned int r_l3_mask = marie_l3_mask;
+			const unsigned int r_l1_ahead = marie_l1_ahead;
+			/*
+			 * Per-L2-range cache-line cursors for marie_state[] prefetch.
+			 * PFNs within one L2 range are monotonically increasing, so
+			 * the cursor only advances; resetting per L2 range avoids
+			 * stale comparisons when the next range starts at a lower
+			 * cache line than the previous one ended at.
+			 */
+			unsigned long state_cl_cursor_l3 = 0;
+			unsigned long state_cl_cursor_l1 = 0;
+			/*
+			 * Per-L2-range cache-line cursors for bitmap arrays. word_i
+			 * is monotonically increasing within the range so cursors
+			 * only advance. mbm cursor is unused when memcg-targeting is
+			 * off (mbm == NULL) — the producer only touches it under
+			 * its "if (mbm)" check.
+			 */
+			unsigned long l1_cl_cursor = 0;
+			unsigned long mbm_cl_cursor = 0;
+
+			l2w &= l2w - 1;
+
+			if (!marie_bm_range_trylock(bit))
+				continue;
+
+			lo = marie_l2_bit_pfn_start(bit);
+			hi = marie_l2_bit_pfn_end(bit);
+			if (lo < start_pfn)
+				lo = start_pfn;
+			if (hi > end_pfn)
+				hi = end_pfn;
+
+			/*
+			 * Inline bit producer state with optional word-level mbm
+			 * AND: word_rem is the live remainder of l1[word_i] with
+			 * mbm[word_i] AND-ed in (when memcg-targeted). Persists
+			 * across Phase 1 fill and Phase 3 refill so we never
+			 * re-scan a cleared word and never pay find_next_bit's
+			 * call overhead. The AND narrows iteration to
+			 * (type, gen, tier) ∩ memcg at source -- per-candidate
+			 * mbm post-filter falls away.
+			 */
+			word_i = lo / BITS_PER_LONG;
+			end_word = BITS_TO_LONGS(hi);
+			word_base = word_i * BITS_PER_LONG;
+			word_rem = (word_i < end_word) ? l1[word_i] : 0;
+			if (mbm && word_i < end_word)
+				word_rem &= mbm[word_i];
+			/* Mask off pre-lo bits in the first word. */
+			if (lo > word_base)
+				word_rem &= ~((1UL << (lo - word_base)) - 1);
+			word_i++;
+
+			/*
+			 * Two-stage prefetch ring within this L2 lock window:
+			 *
+			 * Phase 1: fill the ring (up to marie_l3_ahead candidate
+			 *   PFNs via inline __ffs/blsr), firing prefetcht2 on
+			 *   each struct page + state byte -- DRAM fetch in
+			 *   flight by the time the iterator pulls the entry.
+			 *
+			 * Phase 2: L1-escalate the first marie_l1_ahead entries
+			 *   with prefetcht0 so they land in L1 before processing.
+			 *
+			 * Phase 3: drain. Per pulled entry, refill the head (one
+			 *   more L3 prefetch) and L1-escalate the entry now
+			 *   marie_l1_ahead ahead of the new tail. State byte
+			 *   confirm and pfn_folio() both hit cache.
+			 *   nr_to_scan is charged for every entry pulled from
+			 *   the ring, whether or not it passes the state-byte
+			 *   filter. Entries still queued when the drain stops
+			 *   early are simply dropped; this function clears no
+			 *   bitmap bits.
+			 *
+			 * Ring is local to this L2 lock acquisition; struct page +
+			 * state byte are vmemmap/contiguous so prefetches incur no
+			 * locking cost.
+			 */
+			/*
+			 * Cache-line cursor prefetch for bitmap arrays. Issued at each word
+			 * refill; the cursor only advances so a dense word transition does
+			 * not re-prefetch the same cache line.
+			 */
+#define MARIE_PREFETCH_BMWORD_L3(arr, cursor) do {			\
+	unsigned long _bi = word_i + MARIE_BM_L3_AHEAD_WORDS;		\
+	if (_bi < end_word) {						\
+		unsigned long _cl = (unsigned long)&(arr)[_bi]		\
+				    & ~63UL;				\
+		if (_cl != (cursor)) {					\
+			marie_prefetch_l3((void *)_cl);			\
+			(cursor) = _cl;					\
+		}							\
+	}								\
+	} while (0)
+
+#define MARIE_RING_PRODUCE(out_pfn, done_label) do {			\
+	while (!word_rem) {						\
+		if (word_i >= end_word) {				\
+			producer_done = true;				\
+			goto done_label;				\
+		}							\
+		word_rem = l1[word_i];					\
+		MARIE_PREFETCH_BMWORD_L3(l1, l1_cl_cursor);		\
+		if (mbm) {						\
+			word_rem &= mbm[word_i];			\
+			MARIE_PREFETCH_BMWORD_L3(mbm,			\
+						 mbm_cl_cursor);\
+		}							\
+		word_base = word_i * BITS_PER_LONG;			\
+		word_i++;						\
+	}								\
+	(out_pfn) = word_base + __ffs(word_rem);			\
+	word_rem &= word_rem - 1;					\
+	if ((out_pfn) >= hi) {						\
+		producer_done = true;					\
+		goto done_label;					\
+	}								\
+	} while (0)
+
+			/*
+			 * Cache-line cursor prefetch for marie_state[]. AHEAD_PFN pushes the
+			 * prefetched cache line N PFN ahead of the current producer position
+			 * so DRAM (L3-tier) and L3->L1 latencies are hidden even when the
+			 * consumer's fast-skip iter (mask filter early-continue) burns only
+			 * a few cycles per PFN. struct page is per-PFN = per-cache-line
+			 * already, so its prefetches stay per-PFN unchanged.
+			 */
+#define MARIE_PREFETCH_STATE_L3(pfn) do {				\
+	unsigned long _ah = (pfn) + MARIE_STATE_L3_AHEAD_PFN;		\
+	if (_ah < marie_state_size) {					\
+		unsigned long _cl = (unsigned long)&marie_state[_ah]	\
+				    & ~63UL;				\
+		if (_cl != state_cl_cursor_l3) {			\
+			marie_prefetch_l3((void *)_cl);			\
+			state_cl_cursor_l3 = _cl;			\
+		}							\
+	}								\
+	} while (0)
+#define MARIE_PREFETCH_STATE_L1(pfn) do {				\
+	unsigned long _ah = (pfn) + MARIE_STATE_L1_AHEAD_PFN;		\
+	if (_ah < marie_state_size) {					\
+		unsigned long _cl = (unsigned long)&marie_state[_ah]	\
+				    & ~63UL;				\
+		if (_cl != state_cl_cursor_l1) {			\
+			marie_prefetch_l1((void *)_cl);			\
+			state_cl_cursor_l1 = _cl;			\
+		}							\
+	}								\
+	} while (0)
+
+			while (rc < r_l3_ahead) {
+				unsigned long p;
+
+				MARIE_RING_PRODUCE(p, phase1_done);
+				ring[rh] = p;
+				rh = (rh + 1) & r_l3_mask;
+				rc++;
+				MARIE_PREFETCH_STATE_L3(p);
+				marie_prefetch_l3(pfn_to_page(p));
+			}
+phase1_done:
+
+			n = rc < r_l1_ahead ? rc : r_l1_ahead;
+			for (i = 0; i < n; i++) {
+				unsigned long p = ring[(rt + i) & r_l3_mask];
+
+				MARIE_PREFETCH_STATE_L1(p);
+				marie_prefetch_l1(pfn_to_page(p));
+			}
+
+			while (rc > 0 && n_batch < batch_size && nr_to_scan > 0) {
+				unsigned long pfn = ring[rt];
+				u8 s;
+				unsigned int z;
+				struct folio *f;
+
+				rt = (rt + 1) & r_l3_mask;
+				rc--;
+				nr_to_scan--;
+
+				if (!producer_done) {
+					unsigned long np;
+
+					MARIE_RING_PRODUCE(np, refill_done);
+					ring[rh] = np;
+					rh = (rh + 1) & r_l3_mask;
+					rc++;
+					MARIE_PREFETCH_STATE_L3(np);
+					marie_prefetch_l3(pfn_to_page(np));
+				}
+refill_done:
+
+				if (rc > r_l1_ahead) {
+					int idx = (rt + r_l1_ahead - 1) &
+						  r_l3_mask;
+					unsigned long lp = ring[idx];
+
+					MARIE_PREFETCH_STATE_L1(lp);
+					marie_prefetch_l1(pfn_to_page(lp));
+				}
+
+				s = READ_ONCE(marie_state[pfn]);
+				if ((s & mask) != target)
+					continue;
+				z = (s & MARIE_PFN_ZONE_MASK)
+				    >> MARIE_PFN_ZONE_SHIFT;
+				if ((int)z > max_zone)
+					continue;
+
+				f = pfn_folio(pfn);
+				/*
+				 * mbm word-AND in the producer already restricted
+				 * candidates to (type, gen, tier) ∩ memcg; the
+				 * per-candidate test_bit(pfn, mbm) is therefore
+				 * unnecessary. Only the cmdline-disabled fallback
+				 * (target_memcg but no per-memcg bitmap) needs a
+				 * folio_memcg compare.
+				 */
+				if (target_memcg && !mbm &&
+				    folio_memcg(f) != target_memcg)
+					continue;
+				batch[n_batch++] = f;
+			}
+
+			marie_bm_range_unlock(bit);
+		} /* while (l2w) -- next set bit in this L2 word */
+	} /* for (l2_word) -- next L2 word */
+#undef MARIE_RING_PRODUCE
+#undef MARIE_PREFETCH_BMWORD_L3
+#undef MARIE_PREFETCH_STATE_L3
+#undef MARIE_PREFETCH_STATE_L1
+
+	return n_batch;
+}
+
+/*
+ * marie_state_drop_pfn - zero out every per-PFN tracking artifact
+ * for one folio (state byte, (type, gen, tier) L1 bit, occupancy
+ * counter, per-memcg L1/L2/l2_count, and the global L2 range
+ * counter with bulk L2 bit clear on 0).
+ * + * Called from: + * marie_evict_locked -- normal evict path + * marie_drain_pfn_locked -- enable=0 sysfs flip; folio gets + * returned to legacy LRU, the per-PFN + * artifacts MUST be wiped or they + * survive across the disabled window + * as ghosts that wedge counters on + * re-enable. + * + * No-op when the state byte is not TRACKED (defensive against + * double-drop). Reads the (gen, tier, type) tuple from the byte + * BEFORE zeroing it so the per-(type, gen, tier) bitmap and + * occupancy counter are decremented at the same coordinate the + * install incremented. + */ +void marie_state_drop_pfn(struct folio *folio) +{ + unsigned long pfn; + u8 s, g, tier, type_bit; + + if (!marie_state || !folio) + return; + + pfn = folio_pfn(folio); + if (pfn >= marie_state_size) + return; + + s = marie_state[pfn]; + marie_state[pfn] = 0; + if (!(s & MARIE_PFN_TRACKED)) + return; + + g = (s & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT; + tier = (s & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT; + type_bit = (s & MARIE_PFN_TYPE_MASK) ? 1 : 0; + + marie_bm_clear(&marie_track_bm[type_bit][g][tier], pfn); + atomic_long_dec(&marie_gen_occupied[g][type_bit]); + marie_memcg_bitmap_clear(folio_memcg(folio), pfn); +} +EXPORT_SYMBOL_GPL(marie_state_drop_pfn); + +/* + * marie_state_drop_pfn_at_free - canonical buddy-handoff cleanup. + * + * Invoked from mm/page_alloc.c::free_pages_prepare for every page about + * to enter the buddy allocator. Eliminates the deferred-cleanup race + * between marie_evict_counters_only (counters -1, TRACKED preserved) and + * the next allocation at the same PFN: the moment the page is destined + * for buddy, we wipe Marie's per-PFN bookkeeping so a subsequent + * install_local starts from a clean state byte. 
+ * + * Counters are NOT touched here -- they were either already balanced + * by marie_evict_locked (the normal Marie del path) or pre-decremented + * by marie_evict_counters_only (the reclaim isolate path), and the + * page-free hook runs once per page regardless of which del path was + * taken upstream. + * + * memcg_bitmap is intentionally untouched. folio_memcg is unsafe to + * dereference at free time (the page is mid-uncharge); the stale bit + * is harmless because the next install at this PFN under a different + * memcg will re-set the new memcg's bitmap bit, and a memcg teardown + * will free the bitmap wholesale. + * + * Lock-free: byte write, bitmap atomic-bit-clear, atomic_long_dec -- + * safe from any context including IRQ. + */ +void marie_state_drop_pfn_at_free(unsigned long pfn) +{ + u8 s, g, tier, type_bit; + + if (!marie_state || pfn >= marie_state_size) + return; + + s = marie_state[pfn]; + if (!(s & MARIE_PFN_TRACKED)) + return; + + /* + * A TRACKED folio reaching the buddy free path still carrying PG_lru + * bypassed Marie's evict (which clears both TRACKED and PG_lru under + * the folio_test_clear_lru claim). Leaving PG_lru set trips the + * "Bad page state |lru|" PAGE_FLAGS_CHECK_AT_FREE oops. Clear it + * here as the canonical last-resort: the folio is being freed + * (refcount 0) and Marie folios keep folio->lru as a self-loop + * (never linked onto a real lruvec list), so dropping PG_lru cannot + * corrupt any list. This is a mitigation for a residual reclaim + * accounting race (a Marie folio reaching free with TRACKED still + * set); the per-folio vmstat that install +nr'd is not undone here, + * a minor drift accepted in exchange for not oopsing. + */ + { + struct folio *f = page_folio(pfn_to_page(pfn)); + + /* + * Invariant: a TRACKED folio must never reach the buddy free + * path still carrying PG_lru. 
Marie's evict clears both under + * the folio_test_clear_lru claim, and folio_batch_move_lru no + * longer re-stamps PG_lru onto a tracked folio (the mm/swap.c + * fix). VM_WARN_ON_ONCE flags a regression of that invariant in + * DEBUG_VM builds; it compiles to nothing in production, so the + * folio_test_lru below costs only a predicted-not-taken branch + * on an already-hot folio->flags. The trailing clear is the + * production last resort -- it degrades any future regression + * to a counter blip instead of a PAGE_FLAGS_CHECK_AT_FREE oops. + * Marie folios keep folio->lru detached from real lruvec lists, + * so clearing PG_lru here cannot corrupt a list. + */ + if (unlikely(folio_test_lru(f))) { + VM_WARN_ON_ONCE_FOLIO(1, f); + folio_clear_lru(f); + } + /* + * shrink_folio_list can re-set PG_active on a folio whose + * PG_lru is clear (Marie isolated it). PG_active is in + * PAGE_FLAGS_CHECK_AT_FREE; if still set here it would + * trigger bad_page in free_pages_prepare. Clear it + * unconditionally as a last-resort safety net. + */ + if (unlikely(folio_test_active(f))) + folio_clear_active(f); + +#ifdef CONFIG_LRU_GEN + /* + * Scrub MGLRU gen/refs residue. LRU_GEN_MASK is in + * PAGE_FLAGS_CHECK_AT_FREE, so a leftover gen counter trips + * "Bad page state" in free_pages_prepare. With Marie masking + * lru_gen_enabled() off (see lru_gen_enabled()), no MGLRU + * writer stamps these onto a tracked folio, so this is the + * structural last resort that keeps any future regression a + * counter blip rather than a buddy-path oops -- independent of + * whether every lru_gen_enabled() reader stays correctly gated. + * + * PG_workingset is deliberately NOT cleared: Marie's eviction + * relies on the legacy workingset_eviction shadow encoding, + * which reads PG_workingset, and the bit is not in + * PAGE_FLAGS_CHECK_AT_FREE. 
+ */ + if (unlikely(f->flags.f & (LRU_GEN_MASK | LRU_REFS_MASK))) { + VM_WARN_ON_ONCE_FOLIO(1, f); + set_mask_bits(&f->flags.f, LRU_GEN_MASK | LRU_REFS_MASK, 0); + } +#endif + } + + marie_state[pfn] = 0; + + g = (s & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT; + tier = (s & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT; + type_bit = (s & MARIE_PFN_TYPE_MASK) ? 1 : 0; + + /* + * The reclaim isolate path (marie_evict_counters_only) already retired + * this PFN's scan-bitmap slot + gen_occupied at isolate, leaving only + * the TRACKED byte (wiped just above). Test before clearing so that + * common path does NOT double-decrement l2_count / gen_occupied below + * zero. The clear still fires for the residual-race case -- a TRACKED + * folio reaching free without having gone through isolate -- whose + * scan slot is genuinely still live. Safe to test-then-clear here: + * the page is at refcount 0 with no concurrent Marie op on this PFN. + */ + if (marie_bm_test(&marie_track_bm[type_bit][g][tier], pfn)) { + marie_bm_clear(&marie_track_bm[type_bit][g][tier], pfn); + atomic_long_dec(&marie_gen_occupied[g][type_bit]); + } +} + +/* + * marie_state_move_to_gen - relocate a tracked PFN's encoding to + * (@target_gen, @target_tier) with matched (gen, type) bitmap + + * occupied-counter updates. + * + * Step 1: CAS the state byte. Defeats races against del (cur becomes + * 0) and against another concurrent move (cur changes). Retry on + * mismatch. + * + * Step 2: shuffle the bitmaps / counters. Order is "new first, then + * old" so the folio is visible on at least one (gen, type) plane + * throughout the transition. Skipped entirely when old_gen == + * target_gen (only the tier changed, no slot movement needed). + * + * Skipped if the folio is no longer tracked, or the byte already + * encodes (target_gen, target_tier). 
+ * + * Called from: + * marie_state_inc_tier saturate path (target_gen=head, target_tier=0) + * shrink_lruvec residue putback (target_gen=(head+2)&3, + * target_tier=max(prev, w_tier)) + */ +void marie_state_move_to_gen(unsigned long pfn, u8 target_gen, u8 target_tier) +{ + u8 cur, type, old_gen, old_tier, new_byte; + + if (pfn >= marie_state_size) + return; + target_gen &= MARIE_PFN_NR_GENS - 1; + target_tier &= MARIE_PFN_TIER_MAX; + +retry: + cur = READ_ONCE(marie_state[pfn]); + if (!(cur & MARIE_PFN_TRACKED)) + return; + + new_byte = (cur & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK)) | + ((u8)target_gen << MARIE_PFN_GEN_SHIFT) | + ((u8)target_tier << MARIE_PFN_TIER_SHIFT); + if (new_byte == cur) + return; + + if (cmpxchg(&marie_state[pfn], cur, new_byte) != cur) + goto retry; + + type = (cur & MARIE_PFN_TYPE_MASK) ? 1 : 0; + old_gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT; + old_tier = (cur & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT; + if (old_gen == target_gen && old_tier == target_tier) + return; + + /* publish on new (type, gen, tier) first */ + marie_bm_set(&marie_track_bm[type][target_gen][target_tier], pfn); + if (old_gen != target_gen) { + atomic_long_inc(&marie_gen_occupied[target_gen][type]); + atomic_long_dec(&marie_gen_occupied[old_gen][type]); + } + /* un-publish old (type, gen, tier) */ + marie_bm_clear(&marie_track_bm[type][old_gen][old_tier], pfn); +} +EXPORT_SYMBOL_GPL(marie_state_move_to_gen); + +/* + * marie_state_publish_at_gen - (re)publish an already-TRACKED PFN's scan + * slot at (@target_gen, @target_tier), PUBLISH-ONLY (no un-publish of an + * old slot). + * + * This is the putback counterpart to marie_evict_counters_only: isolate + * already retired the old (gen, tier) bitmap bit + gen_occupied slot, so a + * surviving folio has NO old slot to clear -- only the new one to set. 
+ * Unlike marie_state_move_to_gen (set-new + clear-old), this never touches + * the old coordinate, so it cannot double-decrement the l2_count / occupied + * accounting that isolate already balanced. + * + * The byte stays TRACKED throughout (counters_only preserves it); here we + * only rewrite its (gen, tier) field and set the matching bitmap bit + + * occupied counter. Always sets the bitmap bit, even when the byte's + * (gen, tier) is unchanged, because the bit itself was cleared at isolate. + * + * Caller context: putback, where the folio is exclusively owned (PG_lru + * cleared at claim, not yet republished; the dropped scan bit keeps the + * walker away), so the cmpxchg cannot lose a race in practice -- it is + * kept only to preserve the byte's TRACKED/TYPE/ZONE bits cleanly. + */ +static void marie_state_publish_at_gen(unsigned long pfn, u8 target_gen, + u8 target_tier) +{ + u8 cur, type, new_byte; + + if (pfn >= marie_state_size) + return; + target_gen &= MARIE_PFN_NR_GENS - 1; + target_tier &= MARIE_PFN_TIER_MAX; + +retry: + cur = READ_ONCE(marie_state[pfn]); + if (!(cur & MARIE_PFN_TRACKED)) + return; + + new_byte = (cur & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK)) | + ((u8)target_gen << MARIE_PFN_GEN_SHIFT) | + ((u8)target_tier << MARIE_PFN_TIER_SHIFT); + if (new_byte != cur && + cmpxchg(&marie_state[pfn], cur, new_byte) != cur) + goto retry; + + type = (cur & MARIE_PFN_TYPE_MASK) ? 1 : 0; + marie_bm_set(&marie_track_bm[type][target_gen][target_tier], pfn); + atomic_long_inc(&marie_gen_occupied[target_gen][type]); +} + +/* + * marie_state_inc_tier - saturating tier bump on the per-PFN byte. + * + * Runs from folio_mark_accessed() WITHOUT lru_lock, so the state byte + * is committed with try_cmpxchg to avoid losing a concurrent lock-free + * drop_pfn / install publish (see the loop comment below). + * + * Non-saturated (tier < MAX): bump the tier field in place. 
+ * + * Saturated (tier == MAX): in-place promote -- roll to head gen at + * tier 0 (inlined marie_state_move_to_gen). The "already on head" + * early exit avoids the CAS round-trip when the folio cannot be + * promoted further. + */ +void marie_state_inc_tier(unsigned long pfn) +{ + u8 cur, new, t, type, gen, head = 0, old_gen, new_tier = 0; + bool roll; + + if (pfn >= marie_state_size) + return; + + /* + * folio_mark_accessed() runs this from the fault / pagecache-hit path + * WITHOUT lru_lock, racing the lock-free reclaim isolate + * (marie_state_drop_pfn) and the lru_lock-held install publish. All + * three RMW the same non-atomic state byte, so a plain READ/WRITE_ONCE + * loses updates -- e.g. resurrecting a TRACKED bit drop_pfn just + * cleared. Commit the byte with try_cmpxchg; a concurrent writer forces + * a reload + recheck, and if drop_pfn cleared TRACKED we bail. + */ + cur = READ_ONCE(marie_state[pfn]); + do { + if (!(cur & MARIE_PFN_TRACKED)) + return; + t = (cur & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT; + type = (cur & MARIE_PFN_TYPE_MASK) ? 1 : 0; + if (t < MARIE_PFN_TIER_MAX) { + new_tier = t + 1; + new = (cur & ~MARIE_PFN_TIER_MASK) | + ((new_tier << MARIE_PFN_TIER_SHIFT) & + MARIE_PFN_TIER_MASK); + roll = false; + } else { + head = (u8)atomic_read(&marie_head_gen[type]); + old_gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT; + if (head == old_gen) + return; + new = (cur & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK)) | + (head << MARIE_PFN_GEN_SHIFT); + roll = true; + } + } while (!try_cmpxchg(&marie_state[pfn], &cur, new)); + + /* State byte committed; bitmaps/occupancy best-effort (scanner re-validates). 
*/ + gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT; + if (!roll) { + marie_bm_set(&marie_track_bm[type][gen][new_tier], pfn); + marie_bm_clear(&marie_track_bm[type][gen][t], pfn); + } else { + marie_bm_set(&marie_track_bm[type][head][0], pfn); + marie_bm_clear(&marie_track_bm[type][gen][t], pfn); + atomic_long_dec(&marie_gen_occupied[gen][type]); + atomic_long_inc(&marie_gen_occupied[head][type]); + } +} +EXPORT_SYMBOL_GPL(marie_state_inc_tier); + +/* + * marie_occupied_gen_count - number of gens with at least one folio + * for @type. Used by the reclaim-driven aging trigger. + * + * Four atomic_long_read per call; only invoked at shrink_lruvec + * entry, not on any per-fault hot path. Snapshot may race with + * concurrent install/del but the only consequence is one missed or + * one extra try_advance_head call -- both are benign. + */ +static int marie_occupied_gen_count(int type) +{ + int g, occupied = 0; + + for (g = 0; g < MARIE_PFN_NR_GENS; g++) + if (atomic_long_read(&marie_gen_occupied[g][type]) > 0) + occupied++; + return occupied; +} + +/* + * -------------------------------------------------------------------- + * Anon/file swap-bias controller (stubborn proportional) + * -------------------------------------------------------------------- + * + * A single signed counter per marie_lruvec drives the anon-vs-file + * pick under proportional swappiness (2..199). Granularity rule: + * EXACTLY ONE type is scanned per shrink_lruvec call in the + * proportional regime -- the bias sign selects which. Scanning both + * sides in the same call would dissolve the s:(MAX-s) ratio because + * every call would contribute pages from both. The caller's priority + * loop re-enters shrink_lruvec for the next pick, and the bias + * (updated from this call's outcome) may flip the selection in + * between -- yielding "fine-grained" type switching at call + * granularity, which matches the user-visible reclaim cadence. 
+ * + * SUCCESS (nr_reclaimed > 0): + * bias += sign * nr_reclaimed * weight + * -- page-flow proportional. Long-run pages(anon):pages(file) + * converges to s:(MAX_SWAPPINESS-s) even when per-pick batch + * sizes differ systematically between types. + * + * FAILURE (nr_reclaimed == 0): + * bias unchanged (no-op). + * -- The picked side stays the picked side. Failure carries no + * back-pressure -- not even a unit nudge -- so the favored + * side remains favored indefinitely under sustained failure. + * This is the entire point of low-swappiness on modern ZRAM + * systems: file should be the eviction target even when it + * transiently (or persistently) produces nothing, and anon + * must NOT be touched as a consequence of file being stuck on + * dirty / locked / writeback / depleted state. If file truly + * cannot be reclaimed, the caller escalates priority or OOM + * kicks in -- the controller does not surrender protection. + * + * sign = -1 for picked=ANON (push bias toward FILE) + * +1 for picked=FILE (push bias toward ANON) + * weight = MAX_SWAPPINESS - s for picked=ANON + * = s for picked=FILE + * + * Special-value swappiness short-circuits the controller: + * s=0 FILE only, no fallback (caller proceeds to OOM if depleted) + * s=1 FILE first; ANON engages on EITHER of two depletion + * signals (see the FILE_THEN_ANON tail gate): + * - file < clean_min_ratio floor (skip_file true), or + * - file >= floor but the FILE pass FAILED TO MEET this + * call's reclaim target = file reclaim is not keeping + * pace right now. + * Throughput is empirical -- a tracked file folio may be + * hot/dirty/mapped, and how much frees is knowable only by + * trying -- so the FILE pass's own outcome, not occupancy, is + * the signal. Sufficiency (target met) rather than exact-zero + * is what keeps reclaim file-first: a positive-but-insufficient + * file trickle must not pin reclaim file-only while swappable + * anon OOMs with swap free. 
The fallback fires on the first + * call file cannot satisfy -- it does NOT wait for sc->priority + * to decay -- and a transient file stall costs at most one + * early anon batch; preferred over OOM with swap free. + * s=MAX ANON only, no fallback (symmetric to s=0) + * + * clean_min_ratio override: when the floor diverts reclaim to + * anon-only (skip_file in marie_state_shrink_lruvec), the caller + * does NOT invoke marie_swap_bias_update for that call. The + * controller stays frozen at its pre-override value so that, when + * file recovers above the floor, the proportional regime resumes + * from where it left off -- no post-recovery overshoot from anon + * reclaim that was driven by external policy, not swappiness. + * + * Sysctl writes invoke lru_marie_swappiness_changed() which walks + * the xarray and resets every swap_bias to zero, so the controller + * restarts cleanly under the new weight ratio. + * + * No CAP: per-cycle delta is bounded by batch_max (~8192) * + * MAX_SWAPPINESS (200) ~ 1.6e6, far below S64_MAX in any realistic + * running time. The sysctl-write reset is the only reset mechanism. + */ + +enum marie_pick_kind marie_swap_pick_type(struct marie_lruvec *mlv, + u8 swappiness) +{ + if (swappiness == 0) + return MARIE_PICK_FILE_STRICT; + if (swappiness == 1) + return MARIE_PICK_FILE_THEN_ANON; + if (swappiness >= MAX_SWAPPINESS) + return MARIE_PICK_ANON_STRICT; + + if (!mlv) + return MARIE_PICK_ANON_FIRST; + + return (atomic64_read(&mlv->swap_bias) < 0) + ? MARIE_PICK_FILE_FIRST + : MARIE_PICK_ANON_FIRST; +} + +void marie_swap_bias_update(struct marie_lruvec *mlv, + int picked_type, + unsigned long nr_reclaimed, + u8 swappiness) +{ + s64 cur, delta; + + if (!mlv) + return; + /* + * Special values bypass the controller. The pick path does not + * read swap_bias under {0, 1, MAX_SWAPPINESS}, so the value + * here is irrelevant to observable behaviour; skipping the + * write also avoids gratuitous cache-line bouncing. 
+ */ + if (swappiness <= 1 || swappiness >= MAX_SWAPPINESS) + return; + + /* + * Failure carries no back-pressure: when nr_reclaimed is zero, + * the bias is left untouched. The picked side stays the picked + * side -- truly stubborn protection of the favored type. See + * the top of this section for the failsafe semantics. + */ + if (!nr_reclaimed) + return; + + if (picked_type == 0) + delta = -(s64)nr_reclaimed * + (s64)(MAX_SWAPPINESS - swappiness); + else + delta = +(s64)nr_reclaimed * (s64)swappiness; + + cur = atomic64_read(&mlv->swap_bias); + atomic64_set(&mlv->swap_bias, cur + delta); +} + +/* + * marie_file_floor_protect - is the clean_min_ratio file floor in force? + * + * Returns true when this node's clean file pagecache has fallen below + * marie_clean_min_ratio (% of node_present_pages) and Marie still has + * anon to absorb the pressure, so file reclaim must be withheld. The pick + * driver diverts file -> anon on this signal (skip_file) and folds the + * result into the MARIE_DRAIN_* mask it returns, so shrink_lruvec's legacy + * orphan drain spares file too. No reclaim path may evict file below the + * floor -- le9uo's single-path floor invariant applied across Marie's paths. + * + * Only CLEAN file counts toward the floor (NR_FILE_DIRTY subtracted): + * dirty pages cannot be reclaimed without writeback, so counting them + * would let the floor be satisfied by unreclaimable pages and strand the + * clean working set. + * + * If anon is empty Marie has no reserve to protect anyway, so the floor + * yields and file scan proceeds as a last resort. An OOM victim bypasses + * the floor entirely (its file is fair game; see the oom_victim handling + * in marie_state_shrink_lruvec). 
+ */
+static bool marie_file_floor_protect(struct pglist_data *pgdat)
+{
+	unsigned int ratio = READ_ONCE(marie_clean_min_ratio);
+	unsigned long clean_file, nr_dirty, floor;
+	long nr_anon = 0;
+	int gen;
+
+	/* Floor disabled. */
+	if (!ratio)
+		return false;
+	/* An OOM victim is never held back by the floor. */
+	if (unlikely(tsk_is_oom_victim(current)))
+		return false;
+
+	/* Clean file = (active + inactive file) - dirty, clamped at zero. */
+	clean_file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+		     node_page_state(pgdat, NR_INACTIVE_FILE);
+	nr_dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+	if (clean_file > nr_dirty)
+		clean_file -= nr_dirty;
+	else
+		clean_file = 0;
+
+	/* The floor is @ratio percent of the node's present pages. */
+	floor = pgdat->node_present_pages * ratio / 100;
+	if (clean_file >= floor)
+		return false;
+
+	/*
+	 * Below the floor: protect file only while the anon occupancy
+	 * counters (type index 0), summed over every gen, are positive.
+	 */
+	for (gen = 0; gen < MARIE_PFN_NR_GENS; gen++)
+		nr_anon += atomic_long_read(&marie_gen_occupied[gen][0]);
+
+	return nr_anon > 0;
+}
+
+/*
+ * marie_state_shrink_lruvec - per-PFN paradigm reclaim driver.
+ *
+ * At entry, for each type, fire the reclaim-driven aging trigger:
+ * if fewer than 2 gens of that type are occupied,
+ * marie_try_advance_head(type) so install and reclaim do not
+ * collide on a single gen. Without this trigger a freshly booted
+ * system with only the head gen occupied returns zero candidates
+ * because marie_find_oldest_occupied skips head (the install
+ * destination).
+ *
+ * Per (type, tier) the scan walks the per-(gen, type) bitmap, claims
+ * each candidate via folio_try_get + folio_test_clear_lru, then calls
+ * marie_evict_counters_only: counters decremented and the scan-bitmap
+ * slot + gen_occupied retired at isolate (so other CPUs stop re-finding
+ * the in-flight folio), but the per-PFN TRACKED byte is KEPT so
+ * install_local's early-out blocks a concurrent install from re-setting
+ * PG_lru while shrink_folio_list reclaims it.
+ *
+ * Teardown of the TRACKED byte is deferred: a reclaimed folio is wiped
+ * at its buddy handoff (marie_state_drop_pfn_at_free via the
+ * free_pages_prepare hook), which finds the scan bit already clear and
+ * so does not double-decrement l2_count / gen_occupied.
Survivors of + * shrink_folio_list keep TRACKED and are re-published at the putback gen + * via marie_state_publish_at_gen (set-only: no clear-old, because isolate + * already retired the old slot), seeding tier from max(prev_tier, + * PG_active/PG_workingset). + */ + +unsigned int marie_state_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + /* + * Hybrid memcg filter (design.h section 9 E): + * - root reclaim: NULL -> per-candidate folio_memcg check + * skipped, every Marie folio in scope. + * - child memcg: pass the lruvec's memcg so scan can confirm + * folio_memcg(f) == target after the byte + * filter. Hot only on cgroup-targeted reclaim. + */ + struct mem_cgroup *target_memcg = + (!memcg || mem_cgroup_is_root(memcg)) ? NULL : memcg; + /* + * Swap-bias controller state for this shrink cycle. mlv_bias is + * the per-lruvec carrier of the bias counter; marie_get_lruvec + * may return NULL on alloc failure, in which case marie_swap_* + * helpers no-op and the pick falls back to the legacy + * "anon then file" order. + * + * @swappiness is captured once per call; subsequent sysctl + * writes that reset bias to zero are seen on the NEXT call. + * mem_cgroup_swappiness returns the effective value (memcg own + * value on cgroup v1 non-root, vm_swappiness otherwise) and is + * a plain READ_ONCE under the hood. + */ + struct marie_lruvec *mlv_bias = marie_get_lruvec(lruvec); + u8 swappiness = (u8)mem_cgroup_swappiness(memcg); + enum marie_pick_kind pick_kind; + int type_order[2]; + int type_count; + int idx; + bool skip_file = false; + unsigned int drain_mask; + /* + * When anon cannot be reclaimed at all (no free swap slots, + * cgroup swap limit hit, no demotion target), swappiness is by + * definition meaningless -- it expresses the anon:file reclaim + * ratio, and one side of that ratio no longer exists. 
Every ANON + * pick would reclaim nothing, and because the bias controller + * takes no back-pressure from a zero-reclaim pick + * (marie_swap_bias_update bails on !nr_reclaimed), the bias never + * flips to FILE: reclaimable file cache is stranded until OOM. + * Drop the stubborn swappiness preference and force FILE only, + * mirroring get_scan_count()'s "!can_reclaim_anon_pages -> + * SCAN_FILE". The clean_min_ratio floor below still applies, so + * file is reclaimed only down to the protected floor; once file is + * at the floor and anon is unreclaimable this pass reclaims nothing, + * and the stock no_progress_loops path in should_reclaim_retry() + * reaches the OOM killer. + */ + bool anon_unreclaimable = + !vmscan_can_reclaim_anon_pages(memcg, pgdat->node_id, sc); + /* + * An OOM victim's own direct reclaim runs FILE-only, with no holds + * barred on the file side: scan FILE ignoring the swappiness/bias + * pick, the clean_min_ratio floor, the FILE_THEN_ANON tail gate and + * the bias controller. The task has been selected for death and the + * OOM reaper frees its anon, so swapping anon here would only add + * I/O thrash for no benefit -- reclaim just the cheap, no-I/O file + * side (clean_min_ratio is bypassed below, so all file is fair + * game). If file is exhausted the victim falls back on the reaper, + * which is the normal OOM mechanism. kswapd is never an OOM victim, + * so background reclaim is unaffected. + */ + bool oom_victim = tsk_is_oom_victim(current); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) + if (marie_occupied_gen_count(type) < 2) + marie_try_advance_head(type); + + /* + * clean_min_ratio hard floor. True when this node's clean file + * pagecache is below the configured percentage of node_present_pages + * (and anon remains, and we are not an OOM victim). The same predicate + * masks the legacy drain's file scan in shrink_lruvec, so no path + * evicts file below the floor (le9uo's single-path floor invariant). 
+ */ + skip_file = marie_file_floor_protect(pgdat); + + /* + * Choose the type(s) to scan as a strict priority cascade: + * + * oom_victim -> FILE only. The victim's anon is reaped by the + * OOM reaper, so swapping anon is pure I/O thrash; + * reclaim the cheap no-I/O file side. The floor is + * bypassed for victims (skip_file is false), so + * file scans freely. + * anon_unreclaimable -> FILE only. No free swap slots / no demotion + * target: swappiness is meaningless and every ANON + * pick would free nothing. If file is also at the + * floor the per-iteration gate no-ops the file + * scan and the stock no_progress_loops path OOMs. + * swappiness == 0 -> FILE only. Hard "never swap" user policy: the + * clean_min_ratio floor must NOT punch through it + * (core.c). At the floor file is blocked too, so + * this OOMs rather than swapping -- the contract. + * skip_file -> ANON only. The floor is in force and file is + * protected, so divert all reclaim to anon + * regardless of the swappiness/bias pick. This + * outranks the proportional controller: a + * FILE_FIRST pick would otherwise scan the + * floor-blocked file side, free nothing, and -- + * the bias being frozen during skip_file -- stay + * pinned on FILE while anon is never picked, + * stalling reclaim under pressure at high swappiness. + * otherwise -> the swappiness / swap_bias proportional pick. 
+ */ + if (oom_victim) + pick_kind = MARIE_PICK_FILE_STRICT; + else if (anon_unreclaimable) + pick_kind = MARIE_PICK_FILE_STRICT; + else if (swappiness == 0) + pick_kind = MARIE_PICK_FILE_STRICT; + else if (skip_file) + pick_kind = MARIE_PICK_ANON_STRICT; + else + pick_kind = marie_swap_pick_type(mlv_bias, swappiness); + + switch (pick_kind) { + case MARIE_PICK_FILE_STRICT: + type_order[0] = 1; + type_count = 1; + break; + case MARIE_PICK_ANON_STRICT: + type_order[0] = 0; + type_count = 1; + break; + case MARIE_PICK_FILE_THEN_ANON: + /* + * swappiness=1: FILE first, ANON as the depletion fallback + * the moment FILE fails to satisfy this call's reclaim + * target (not only when FILE returns exactly zero). + * type_count=2 with the sufficiency gate at the tail. + */ + type_order[0] = 1; + type_order[1] = 0; + type_count = 2; + break; + case MARIE_PICK_FILE_FIRST: + /* + * Proportional regime, bias picks FILE. SINGLE type per + * call: scanning the other side in the same call would + * dissolve the s:(MAX-s) ratio because both sides would + * contribute pages on every invocation. The caller + * (vmscan priority loop) re-enters shrink_lruvec for + * the next pick; bias may flip in between via the + * proportional update from this call's outcome. + */ + type_order[0] = 1; + type_count = 1; + break; + case MARIE_PICK_ANON_FIRST: + default: + /* Symmetric: proportional regime, bias picks ANON. */ + type_order[0] = 0; + type_count = 1; + break; + } + + /* + * Tell shrink_lruvec which orphan type(s) its legacy drain may + * reclaim: exactly the type this call scans. type_order[0] is the + * primary (and, in the single-type regime, only) type. A file pick + * blocked by skip_file (FILE_STRICT under the clean_min_ratio floor) + * scans nothing, so it grants no drain -- preserving the + * no-progress -> OOM path. + */ + if (type_order[0] == 1) + drain_mask = skip_file ? 
0 : MARIE_DRAIN_FILE; + else + drain_mask = MARIE_DRAIN_ANON; + + { + /* + * Claim this CPU's pre-allocated shrink buffer. If the + * cmpxchg fails (preempted reclaimer on the same CPU + * holds it across a shrink_folio_list sleep), fall back + * to a small stack batch. + */ + struct marie_shrink_buf *buf; + /* + * Fallback uses MARIE_PFN_FALLBACK_BATCH-sized stack + * arrays. Sized to stay under gcc -Wframe-larger-than=2048 + * given the ~464 B baseline frame; see MARIE_PFN_FALLBACK_ + * BATCH comment. + */ + struct folio *small_batch[MARIE_PFN_FALLBACK_BATCH]; + struct folio **scratch_batch; + unsigned long batch_max; + bool using_percpu; + + buf = per_cpu_ptr(&marie_shrink_buf, raw_smp_processor_id()); + if (atomic_cmpxchg(&buf->in_use, 0, 1) == 0) { + scratch_batch = buf->batch; + batch_max = marie_pfn_batch_threshold(sc); + using_percpu = true; + } else { + scratch_batch = small_batch; + batch_max = MARIE_PFN_FALLBACK_BATCH; + using_percpu = false; + } + + for (idx = 0; idx < type_count; idx++) { + unsigned int tier; + int oldest; + bool ignore_refs = false; + LIST_HEAD(folio_list); + struct reclaim_stat stat = {}; + unsigned long n_taken = 0; + unsigned int n_reclaimed = 0; + int oldest_for_putback; + u8 putback_gen; + struct folio *f, *tmp; + /* + * Tracks whether this iteration actually attempted + * to pick the type. An external override + * (skip_file from clean_min_ratio) clears this so + * the bias controller is NOT updated for a pick + * that never ran -- the bias must reflect actual + * picking policy, not blocked intentions. + */ + bool attempted_pick = true; + + type = type_order[idx]; + + /* + * Per-type body wrapped in do { } while (0) so the + * existing early-exit conditions become plain + * `break` to a single tail that updates the bias + * controller and applies the swappiness=1 fallback + * gate. `goto done` (target reached) still bypasses + * the tail entirely. 
+ */ + do { + + if (type == 1 && skip_file) { + attempted_pick = false; + break; + } + + oldest = marie_find_oldest_occupied(type); + if (oldest < 0) + break; + ignore_refs = atomic_read( + &marie_gen_walker_visits[oldest][type]) >= 1; + + /* + * Accumulate across all tiers of this type into one + * folio_list up to batch_max, then call + * shrink_folio_list once. + * + * Scan writes candidate folios directly into + * scratch_batch[n_taken..] in a SINGLE call per + * tier. The previous SWAP_CLUSTER_MAX-bounded + * tmp_batch did 128 scan invocations per type at + * batch_max=4096, re-initialising the prefetch + * ring each time -- now one invocation per tier + * (4 per type) lets the ring amortise across the + * full bitmap walk. + * + * Failed claims (try_get / test_clear_lru) leave + * the corresponding scratch_batch slot to be + * overwritten by the next successful claim -- + * in-place compaction via accept_idx. + */ + for (tier = 0; tier < MARIE_PFN_NR_TIERS; tier++) { + unsigned long nr_isolated, i; + unsigned long room; + unsigned long accept_idx = n_taken; + + if (sc_reclaim_target_reached(sc)) + goto done; + if (n_taken >= batch_max) + break; + + room = batch_max - n_taken; + nr_isolated = marie_state_isolate_scan_l2lock( + pgdat, type, sc_reclaim_idx(sc), + tier, target_memcg, + &scratch_batch[n_taken], room, + ULONG_MAX); + if (!nr_isolated) + continue; + + for (i = 0; i < nr_isolated; i++) { + f = scratch_batch[n_taken + i]; + if (!folio_try_get(f)) + continue; + if (!folio_test_clear_lru(f)) { + folio_put(f); + continue; + } + + scratch_batch[accept_idx] = f; + + /* + * marie_evict_counters_only decrements + * counters AND retires the scan-bitmap + * slot (so other CPUs stop re-finding + * this in-flight folio), but KEEPS the + * TRACKED byte so install_local's early- + * out blocks any concurrent install from + * re-setting PG_lru while shrink_folio_- + * list reclaims it. 
The TRACKED byte is + * wiped at the buddy handoff via + * marie_state_drop_pfn_at_free() (called + * from free_pages_prepare). Survivors + * keep TRACKED and re-publish a fresh + * scan slot + PG_lru in the putback loop + * below. + */ + marie_evict_counters_only(f); + + list_add(&f->lru, &folio_list); + accept_idx++; + } + n_taken = accept_idx; + } + + if (!n_taken) + break; + + /* + * PGSCAN accounting, mirroring upstream MGLRU's + * post-isolation bump (mm/vmscan.c evict_folios). + * n_taken is the count actually pulled off the LRU + * (the equivalent of MGLRU's `isolated`); upstream + * PGSCAN_* tracks isolated, not bitmap-scanned bits. + * + * NR_ISOLATED_ANON / _FILE must be bumped here so + * reclaim throttling and writeback congestion + * checks see Marie's in-flight isolation; the + * counter is decremented after shrink_folio_list + * finishes (whether the folio was reclaimed or put + * back). + * + * Since 7.0, PGSCAN_* / PGSTEAL_* / PGSCAN_ANON / + * PGSTEAL_ANON are node_stat_item (lruvec stats), + * not vm_event_item. Use mod_lruvec_state which + * propagates to both node vmstat and memcg. + */ + { + enum node_stat_item scan_item = + PGSCAN_KSWAPD + + vmscan_reclaimer_offset(sc); + + mod_node_page_state(pgdat, + NR_ISOLATED_ANON + type, + n_taken); + mod_lruvec_state(lruvec, scan_item, n_taken); + mod_lruvec_state(lruvec, PGSCAN_ANON + type, + n_taken); + } + + n_reclaimed = shrink_folio_list(&folio_list, pgdat, + sc, &stat, ignore_refs, + memcg); + sc_add_reclaimed(sc, n_reclaimed); + + /* + * PGSTEAL accounting + matched NR_ISOLATED decrement. + * shrink_folio_list has either freed each folio or + * left it on @folio_list for putback; either way the + * isolation window for these n_taken folios is over. 
+ */ + { + enum node_stat_item steal_item = + PGSTEAL_KSWAPD + + vmscan_reclaimer_offset(sc); + + mod_node_page_state(pgdat, + NR_ISOLATED_ANON + type, + -n_taken); + mod_lruvec_state(lruvec, steal_item, + n_reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + type, + n_reclaimed); + } + + oldest_for_putback = marie_find_oldest_occupied(type); + if (oldest_for_putback >= 0) + putback_gen = (u8)((oldest_for_putback + 1) + & (MARIE_PFN_NR_GENS - 1)); + else + putback_gen = (u8)atomic_read( + &marie_head_gen[type]); + + list_for_each_entry_safe(f, tmp, &folio_list, lru) { + u8 prev, w, target_tier; + struct lruvec *lv; + struct marie_lruvec *mlv; + unsigned long pfn; + int zone; + enum lru_list inst_lru; + + pfn = folio_pfn(f); + /* + * prev_tier comes straight from the per-PFN + * byte: counters_only preserved it across + * isolate and the publish below has not run + * yet, so the byte still encodes the tier this + * folio carried when it was isolated. (Replaces + * the old scratch_prev_tier[] capture + O(n^2) + * linear search back into scratch_batch.) + */ + if (pfn < marie_state_size) + prev = (READ_ONCE(marie_state[pfn]) & + MARIE_PFN_TIER_MASK) >> + MARIE_PFN_TIER_SHIFT; + else + prev = 0; + w = (folio_test_active(f) ? 2 : 0) | + (folio_test_workingset(f) ? 1 : 0); + target_tier = prev > w ? prev : w; + + list_del_init(&f->lru); + lv = folio_lruvec(f); + mlv = marie_get_lruvec(lv); + zone = folio_zonenum(f); + /* + * Normalize PG_active->0 BEFORE computing inst_lru, mirroring + * marie_folio_install() and marie_evict_locked(). The active hotness + * was already folded into target_tier (w) above, so nothing is lost. + * shrink_folio_list's activate_locked path can leave PG_active set on a + * Marie-isolated folio; crediting folio_lru_list() with it still set + * lands the survivor's +nr in ACTIVE_*, but every debit path + * (marie_evict_locked / marie_evict_counters_only) clears PG_active + * first and debits INACTIVE_*. 
That producer/consumer bucket split is + * what underflows mz->lru_zone_size at the eventual free + * ("mem_cgroup_update_lru_size: lru_size -1"). + */ + if (folio_test_active(f)) + folio_clear_active(f); + inst_lru = folio_lru_list(f); + + /* + * Survivor putback -- UNIFIED, mlv-independent. + * marie_state[pfn] still has TRACKED set from + * before isolate (counters_only preserves it), + * so re-publish a FRESH scan slot at + * (putback_gen, target_tier) -- publish-only, no + * clear-old (isolate already retired the old + * slot). The folio stays a Marie folio; we do + * NOT route it back through folio_putback_lru / + * folio_add_lru. That generic path re-enters the + * per-cpu folio_batch pipeline, which assumes + * legacy-LRU invariants (folio on a real list, + * counted in mz->lru_zone_size) that Marie folios + * break -- under heavy pressure it freed + * still-dirty swapbacked folios out of the batch + * drain ("Bad page state"). + * + * mlv is NULL only when marie_get_lruvec could + * not allocate the per-lruvec carrier under the + * GFP_ATOMIC reclaim context (or Marie was just + * disabled). Either way re-publishing into the + * per-PFN state is correct and needs no mlv; only + * the per-mlv counters are skipped (they live in + * the carrier that does not exist -- no leak, + * mirrors marie_evict_counters_only's !mlv path). + * The global vmstat / nr_folios counters ARE + * restored unconditionally to mirror the -nr + * counters_only applied at isolate. + */ + marie_state_publish_at_gen(pfn, putback_gen, + target_tier); + marie_memcg_bitmap_set(folio_memcg(f), pfn); + + /* + * Account the survivor's re-installation. mlv + * may be NULL under reclaim (GFP_ATOMIC carrier + * alloc fail or teardown race); the helper drops + * only the global counters in that case -- + * mirrors the !mlv path in + * marie_evict_counters_only / install_isolate. 
+ */ + marie_account_install_isolate(lv, mlv, f, + inst_lru, zone); + + if (!folio_put_testzero(f)) { + /* + * Isolation ref dropped, folio still alive. + * Set PG_lru so the next scan can re-isolate + * it via folio_test_clear_lru. + */ + folio_set_lru(f); + } else { + /* + * Isolation ref was the last one -- folio is + * being freed now. PG_lru is clear (was cleared + * at isolation), so __folio_put's + * __page_cache_release will not call + * del_page_from_lru_list and will not debit + * mz->lru_zone_size a second time -- isolation + * already debited it (the install +nr is settled + * by the isolate path), so a free-time debit here + * would underflow. + * + * shrink_folio_list's activate_locked path may + * set PG_active on a folio whose PG_lru is + * already clear (Marie isolated it). Normally + * PAGE_FLAGS_CHECK_AT_FREE is satisfied because + * folio_activate() checks PG_lru and is a no-op + * when it is clear -- but some stock paths set + * PG_active directly (e.g. folio_set_active in + * the deactivate batch). Clear it here; the + * folio has no live references and is not on any + * LRU list, so clearing PG_active is safe. + * + * Undo the putback counter increments before + * completing the free. Bitmaps and TRACKED are + * cleaned at buddy handoff by + * marie_state_drop_pfn_at_free. + */ + folio_clear_active(f); + marie_account_evict_isolate(lv, mlv, f, + inst_lru, + zone); + __folio_put(f); + } + } + + /* + * No deferred drop pass: the scan-bitmap slot was + * retired at isolate (counters_only), and the TRACKED + * byte of a reclaimed folio is wiped at its buddy + * handoff (marie_state_drop_pfn_at_free via the + * free_pages_prepare hook). Folios still alive in + * folio_list went through the survivor putback above, + * which re-published a fresh scan slot via + * marie_state_publish_at_gen. + */ + + } while (0); + + /* + * Per-iteration tail. 
+ * + * Bias controller update is skipped when: + * - !attempted_pick: external override (skip_file + * from clean_min_ratio) blocked the scan. The + * bias must track actual picking policy, not + * policy preempted before it ran. + * - skip_file is in effect for THIS call: even + * the ANON pick that succeeds during a + * skip_file regime is happening only because + * file was forcibly removed from contention. + * Freezing the controller during the override + * keeps the bias at its pre-override value, so + * when file recovers above clean_min_ratio the + * proportional regime resumes without an + * overshoot driven by anon-only reclaim that + * was never about the swappiness ratio. + * + * swappiness=1 (FILE_THEN_ANON) depletion-fallback + * gate (see the tail `if` below). Two independent + * reasons divert reclaim to ANON, on separate layers: + * + * 1. file < clean_min_ratio floor: handled UPFRONT by + * marie_file_floor_protect -> skip_file -> pick + * ANON_STRICT. Protects a minimum clean-file + * reserve and never reaches here (skip_file + * short-circuits FILE_THEN_ANON). + * + * 2. file >= floor but file reclaim cannot keep pace: + * detected HERE by the FILE pass FAILING TO MEET + * this call's reclaim target. A target-meeting FILE + * pass exits via the tier loop's + * sc_reclaim_target_reached() -> `goto done`, PAST + * this tail; so merely arriving here means file fell + * short. Occupancy/tier cannot tell reclaimability + * or throughput apart -- a tracked file folio may be + * hot/dirty/mapped, and how much actually frees is + * known only by trying (shrink_folio_list). The + * earlier gate keyed on the FILE pass returning + * EXACTLY zero, which conflates "no reclaimable + * file" with "file frees a positive trickle that + * cannot match the allocation rate": while any + * recyclable clean pagecache keeps cycling (refault / + * IO refill) the FILE pass returns >0 forever, anon + * is never scanned, and GBs of swappable anon OOM + * with swap free. 
Sufficiency, not exact-zero, is the + * correct depletion signal -> fall through to ANON. + * + * This does NOT wait for sc->priority to decay round by + * round: that would thrash file-first for several more + * rounds before conceding, which is the very stall we are + * eliminating. The fallback fires on the first call where + * file cannot satisfy the target. A transient file stall + * (oldest gen momentarily all dirty/writeback) costs at + * most one early anon batch -- acceptable on swappiness=1 + * / ZRAM, and far better than OOM-ing with swap free. + * + * `goto done` (target reached inside the tier loop) + * jumps PAST this tail intentionally: we are winning, + * the controller does not need a back-pressure tick. + */ + /* + * anon_unreclaimable forced FILE_STRICT above, + * bypassing the proportional controller; do not let + * those forced-file picks drive the bias (matches the + * "special swappiness values bypass the controller" + * rule -- the bias must resume cleanly once swap + * capacity returns and can_reclaim_anon flips back). + * + * The FILE_THEN_ANON depletion fallback (idx==1 ANON, + * reached only because the FILE pass found nothing + * reclaimable) is likewise a forced pick driven by file + * depletion, not by the swappiness ratio, so it must not + * drive the bias either. + */ + if (attempted_pick && !skip_file && !anon_unreclaimable && + !(pick_kind == MARIE_PICK_FILE_THEN_ANON && idx == 1) && + likely(!oom_victim)) + marie_swap_bias_update(mlv_bias, type, + n_reclaimed, swappiness); + if (likely(!oom_victim) && + pick_kind == MARIE_PICK_FILE_THEN_ANON && + idx == 0 && !skip_file) { + /* + * swappiness=1 depletion fallback -- + * SUFFICIENCY-gated, not zero-gated and not + * priority-gated. 
+ * + * FILE is still strongly preferred: it is + * type_order[0], scanned first and in full + * every call, and a FILE pass that MEETS this + * call's reclaim target short-circuits via the + * tier loop's sc_reclaim_target_reached() -> + * `goto done`, which jumps PAST this gate and + * never engages anon. So simply reaching this + * gate means FILE did NOT satisfy the target -- + * file reclaim is not keeping pace this call. + * + * The old gate broke out on ANY n_reclaimed > 0, + * requiring a FILE pass of EXACTLY zero before it + * would touch anon. While even a trickle of + * recyclable clean pagecache keeps cycling + * (refault / IO refill), FILE returns >0 every + * call, the break pinned reclaim file-only, and + * GBs of swappable anon were never scanned -- + * OOM with swap free (tail /dev/zero). Waiting + * for sc->priority to decay round by round before + * resorting to anon would just thrash file-first + * for several more rounds first, which is exactly + * the stall we want to avoid. + * + * Engage the ANON pass NOW. If the final FILE + * batch happened to tip the target without a + * re-check, the idx==1 ANON pass self-aborts at + * its own sc_reclaim_target_reached() gate, so no + * anon is over-reclaimed. Reaching FILE_THEN_ANON + * proved anon is reclaimable (else FILE_STRICT), + * so swap capacity is available by construction. 
+				 */
+				drain_mask |= MARIE_DRAIN_ANON;
+			}
+		}
+done:
+		if (using_percpu)
+			atomic_set(&buf->in_use, 0);
+	}
+
+	return drain_mask;
+}
+
+
+/* --- per-mlv lifecycle and install / evict implementations --- */
+
+
+/*
+ * marie_drain_depth - per-CPU counters, one slot per LRU type (the array
+ * has ANON_AND_FILE entries and is indexed by the @type argument of the
+ * three helpers below). enter increments, exit decrements, and
+ * marie_in_drain_type() reports whether this CPU's counter for @type is
+ * above zero, so enter/exit pairs may nest.
+ *
+ * NOTE(review): @type is used as an array index with no range check --
+ * callers presumably pass only 0 (anon) or 1 (file); confirm.
+ * NOTE(review): each helper acts on whichever CPU executes it, so a
+ * matching enter/exit pair presumably has to run on the same CPU (no
+ * migration in between) for the depth to balance; confirm at call sites.
+ */
+static DEFINE_PER_CPU(int[ANON_AND_FILE], marie_drain_depth);
+
+/* Increment this CPU's drain depth for @type. */
+void marie_drain_enter_type(int type)
+{
+	this_cpu_inc(marie_drain_depth[type]);
+}
+/* Decrement this CPU's drain depth for @type (pairs with enter). */
+void marie_drain_exit_type(int type)
+{
+	this_cpu_dec(marie_drain_depth[type]);
+}
+/* Return true while this CPU's drain depth for @type is above zero. */
+bool marie_in_drain_type(int type)
+{
+	return this_cpu_read(marie_drain_depth[type]) > 0;
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Install / evict — direct synchronous transitions under lru_lock
+ * ---------------------------------------------------------------------
+ *
+ * The per-PFN paradigm reduces every Marie folio's state to a single
+ * bit (TRACKED in marie_state[pfn]). There are exactly two state
+ * transitions:
+ *
+ *   marie_folio_install: TRACKED 0 -> 1 (writes gen, tier, type,
+ *                        zone, sets PG_lru, bumps counters; defined
+ *                        below, declared in pfn_install.h)
+ *   marie_evict_locked:  TRACKED 1 -> 0 (counter decrements +
+ *                        per-PFN state wipe via marie_state_drop_pfn)
+ *
+ * Both are called with the caller's lru_lock irqsave held, so the
+ * per-PFN byte write, the bitmap mutations, and the per-mlv counter
+ * updates all run in the same atomic context. PG_active hygiene and
+ * other cross-cutting concerns are concentrated here.
+ */
+
+
+/*
+ * marie_folio_install - unified fresh install (TRACKED 0 -> 1).
+ *
+ * Replaces the former marie_install_local / marie_install_locked pair.
+ * The two used to differ only in the order of (publish, account, flag)
+ * and in the PG_lru set method; this canonical form picks set_mask_bits
+ * (atomic PG_active clear + PG_lru set in one mask write) and the
+ * publish -> flag -> account order from install_local.
+ *
+ * Call sites:
+ *   - lru_marie_add_folio (THP under per-type lock, small folio direct)
+ *   - marie_change_state_lruvec (gate-on fill, under per-type both lock)
+ *
+ * Per-type lock is a property of the caller, not of this function: the
+ * body only requires lru_lock + IRQs off and behaves identically whether
+ * or not the caller additionally holds the per-type lock.
+ *
+ * @folio: folio to install; its PFN indexes marie_state[].
+ * @mlv:   Marie per-lruvec carrier; mlv->lruvec->lru_lock must be held
+ *         with IRQs disabled (both are lockdep-asserted below).
+ *
+ * Return: always true. A fresh install returns true after publishing and
+ * accounting the folio. The "already TRACKED" early-out ALSO returns true
+ * (deliberately, not false) so that the caller does not run its legacy
+ * fallback on a folio Marie still owns -- see the comment at the
+ * early-out in the body. No path in this function returns false.
+ * See pfn_install.h for the contract documentation.
+ */
+bool marie_folio_install(struct folio *folio, struct marie_lruvec *mlv)
+{
+	bool was_active, was_workingset;
+	unsigned int tier;
+	int type, zone;
+	u8 head;
+	enum lru_list inst_lru;
+	unsigned long pfn;
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * "Already TRACKED" early-out. A folio reaching install while its
+	 * per-PFN byte is still TRACKED is a Marie-owned, reclaim-isolated
+	 * folio (the deferred-teardown design preserves TRACKED while PG_lru
+	 * is cleared) being re-added through a path that lacks a TRACKED gate
+	 * -- e.g. folio_add_lru()/folio_putback_lru() on an anon folio that
+	 * reclaim isolated into the swap cache and a fault then swaps back in.
+	 * Re-installing would re-set PG_lru and double-count per-mlv counters;
+	 * the resurrected PG_lru then survives onto the buddy free path and
+	 * trips "Bad page state |lru|" PAGE_FLAGS_CHECK_AT_FREE. Bail so the
+	 * in-flight reclaim retains ownership.
+	 *
+	 * Return TRUE, not false: returning false tells lruvec_add_folio() to
+	 * run its LEGACY fallback (update_lru_size(+nr) + list_add onto a real
+	 * lruvec->lists[lru]) on a folio that is STILL TRACKED and that Marie
+	 * never credited to mz->lru_zone_size. That stray, never-debited mz
+	 * credit + a folio cross-linked onto a legacy list is exactly the
+	 * mz->lru_zone_size underflow ("lru_size -1") we were chasing. TRUE
+	 * means "Marie owns it, do not add anywhere" -- which is what "retain
+	 * ownership" requires.
+	 */
+	pfn = folio_pfn(folio);
+	if (pfn < marie_state_size &&
+	    (READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED))
+		return true;
+
+	/*
+	 * Workingset signal capture: (PG_active, PG_workingset) -> tier.
+	 *   (0,0) tier 0                        cold
+	 *   (0,1) tier 1                        workingset, distance too large
+	 *   (1,0) tier 2                        recent refault, never workingset before
+	 *   (1,1) tier 3 = MARIE_PFN_TIER_MAX   established hot
+	 * Read PG_active BEFORE clearing so the captured tier matches the
+	 * byte we publish below. PG_workingset stays set:
+	 * workingset_eviction's shadow encoding needs it at next eviction.
+	 */
+	was_active = folio_test_active(folio);
+	was_workingset = folio_test_workingset(folio);
+	tier = ((unsigned int)was_active << 1) | (unsigned int)was_workingset;
+	tier &= MARIE_PFN_TIER_MAX;
+
+	if (was_active)
+		folio_clear_active(folio);
+
+	/*
+	 * folio->lru MUST be re-initialised here. A recycled folio arrives
+	 * with LIST_POISON{1,2} from the prior owner's list_del, and the
+	 * eventual marie_evict_locked's list_del_init would walk the
+	 * poison pointers and fault.
+	 */
+	INIT_LIST_HEAD(&folio->lru);
+
+	type = folio_is_file_lru(folio);
+	zone = folio_zonenum(folio);
+	head = (u8)atomic_read(&marie_head_gen[type]);
+
+	/*
+	 * Publish per-PFN state byte + scan bitmap + memcg L1 +
+	 * gen_occupied++. See pfn_install.h::marie_pfn_publish_inherit.
+	 */
+	marie_pfn_publish_inherit(folio, type, head, (u8)tier, zone);
+
+	/*
+	 * Bump the install gauge and let the advance hook fire its periodic
+	 * gen-advance decision. Split path intentionally skips this bump
+	 * because the split tail inherits its parent's install budget
+	 * (already counted at the parent's fault-install).
+	 */
+	this_cpu_inc(marie_gen_installs_pc[head][type]);
+	marie_install_advance_hook(type);
+
+	/*
+	 * Atomic PG_active->0 + PG_lru->1 in one mask write. PG_active was
+	 * cleared above when set; the mask write keeps the invariant
+	 * against the defensive case where another path set PG_active
+	 * between then and now. Ordered AFTER the state-byte publish so a
+	 * concurrent __page_cache_release observing PG_lru=1 also observes
+	 * marie_state[pfn] & MARIE_PFN_TRACKED.
+	 */
+	set_mask_bits(&folio->flags.f, BIT(PG_active), BIT(PG_lru));
+	inst_lru = folio_lru_list(folio);
+
+	marie_account_install(mlv, folio, inst_lru, zone);
+
+	return true;
+}
+
+bool marie_evict_locked(struct marie_lruvec *mlv, struct folio *folio)
+{
+	int zone = folio_zonenum(folio);
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * folio->lru is either a self-loop (install/flush leave it that
+	 * way, and the per-PFN paradigm never re-attaches it onto a
+	 * Marie-owned list) or on legacy lruvec->lists[lru] after a
+	 * drain handed it off. list_del_init is a no-op in the first
+	 * case and a legacy-list removal in the second; the caller
+	 * holds lruvec->lru_lock for the latter, so no extra Marie-side
+	 * lock is required.
+	 */
+	list_del_init(&folio->lru);
+
+	/*
+	 * PG_active hygiene MUST happen before folio_lru_list() below.
+	 * The install helper clears PG_active and then computes the lru
+	 * index, so install always credits INACTIVE_*. If we read
+	 * folio_lru_list() here while PG_active is still set (e.g. via
+	 * folio_activate() on a tracked folio between install and del),
+	 * we would decrement ACTIVE_* -- an LRU index Marie's install
+	 * never +1'd -- and trip the mz->lru_zone_size underflow WARN.
+	 * Mirror install's order: clear PG_active, then compute lru.
+	 *
+	 * Also drops PG_active for shrink_folio_list, which trips
+	 * VM_BUG_ON_FOLIO(folio_test_active) otherwise.
+	 */
+	if (folio_test_active(folio))
+		folio_clear_active(folio);
+
+	marie_account_evict(mlv, folio, folio_lru_list(folio), zone);
+
+	/*
+	 * Clear PG_lru BEFORE marie_state_drop_pfn so a concurrent
+	 * del-side path gated on folio_test_clear_lru cannot observe
+	 * (state=TRACKED, PG_lru=1) -> Marie del again recursion.
+	 * drop_pfn then wipes the per-PFN state (byte, bitmap,
+	 * l2_range_count, memcg L1) which is the only Marie tracking
+	 * for this folio.
+	 *
+	 * Idempotent for callers that already cleared PG_lru via
+	 * folio_test_clear_lru before reaching evict
+	 * (__page_cache_release, marie_state_shrink_lruvec claim loop).
+	 */
+	folio_clear_lru(folio);
+	marie_state_drop_pfn(folio);
+
+	return true;
+}
+
+/*
+ * marie_evict_counters_only - reclaim-isolate per-folio counter decrement
+ * that also retires the scan-bitmap slot, but PRESERVES marie_state[]'s
+ * TRACKED bit.
+ *
+ * The per-PFN state byte staying TRACKED throughout shrink_folio_list is
+ * the race defence: marie_folio_install's "already TRACKED" early-out
+ * makes a concurrent install on this PFN bail, so install cannot set
+ * PG_lru on the folio while shrink_folio_list is reclaiming it. (The
+ * earlier full marie_evict_isolated cleared TRACKED inline; a concurrent
+ * install would then succeed, set PG_lru, and trip
+ * PAGE_FLAGS_CHECK_AT_FREE at free_unref_folios in the success path.)
+ *
+ * The global (type, gen, tier) bitmap bit + gen_occupied slot ARE dropped
+ * here, at isolate. The bit is the scanner's candidate index, and an
+ * isolated folio is no longer a candidate: leaving it set lets every
+ * other CPU's scanner re-find the same in-flight PFN for the whole
+ * swap-out window (the claim fails on the already-cleared PG_lru, but the
+ * re-scan / re-batch work is pure waste, and a folio shrink_folio_list
+ * chose to KEEP can get re-isolated before its second chance is honoured
+ * -> avoidable refaults). Retiring the scan slot here while keeping the
+ * TRACKED byte separates "is a scan candidate" (bitmap) from "blocks a
+ * concurrent install" (byte). l2_count / gen_occupied stay balanced 1:1:
+ * the matching set is the install; the matching re-set, for a survivor,
+ * is marie_state_publish_at_gen at putback; a reclaimed folio's byte is
+ * wiped at the buddy free hook, which finds the bit already clear.
+ *
+ * Caller-side gates that hold throughout this path:
+ *   1. folio_try_get()        - reference held, folio cannot be freed.
+ *   2. folio_test_clear_lru() - PG_lru cleared atomically, gating
+ *                               external del paths.
+ *   3. marie_folio_install's "already TRACKED" early-out (above; the
+ *      successor of the former install_local early-out).
+ *
+ * memcg_bitmap is cleared here because the buddy free hook
+ * (marie_state_drop_pfn_at_free) runs without a folio reference and cannot
+ * derive memcg later.
+ *
+ * Counters are decremented immediately so the in-flight folio does not
+ * inflate lruvec_lru_size() and skew reclaim pressure heuristics during
+ * shrink_folio_list. The scan bitmap + gen_occupied are torn down HERE so
+ * the in-flight folio leaves the candidate index immediately; only the
+ * TRACKED byte teardown is deferred (to the buddy free hook for reclaimed
+ * folios). Survivors go through the putback path, which re-publishes a
+ * fresh scan slot via marie_state_publish_at_gen and re-sets PG_lru.
+ *
+ * NOTE(review): the state byte is sampled once with READ_ONCE and its
+ * (gen, tier, type) is then used for the bitmap clear; the folio_batch
+ * hooks further down describe marie_state_move_to_gen as lock-free, so
+ * confirm that a tracked-but-isolated PFN cannot be relocated between
+ * the sample and the clear.
+ */
+void marie_evict_counters_only(struct folio *folio)
+{
+	struct lruvec *lv = folio_lruvec(folio);
+	struct marie_lruvec *mlv = marie_get_lruvec(lv);
+	int zone = folio_zonenum(folio);
+	enum lru_list del_lru;
+
+	if (unlikely(!list_empty(&folio->lru))) {
+		/*
+		 * Defensive: an mm/swap.c batch path lacking a Marie gate
+		 * may have placed this folio onto a legacy lruvec list via
+		 * lruvec_add_folio_tail. The caller's list_add(&f->lru, ...)
+		 * would then corrupt that list. Detach under lru_lock first;
+		 * DO NOT fall back to lru_marie_del_folio (it would clear
+		 * TRACKED via marie_state_drop_pfn, breaking the deferred-
+		 * teardown invariant the putback path relies on).
+		 */
+		VM_WARN_ON_ONCE_FOLIO(1, folio);
+		scoped_guard(spinlock_irq, &lv->lru_lock)
+			list_del_init(&folio->lru);
+	}
+
+	if (folio_test_active(folio))
+		folio_clear_active(folio);
+
+	del_lru = folio_lru_list(folio);
+
+	/*
+	 * marie_account_evict_isolate owns the local_irq_save/restore that
+	 * the lock-free reclaim path needs against same-CPU softirq
+	 * reentrancy on fbc->lock and the per-CPU vmstat diff (see the
+	 * helper's contract in account.h, and 9c6a93782's lockup history).
+	 * mlv may be NULL: the helper drops only the global counters in
+	 * that case (no leak -- the missing carrier took the per-mlv
+	 * counters with it).
+	 */
+	marie_account_evict_isolate(lv, mlv, folio, del_lru, zone);
+
+	/*
+	 * Retire the scan-bitmap slot + gen_occupied at isolate (see the
+	 * function comment). Read the still-TRACKED byte for its (gen, tier,
+	 * type) coordinate; the byte itself is left TRACKED for the install-
+	 * race early-out. These bit ops are atomic and need no IRQ-off
+	 * window; the helper's local_irq_save/restore is scoped to the
+	 * counters that actually need it.
+	 */
+	{
+		unsigned long pfn = folio_pfn(folio);
+
+		if (pfn < marie_state_size) {
+			u8 s = READ_ONCE(marie_state[pfn]);
+
+			if (s & MARIE_PFN_TRACKED) {
+				u8 g = (s & MARIE_PFN_GEN_MASK) >>
+					MARIE_PFN_GEN_SHIFT;
+				u8 tr = (s & MARIE_PFN_TIER_MASK) >>
+					MARIE_PFN_TIER_SHIFT;
+				u8 tb = (s & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+
+				marie_bm_clear(&marie_track_bm[tb][g][tr], pfn);
+				atomic_long_dec(&marie_gen_occupied[g][tb]);
+			}
+		}
+	}
+
+	/* Clear memcg bitmap now (folio gone before post-reclaim drop runs). */
+	marie_memcg_bitmap_clear(folio_memcg(folio), folio_pfn(folio));
+}
+
+/*
+ * lru_marie_mark_accessed - record an access on a Marie-tracked folio.
+ *
+ * Bumps the per-PFN tier; marie_state_inc_tier handles both the
+ * non-saturated bump (WRITE_ONCE) and the saturated promote
+ * (marie_state_move_to_gen to head_gen + tier 0) internally.
+ *
+ * Does nothing when Marie is disabled, the state array is not ready, the
+ * PFN is outside marie_state[], or the PFN's byte is not TRACKED.
+ */
+void lru_marie_mark_accessed(struct folio *folio)
+{
+	unsigned long pfn = folio_pfn(folio);
+	u8 state;
+
+	if (!lru_marie_enabled() || !marie_state_ready())
+		return;
+	if (pfn >= marie_state_size)
+		return;
+	state = READ_ONCE(marie_state[pfn]);
+	if (!(state & MARIE_PFN_TRACKED))
+		return;
+
+	/* Bump the access tier toward MAX (hotter). */
+	marie_state_inc_tier(pfn);
+	/*
+	 * Convert a pending PG_referenced into PG_workingset: the flag is
+	 * test-and-cleared, and only when it was set is PG_workingset set,
+	 * marking the folio as recently accessed for the workingset
+	 * estimator.
+	 */
+	if (folio_test_clear_referenced(folio))
+		folio_set_workingset(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_mark_accessed);
+
+/*
+ * Per-cpu folio_batch LRU-op hooks (declared in <linux/lru_marie.h> --
+ * NOTE(review): the header name was missing from this comment and is
+ * restored from context; confirm).
+ * Each applies the op directly on the folio's per-PFN state and returns
+ * true so mm/swap.c skips the legacy folio_batch; false (Marie off / folio
+ * untracked) falls through to the legacy path. All run lock-free:
+ * marie_state_move_to_gen is CAS-based and its bitmap ops are atomic,
+ * matching the no-lru_lock contract of these entry points.
+ */
+
+/*
+ * Demote: relocate to the oldest live gen at tier 0 so Marie's next scan
+ * reclaims it promptly. Used for the EXPLICIT user "make cold" madvise
+ * (MADV_COLD -> folio_deactivate / deactivate_file_folio). Reclaim-internal
+ * hints (activate / rotate) deliberately do NOT demote -- see those hooks.
+ */ +static bool marie_folio_demote(struct folio *folio) +{ + int type, oldest; + + if (!lru_marie_enabled() || !folio_marie_test_tracked(folio)) + return false; + type = folio_is_file_lru(folio); + oldest = marie_find_oldest_occupied(type); + if (oldest >= 0) + marie_state_move_to_gen(folio_pfn(folio), (u8)oldest, 0); + return true; +} + +bool lru_marie_deactivate(struct folio *folio) +{ + return marie_folio_demote(folio); +} +EXPORT_SYMBOL_GPL(lru_marie_deactivate); + +/* + * rotate: NO-OP for Marie folios (skip the legacy batch). Like activate + * this is a reclaim-internal hint (folio_rotate_reclaimable fires on + * writeback completion of a PG_reclaim folio). An actively reclaimed Marie + * folio is isolated (PG_lru cleared) so this is rarely reached, and Marie's + * gen aging already orders reclaim -- no per-PFN state change is wanted. + */ +bool lru_marie_rotate(struct folio *folio) +{ + return lru_marie_enabled() && folio_marie_test_tracked(folio); +} +EXPORT_SYMBOL_GPL(lru_marie_rotate); + +/* + * activate: NO-OP for Marie folios (but skip the legacy batch by returning + * true). folio_activate is driven mostly by shrink_folio_list's + * FOLIOREF_ACTIVATE during reclaim, and Marie already decides retention + * there via its tier vote in folio_check_references. Promoting to the head + * gen on top would pull referenced folios out of the oldest gen on every + * reclaim pass; under an all-hot workload that starves reclaim entirely + * (OOM with GBs of unreclaimable inactive_anon). The explicit-access + * channel is folio_mark_accessed -> lru_marie_mark_accessed (tier bump), + * which must not be double-counted here. + */ +bool lru_marie_activate(struct folio *folio) +{ + return lru_marie_enabled() && folio_marie_test_tracked(folio); +} +EXPORT_SYMBOL_GPL(lru_marie_activate); + +/* + * MADV_FREE: make the anon folio reclaim-without-writeback. 
Clear the + * dirtiness signals synchronously (what the legacy lru_lazyfree move_fn + * does) and demote so Marie frees it promptly without swap on the next + * scan. type is read before clearing swapbacked (folio_is_file_lru flips + * once swapbacked is gone); the Marie byte keeps its anon TYPE, so demote + * stays within the anon gen ring. + */ +bool lru_marie_lazyfree(struct folio *folio) +{ + int type, oldest; + + if (!lru_marie_enabled() || !folio_marie_test_tracked(folio)) + return false; + type = folio_is_file_lru(folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + folio_clear_swapbacked(folio); + count_vm_events(PGLAZYFREE, folio_nr_pages(folio)); + oldest = marie_find_oldest_occupied(type); + if (oldest >= 0) + marie_state_move_to_gen(folio_pfn(folio), (u8)oldest, 0); + return true; +} +EXPORT_SYMBOL_GPL(lru_marie_lazyfree); + +/* + * folio_marie_get_tier (public API in ): returns the + * folio's tier, or 0 when Marie is off, the PFN is out of range, or the + * folio is untracked. + */ +unsigned int folio_marie_get_tier(const struct folio *folio) +{ + unsigned long pfn = folio_pfn((struct folio *)folio); + u8 state; + + if (!marie_state || pfn >= marie_state_size) + return 0; + state = READ_ONCE(marie_state[pfn]); + if (!(state & MARIE_PFN_TRACKED)) + return 0; + return (state & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT; +} +EXPORT_SYMBOL_GPL(folio_marie_get_tier); + +/* + * lru_marie_test_tracked (public API in ). + */ +bool lru_marie_test_tracked(const struct folio *folio) +{ + return folio_marie_test_tracked(folio); +} +EXPORT_SYMBOL_GPL(lru_marie_test_tracked); + +/* + * lru_marie_free_page_hook (public API in ). + * Thin wrapper over marie_state_drop_pfn_at_free so the page allocator + * can call the hook without including the private state.h. 
+ */ +void lru_marie_free_page_hook(unsigned long pfn) +{ + marie_state_drop_pfn_at_free(pfn); +} +EXPORT_SYMBOL_GPL(lru_marie_free_page_hook); + +/* + * marie_tier_inc - bump @folio's tier via marie_state_inc_tier(). + * + * @mlv is not referenced in the body. Always returns MARIE_TIER_INC_OK. + */ +enum marie_tier_inc_result marie_tier_inc(struct marie_lruvec *mlv, + struct folio *folio) +{ + /* + * The saturated promote is folded into marie_state_inc_tier + * itself: when current tier == MAX, the helper invokes + * marie_state_move_to_gen(pfn, head, 0). The walker tier path + * is a single per-PFN write here. + * + * Return value is preserved for ABI symmetry but the single + * caller (walker.c marie_walk_pmd_range) discards it. + */ + marie_state_inc_tier(folio_pfn(folio)); + return MARIE_TIER_INC_OK; +} + +/* + * marie_del_folio_locked - lru_marie_del_folio body. + * + * External-removal entry: if the folio is still Marie-tracked, do the + * full evict via marie_evict_locked, which routes through + * marie_account_evict and owns the ENTIRE counter wind-down -- including + * the single marie_nr_folios -1. The caller does no accounting of its + * own; an earlier caller-side -1 predated the account.h funnel and + * double-counted marie_nr_folios on every generic del of a tracked folio. + * + * Lock contract: caller holds lruvec->lru_lock. No Marie lock is taken + * here -- the lru_lock invariant already serialises every Marie state + * mutation. See the comment above the call site in lru_marie_del_folio + * for the full protection-model rationale. + * + * Returning true tells the dispatcher (lruvec_del_folio in + * include/linux/mm_inline.h) "Marie owns this folio, do not fall + * through to legacy". + * + * The not-tracked branch returns true defensively. Under the lru_lock + * invariant it is unreachable -- the caller's TRACKED fast-path test + * already gated entry here -- but returning true keeps the safe + * behaviour if the invariant ever regresses: a stray legacy + * update_lru_size on a folio Marie already accounted would double- + * decrement mz->lru_zone_size.
+ */ +bool marie_del_folio_locked(struct marie_lruvec *mlv, struct folio *folio) +{ + lockdep_assert_held(&mlv->lruvec->lru_lock); + lockdep_assert_irqs_disabled(); + + if (folio_marie_test_tracked(folio)) + return marie_evict_locked(mlv, folio); + + /* Defensive: an untracked folio is still reported as handled. */ + return true; +} + +/* + * --------------------------------------------------------------------- + * lruvec lifecycle + * --------------------------------------------------------------------- + */ + +/* + * marie_type_init: set up only the scalar and lock members of @t. + * @t->nr_pages (percpu_counter) is left to marie_alloc_lruvec, because + * percpu_counter_init can fail and has to be rolled back together with + * the lru_zone_size counters. + */ +static void marie_type_init(struct marie_type *t, int type) +{ + t->type = type; + spin_lock_init(&t->type_lock); +} + +/* + * Hand off every Marie-tracked PFN from @child_mlv's lruvec to + * @parent_mlv's. Used at memcg reparenting (mm/memcontrol.c) so folios + * charged to a dying memcg appear under the parent for subsequent + * per-memcg targeted reclaim. + * + * Caller holds both lruvecs' lru_lock and IRQs disabled (the memcg + * offline path's reparent_locks), so concurrent del / install paths + * targeting either lruvec are pending behind those locks. Caller also + * holds @child_mlv's both type_locks via marie_both_mlv. + * + * No per-folio iteration: the per-PFN state byte stores (gen, tier, + * type, zone) which are memcg-agnostic and do not change on reparent. + * What changes is membership in the per-memcg L1/L2 bitmaps, which is + * an L2-pruned bitmap OR (marie_memcg_bitmap_merge) -- cost scales + * with the number of populated 32 MiB ranges in @child, not with the + * folio count. The accompanying per-type and per-(lru, zone) counters + * are transferred with a percpu_counter sum / zero / add sequence. + * + * Returns 0: marie_nr_folios is unchanged (the state bytes still say + * TRACKED, the folios continue to count globally).
+ * + * @parent_mlv == NULL: child's bitmap and counters are zeroed out; + * folios fall back to global tracking with no per-memcg filter + * (equivalent to being charged to root_memcg from the bitmap's + * perspective). The next del path still cleans them up correctly + * because the state byte and global bitmaps remain consistent. + */ +long marie_reparent_locked(struct marie_lruvec *child_mlv, + struct marie_lruvec *parent_mlv) +{ + struct mem_cgroup *child_memcg = lruvec_memcg(child_mlv->lruvec); + struct mem_cgroup *parent_memcg = parent_mlv ? + lruvec_memcg(parent_mlv->lruvec) : NULL; + int t, z; + enum lru_list lru; + + /* parent_memcg is NULL here when @parent_mlv is NULL. */ + marie_memcg_bitmap_merge(parent_memcg, child_memcg); + + /* + * percpu_counter has no atomic xchg primitive; under the held + * lru_lock + type_lock pair the sum-then-zero-then-add pattern + * is functionally equivalent because no other writer can race the + * child's counters while we hold them. + */ + for (t = 0; t < ANON_AND_FILE; t++) { + struct marie_type *child_type = &child_mlv->types[t]; + s64 n = percpu_counter_sum(&child_type->nr_pages); + + percpu_counter_set(&child_type->nr_pages, 0); + /* Only a positive sum is forwarded; n <= 0 is dropped here. */ + if (n > 0 && parent_mlv) + marie_pc_add( + &parent_mlv->types[t].nr_pages, n); + } + + for (lru = 0; lru < NR_LRU_LISTS; lru++) { + for (z = 0; z < MAX_NR_ZONES; z++) { + s64 n; + + /* + * Fold the child's deferred isolate mz delta into its + * mz->lru_zone_size first (both lru_locks held), so the + * shadow-based transfer below brings child mz exactly to + * 0. Skipping this would leave child mz at -pending.
+ */ + marie_mz_drain_locked(child_mlv, lru, z); + + n = percpu_counter_sum( + &child_mlv->marie_lru_zone_size[lru][z]); + + percpu_counter_set( + &child_mlv->marie_lru_zone_size[lru][z], 0); + /* Unlike the per-type loop, any non-zero n is transferred. */ + if (!n) + continue; + /* + * NOTE(review): with @parent_mlv == NULL this -n has no + * matching +n. If marie_update_lru_size also adjusts + * node/zone-wide counters, those would drop by n while the + * folios stay tracked -- confirm callers never pass NULL or + * that this is intended. + */ + marie_update_lru_size(child_mlv->lruvec, lru, z, -n); + if (parent_mlv) { + marie_pc_add( + &parent_mlv->marie_lru_zone_size[lru][z], + n); + marie_update_lru_size(parent_mlv->lruvec, + lru, z, n); + } + } + } + + return 0; +} + +/* + * marie_alloc_lruvec - allocate and initialise the Marie side of @lv. + * + * Allocates a zeroed marie_lruvec with @gfp, records @lv, its memcg and + * its node id, then initialises one nr_pages percpu_counter per type and + * one marie_lru_zone_size percpu_counter per (lru, zone). + * + * Returns the new marie_lruvec, or NULL if kzalloc or any + * percpu_counter_init fails. On failure every counter initialised so far + * is destroyed and the struct is freed. + */ +struct marie_lruvec *marie_alloc_lruvec(struct lruvec *lv, gfp_t gfp) +{ + struct marie_lruvec *mlv; + int t, lru, z; + /* Rollback bookkeeping: how far each init loop got. */ + int initialised_t = 0; + int initialised_lru = 0; + int initialised_z = 0; + + mlv = kzalloc(sizeof(*mlv), gfp); + if (!mlv) + return NULL; + + mlv->lruvec = lv; + mlv->memcg = lruvec_memcg(lv); + mlv->nid = lruvec_pgdat(lv)->node_id; + + for (t = 0; t < ANON_AND_FILE; t++) { + marie_type_init(&mlv->types[t], t); + if (percpu_counter_init(&mlv->types[t].nr_pages, 0, gfp)) + goto fail_types; + initialised_t = t + 1; + } + + for (lru = 0; lru < NR_LRU_LISTS; lru++) { + for (z = 0; z < MAX_NR_ZONES; z++) { + if (percpu_counter_init( + &mlv->marie_lru_zone_size[lru][z], 0, gfp)) + goto fail_zones; + initialised_z = z + 1; + } + /* Row complete: count it and restart the per-row progress. */ + initialised_lru = lru + 1; + initialised_z = 0; + } + + return mlv; + +fail_zones: + /* Roll back any (lru, z) percpu_counters initialised so far.
+ * First the partially initialised row (its index equals + * initialised_lru), then every fully initialised row below it. + */ + for (z = 0; z < initialised_z; z++) + percpu_counter_destroy( + &mlv->marie_lru_zone_size[initialised_lru][z]); + for (lru = 0; lru < initialised_lru; lru++) + for (z = 0; z < MAX_NR_ZONES; z++) + percpu_counter_destroy( + &mlv->marie_lru_zone_size[lru][z]); +fail_types: + /* Reached directly on a per-type failure, or by fall-through above. */ + for (t = 0; t < initialised_t; t++) + percpu_counter_destroy(&mlv->types[t].nr_pages); + kfree(mlv); + return NULL; +} + +/* + * marie_free_lruvec - destroy every percpu_counter set up by + * marie_alloc_lruvec and free @mlv. @mlv is dereferenced + * unconditionally, so it must not be NULL. + */ +void marie_free_lruvec(struct marie_lruvec *mlv) +{ + int t, lru, z; + + for (t = 0; t < ANON_AND_FILE; t++) + percpu_counter_destroy(&mlv->types[t].nr_pages); + for (lru = 0; lru < NR_LRU_LISTS; lru++) + for (z = 0; z < MAX_NR_ZONES; z++) + percpu_counter_destroy( + &mlv->marie_lru_zone_size[lru][z]); + kfree(mlv); +} + +/* + * marie_counters_init - initialise the global marie_nr_folios + * percpu_counter to 0. Returns percpu_counter_init()'s result. + */ +int marie_counters_init(void) +{ + return percpu_counter_init(&marie_nr_folios, 0, GFP_KERNEL); +} diff --git a/mm/lru_marie/state.h b/mm/lru_marie/state.h new file mode 100644 index 0000000000..50cafe1831 --- /dev/null +++ b/mm/lru_marie/state.h @@ -0,0 +1,1335 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_STATE_H +#define _MM_LRU_MARIE_STATE_H + +#include +#include "bitmap.h" /* struct marie_bitmap, MARIE_L2_BITS, marie_bm_* */ + +/* + * Marie per-PFN state array — paradigm specification. + * ====================================================== + * + * Marie represents every folio's reclaim state as a single byte in + * a flat per-PFN array allocated once at boot. Each Marie operation + * on a folio is a single byte read or write at marie_state[pfn] — + * there is no allocation anywhere in the fault / del / aging fast + * paths, no linked-list traversal, no per-CPU staging. + * + * The array is sized once at boot to cover totalram_pages PFNs + * (~4 MB on a 16 GiB box, ~16 MB on 64 GiB; the same scale as a + * 1/64-th miniature struct page) and never grows or shrinks. The + * 32-bit PFN gate (marie_init's MARIE_MAX_SUPPORTED_PFN check) + * caps the worst-case array size at 4 GiB.
+ * + * + * Byte layout + * ----------- + * + * bit 7 TRACKED 1 = folio is owned by Marie; 0 = ignore byte + * bit 6 TYPE 1 = file LRU, 0 = anon LRU + * bit 5..4 ZONE folio_zonenum: 0=DMA, 1=DMA32, 2=NORMAL, 3=MOVABLE + * bit 3..2 GEN relative-position 0..3 in the cycling ring + * (0 = oldest, head = atomic_read(&marie_head_gen[type])) + * bit 1..0 TIER 0=cold, 1=workingset, 2=active, 3=refault + * + * The 8 bits saturate the byte. Bits are laid out in reclaim filter + * hierarchy from MSB (root: existence) down to LSB (leaf: hotness), + * so the isolate scan can extend its (s & mask) == target test by + * widening @mask from the top: + * + * (byte == 0) -> untracked (single-cycle skip) + * (byte & 0x80) -> TRACKED + * (byte & 0xC0) -> TRACKED + type + * (byte & 0xF0) -> TRACKED + type + zone + * (byte & 0xFC) -> TRACKED + type + zone + gen + * (byte & 0xFF) -> all five dimensions + * + * The whole filter is a pure byte mask + compare with no pfn_folio() + * dereference required to make a candidate / skip decision — the + * inner loop scales to AVX-512 vpand+vpcmpeqb at 64 byte per cycle + * and reserves the struct page touch for confirmed candidates only. + * + * The zone field truncates to 2 bits. ZONE_DEVICE (when enabled) + * never reaches Marie because the dispatcher gates on regular LRU + * folios; ZONE_HIGHMEM is 32-bit-only and excluded by the 32-bit + * PFN gate. So the 4 zone codes cover every Marie-tracked folio in + * practice. + * + * Untracked PFNs read as 0. The TRACKED bit is the single source of + * truth — no separate folio->flags Marie bit is used. 
+ * + * + * Aging — gen ring as a cycling counter (per type) + * ------------------------------------------------ + * + * atomic_t marie_head_gen[ANON_AND_FILE]; // 0..3 cycling per type + * atomic_long_t marie_gen_installs[4][ANON_AND_FILE]; + * atomic_long_t marie_gen_occupied[4][ANON_AND_FILE]; + * + * install: + * + * u8 gen = atomic_read(&marie_head_gen[type]); + * marie_state[pfn] = MARIE_PFN_TRACKED | (type<<6) | (zone<<4) | + * (gen<<2) | tier; + * set_bit(pfn, marie_gen_bitmap[gen][type]); + * atomic_long_inc(&marie_gen_installs[gen][type]); + * atomic_long_inc(&marie_gen_occupied[gen][type]); + * + * head_gen advance is per-type, drain-wait gated (next gen empty for + * that type), and fired both by install cadence (gen_installs > + * MARIE_AGING_THRESHOLD) and by the reclaim-driven trigger (occupied + * gen count for that type < 2 at shrink_lruvec entry). See + * mm/lru_marie/design.h sections 3-5 for the binding contract. + * + * + * Del — single byte zero + * ---------------------- + * + * marie_state[pfn] = 0; + * + * No swap-pop, no list_del, no shard lock dance. External del + * (lru_marie_del_folio from compaction, folio_put, munmap) is the + * same single store. + * + * + * Isolate — cursor + SIMD scan + * ---------------------------- + * + * Per-pgdat scan cursor walks the array; SIMD reads 64 byte / cycle + * (AVX-512) and tests for (TRACKED && gen == oldest && tier == 0) + * via a single AND + CMP mask. Cursor saves position across calls so + * batch-32 isolate typically scans only a few hundred PFNs. + * + * for (pfn = cursor; n_batch < batch; pfn = next_or_wrap(pfn)) { + * u8 s = marie_state[pfn]; + * if ((s & MARIE_PFN_FILTER) != MARIE_PFN_TARGET) + * continue; + * batch[n_batch++] = pfn_folio(pfn); + * } + * cursor = pfn; + * + * Worst-case (sparse) full sweep of the 4 MB array is ~0.5 ms at + * DRAM bandwidth, ~50 µs in L3. Cursor amortises across many + * batches, so typical batch cost is sub-µs. 
+ * + * + * memcg scope + * ----------- + * + * The array is global (single allocation system-wide), not per-memcg. + * memcg-targeted reclaim filters by checking folio_memcg(pfn_folio) + * inside the scan loop. The scan additionally ANDs the per-memcg L1/L2 + * bitmap planes (marie_memcg_bitmap_*, declared below) with the global + * ones, so PFN ranges holding no folio of the target memcg are skipped. + * This trades per-memcg locality for vastly + * simpler data structures — desktop and small-server cgroup trees + * (which Marie targets) are dominated by the root memcg anyway, so + * the locality loss is small in practice. + * + * + * Walker integration + * ------------------ + * + * The PTE walker (marie_walker) inspects young bits as before but + * commits tier bumps to marie_state[pfn] instead of folio->flags. + * The same SIMD young-pte machinery from the prior implementation + * carries over unchanged. + * + * + * Disable / reparent + * ------------------ + * + * Marie disable: write 0 to every TRACKED byte via SIMD bulk store, + * folio_put each one. This is an O(N) sweep but happens rarely. + * Memcg reparent does not sweep: the state byte is memcg-agnostic and + * is left untouched; the child's per-memcg bitmap is merged into the + * parent's and the per-lruvec counters are transferred (see + * marie_reparent_locked in core.c). + * + * + * Sizing & init + * ------------- + * + * marie_state is kvmalloc'd at subsys_initcall with size + * `max_pfn` bytes. max_pfn is bounded by the 32-bit PFN gate + * (marie_init's MARIE_MAX_SUPPORTED_PFN check), so the array is at + * most 4 GiB on the maximum supported config. Realistic sizings: + * + * 16 GiB RAM -> 4 MiB (single kvmalloc, contiguous in vmalloc) + * 64 GiB RAM -> 16 MiB + * 256 GiB RAM -> 64 MiB + * + * The array is sparse-tolerant: NUMA holes and reserved regions read + * as 0 (untracked) and incur only sequential-read cost during scan. + */ + +/* + * Field shifts and masks within each marie_state[] byte. Ordered + * MSB -> LSB by reclaim filter hierarchy: TRACKED, TYPE, ZONE, GEN, + * TIER. See the byte-layout block above for the rationale.
+ */ +#define MARIE_PFN_TIER_SHIFT 0 +#define MARIE_PFN_TIER_BITS 2 +#define MARIE_PFN_TIER_MASK (((1U << MARIE_PFN_TIER_BITS) - 1) << \ + MARIE_PFN_TIER_SHIFT) +#define MARIE_PFN_NR_TIERS (1U << MARIE_PFN_TIER_BITS) +#define MARIE_PFN_TIER_MAX (MARIE_PFN_NR_TIERS - 1) + +#define MARIE_PFN_GEN_SHIFT 2 +#define MARIE_PFN_GEN_BITS 2 +#define MARIE_PFN_GEN_MASK (((1U << MARIE_PFN_GEN_BITS) - 1) << \ + MARIE_PFN_GEN_SHIFT) +#define MARIE_PFN_NR_GENS (1U << MARIE_PFN_GEN_BITS) + +#define MARIE_PFN_ZONE_SHIFT 4 +#define MARIE_PFN_ZONE_BITS 2 +#define MARIE_PFN_ZONE_MASK (((1U << MARIE_PFN_ZONE_BITS) - 1) << \ + MARIE_PFN_ZONE_SHIFT) +#define MARIE_PFN_NR_ZONES_ENCODED (1U << MARIE_PFN_ZONE_BITS) + +#define MARIE_PFN_TYPE_SHIFT 6 +#define MARIE_PFN_TYPE_FILE (1U << MARIE_PFN_TYPE_SHIFT) +#define MARIE_PFN_TYPE_MASK MARIE_PFN_TYPE_FILE + +#define MARIE_PFN_TRACKED_SHIFT 7 +#define MARIE_PFN_TRACKED (1U << MARIE_PFN_TRACKED_SHIFT) + +/* + * Encode @zone (folio_zonenum result) into the byte's 2-bit zone field. + * @zone is masked to its low MARIE_PFN_ZONE_BITS bits, so an index of + * MARIE_PFN_NR_ZONES_ENCODED or above (ZONE_DEVICE etc.) cannot spill + * into the neighbouring fields; it aliases onto a lower zone code + * instead. In practice those zones do not + * reach Marie's install path. + */ +static inline u8 marie_pfn_zone_bits(unsigned int zone) +{ + return (u8)((zone & (MARIE_PFN_NR_ZONES_ENCODED - 1)) << + MARIE_PFN_ZONE_SHIFT); +} + +/* Forward declaration: struct marie_lruvec is defined further down in + * this file but referenced by some declarations below. */ +struct marie_lruvec; + +/* + * The base allocation (subsys_initcall) covers totalram_pages PFNs. + * marie_state is indexed by PFN and marie_state_size is its entry count; + * the accessors check marie_state for NULL and pfn against + * marie_state_size before indexing. + * NOTE(review): the sizing section above says max_pfn bytes -- confirm + * which bound marie_state_init actually uses. + */ +extern u8 *marie_state; +extern unsigned long marie_state_size; + +/* + * Per-type head_gen (0..MARIE_PFN_NR_GENS - 1, cycling). anon and file + * have independent gen rings so the per-type pressure dial (swappiness) + * keeps its meaning and the scan / aging triggers operate on disjoint + * counters.
+ */ +extern atomic_t marie_head_gen[2 /* ANON_AND_FILE */]; + +/* + * Per-(gen, type) install gauge; drives the install-cadence aging + * trigger (cross-CPU sum > MARIE_AGING_THRESHOLD => try_advance_head). + * + * PER-CPU (not a global atomic): the install hot path does this_cpu_inc + * with no shared cacheline, so concurrent installs from different + * lruvecs (different lru_locks) no longer contend a single global line. + * Advisory only -- a lost/raced bump merely nudges aging cadence -- so + * the approximate per-CPU sum read at the throttled advance check (gated + * by the per-CPU marie_aging_tick) is sufficient. + */ +DECLARE_PER_CPU(long[MARIE_PFN_NR_GENS][ANON_AND_FILE], marie_gen_installs_pc); +DECLARE_PER_CPU(unsigned int[ANON_AND_FILE], marie_aging_tick); + +/* + * Per-(gen, type) live folio count. Bumped on install, decremented on + * del / promote-out. Drives: + * - the drain-wait gate (next gen empty => advance allowed) + * - the reclaim-driven aging trigger (occupied gen count < 2 at + * shrink_lruvec entry => try_advance_head) + */ +extern atomic_long_t marie_gen_occupied[MARIE_PFN_NR_GENS][2 /* ANON_AND_FILE */]; + +/* + * Per-(gen, type) walker visit counter. Walker pass-end bumps every + * gen of the visited type; gen advance resets the new head's slot to + * zero. Reclaim reads marie_gen_walker_visits[oldest][type] >= 1 to + * decide ignore_references=true on shrink_folio_list (cold-confirmed + * gens skip the rmap walk). Hint only — no correctness dependency. + */ +extern atomic_t marie_gen_walker_visits[MARIE_PFN_NR_GENS][2 /* ANON_AND_FILE */]; + +/* + * Per-(type, gen, tier) tracking bitmap. 
One struct marie_bitmap per + * (type, gen, tier) tuple, each holding: + * - L1: per-PFN bit (BITS_TO_LONGS(max_pfn) words, ~256 KiB / bitmap + * on an 8 GiB system; 2 types x 4 gens x 4 tiers = 32 bitmaps = + * ~8 MiB total) + * - L2: 512-bit summary over the same PFN space (64 B / bitmap) + * - per-cell refcount: 512 atomic_t per bitmap (2 KiB / bitmap), so + * the L2 bit transitions track L1 occupancy exactly via the 0 <-> + * 1 refcount boundary. + * + * Scanners walk one (type, gen, tier) bitmap at a time; the L2 plane + * provides a 512-way fast-skip over empty 32 MiB ranges. + * + * struct + operations are defined in mm/lru_marie/bitmap.{h,c} and + * are also used by the per-memcg plane (struct marie_memcg_bm). + */ +extern struct marie_bitmap marie_track_bm[2 /* ANON_AND_FILE */] + [MARIE_PFN_NR_GENS] + [MARIE_PFN_NR_TIERS]; + +/* + * clean_min_ratio: minimum file-pagecache reserve as percent of + * node_present_pages. Sysfs-tunable in core.c, read by reclaim. + * Default 15 (le9uo recommendation for desktop). + */ +extern unsigned int marie_clean_min_ratio; + +/* + * Aging trigger threshold: per (gen, type), once the install counter + * crosses this the head_gen cmpxchg-advances (gated on drain-wait: + * next gen must be empty). This is the static floor; the install-side + * hook (marie_install_advance_hook) raises the effective threshold to + * an eighth of the type's occupied total when that is larger. Mirrors + * the legacy reader's + * marie_gen_growth_threshold floor (SWAP_CLUSTER_MAX << 8). + */ +#define MARIE_AGING_THRESHOLD 8192 + +/* + * marie_try_advance_head - cycle the per-type head_gen by one slot + * iff the next slot has been fully drained. + * + * Drain-wait: the advance fires only when marie_gen_occupied[next][type] + * reads zero. This is the workingset protection borrowed from legacy + * Marie -- reclaim must drain the oldest gen before aging can rotate + * its slot back to head. + * + * "Drained" means gen_occupied==0, which the reclaim isolate path reaches + * at isolate (marie_evict_counters_only retires the slot), NOT at free. + * So in-flight isolated folios may still carry @next in their per-PFN + * byte when this resets @next's bitmap.
That is safe: their scan bits + * were already cleared at isolate, so bm_reset only touches already-clear + * bits, and their deferred byte teardown (marie_state_drop_pfn_at_free) + * is gated on marie_bm_test, so it will not double-decrement the slot's + * l2_count after the reset. New installs at the recycled @next therefore + * cannot collide with the retired old-life bits. + * + * Concurrency: the cmpxchg ensures exactly one writer advances per + * head transition; losers see the new head on their next read. The + * gen_installs / walker_visits resets for the new slot are + * benign-on-race (a concurrent install/walker bump can be lost -- + * both counters are advisory hints, not correctness primitives). + * + * NOTE(review): the bitmap reset below runs before the cmpxchg, so a + * caller that goes on to lose the cmpxchg has still reset @next's + * bitmaps. If the winner's advance and a fresh install into @next land + * between the loser's gen_occupied check and its marie_bm_reset, the + * loser would clear a live bit. Confirm callers are serialised per + * type, or that marie_bm_reset tolerates this. + * + * Triggered from both: + * - install-cadence (gen_installs > MARIE_AGING_THRESHOLD) + * - reclaim-driven (occupied gen count < 2 at shrink_lruvec entry, + * see design.h section 4) + */ +static inline void marie_try_advance_head(int type) +{ + u8 head = (u8)atomic_read(&marie_head_gen[type]); + u8 next = (head + 1) & (MARIE_PFN_NR_GENS - 1); + + /* Drain-wait gate: never rotate into a slot that still holds folios. */ + if (atomic_long_read(&marie_gen_occupied[next][type]) != 0) + return; + + /* + * Reset each tier-bitmap of the slot we are about to recycle so + * any residue from the slot's previous lifetime doesn't suppress + * the next install's 0 -> 1 cell_count transition. [type][next][*] + * is a per-type stripe -- clearing it cannot disturb the OTHER + * type's folios that may share the same gen index. + * + * Must happen BEFORE the cmpxchg: installs read head_gen[type] + * to find their target slot, so until the cmpxchg lands no + * install can target @next. + */ + { + int tier_idx; + + for (tier_idx = 0; tier_idx < MARIE_PFN_NR_TIERS; tier_idx++) + marie_bm_reset(&marie_track_bm[type][next][tier_idx]); + smp_wmb(); + } + + /* Lost the race: another caller already moved head off @head. */ + if (atomic_cmpxchg(&marie_head_gen[type], head, next) != head) + return; + + /* + * Zero the recycled slot's per-CPU install gauge across all CPUs.
+ * Rare (only on a successful head advance) and benign-on-race with + * concurrent this_cpu_inc on other CPUs (advisory hint). possible, + * not online, so a parked CPU's residue can't survive into the + * slot's next lifetime and prematurely re-trip the cadence trigger. + */ + { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(marie_gen_installs_pc, cpu)[next][type] = 0; + } + atomic_set(&marie_gen_walker_visits[next][type], 0); +} + +/* + * marie_find_oldest_occupied - return the oldest live gen for @type. + * + * Walks the ring forward from (head + 1) & 3 to (head + 3) & 3 and + * returns the first slot with marie_gen_occupied > 0. This is the + * legitimate "oldest install epoch" -- the (head + 1) & 3 shorthand + * assumes a fully-occupied 4-gen ring, which does NOT hold during + * boot or any sparse-ring window. Returns -1 if only head (or + * nothing) is occupied. + * + * Concretely the bug a fixed (head + 1) & 3 hit: after the + * reclaim-driven trigger advances head into an empty slot, the + * previous gen still holds the only live data, but (head + 1) & 3 + * points two slots past it -- scan returns zero forever despite + * GiB of reclaimable folios. + * + * At most 3 atomic_long_read per call. Besides scan entry it is also + * called once per folio by the deactivate / lazyfree hooks in core.c. + * head and the occupancy counters are read without any lock, so the + * result is a snapshot that may be stale by the time it is used. + */ +static inline int marie_find_oldest_occupied(int type) +{ + int head = atomic_read(&marie_head_gen[type]); + int i; + + for (i = 1; i < MARIE_PFN_NR_GENS; i++) { + int slot = (head + i) & (MARIE_PFN_NR_GENS - 1); + + if (atomic_long_read(&marie_gen_occupied[slot][type]) > 0) + return slot; + } + return -1; +} + +/* + * Install-side aging trigger with dynamic threshold.
+ * + * Threshold = max(MARIE_AGING_THRESHOLD, total_occupied_for_type / 8) + * + * The dynamic leg scales the trigger with the workload's actual + * footprint -- with NR_GENS = 4 the "balanced ring" target is + * total / 4 per gen, so cutting at total / 8 (half of that) advances + * often enough that non-head gens always carry a healthy reclaim- + * candidate buffer. + * + * Sampled at every 1024th install via a PER-CPU tick (marie_aging_tick) + * rather than the old (cnt & 1023) gate on a global atomic: the gauge is + * now per-CPU, so there is no cheap global running count to mask on. + * this_cpu_inc_return touches only this CPU's line, adding no contention. + * The aggregate check rate is unchanged (~1 per 1024 installs), and only + * on that 1-in-1024 slow path do we pay the O(nr_cpus) cross-CPU sum of + * the gauge -- amortised to nothing on the install hot path. Sampling + * delays the first cross-threshold advance by at most ~1023 installs, + * invisible against the ~32 MiB+ threshold window. + * + * Below the floor (sum <= MARIE_AGING_THRESHOLD) the helper exits + * immediately -- advance is impossible regardless of total_occupied. + */ +static inline void marie_install_advance_hook(int type) +{ + unsigned long limit = MARIE_AGING_THRESHOLD; + long installs = 0, occupied = 0; + int head, cpu, gen; + + /* + * Per-CPU 1-in-1024 sampling gate (replaces the old global + * (cnt & 1023) gate); only this CPU's tick is touched. + */ + if (likely(this_cpu_inc_return(marie_aging_tick[type]) & 1023)) + return; + + /* + * Slow path only: approximate the head gen's install count by + * summing the per-CPU gauge over the online CPUs. + */ + head = atomic_read(&marie_head_gen[type]); + for_each_online_cpu(cpu) + installs += per_cpu(marie_gen_installs_pc, cpu)[head][type]; + + /* At or below the static floor no advance is possible. */ + if (installs <= MARIE_AGING_THRESHOLD) + return; + + for (gen = 0; gen < MARIE_PFN_NR_GENS; gen++) + occupied += atomic_long_read(&marie_gen_occupied[gen][type]); + + /* Dynamic leg: an eighth of the type's occupied total, if larger. */ + if (occupied > 0 && ((unsigned long)occupied >> 3) > limit) + limit = (unsigned long)occupied >> 3; + + if ((unsigned long)installs > limit) + marie_try_advance_head(type); +} + +/* + * Per-PFN isolate scan uses an L2-pruned, range-locked walk rather + * than a per-CPU cursor -- see marie_state_isolate_scan_l2lock below + * for the parallelism model (try-lock on per-L2-bit range locks gives + * 512-way exclusion across concurrent reclaimers). + */ +#include +/* One-shot init from marie_init(). Allocates marie_state with kvmalloc. */ +int marie_state_init(void); +/* Detect CPUID-based prefetch ring parameters. Call before marie_state_init(). */ +void marie_prefetch_params_init(void); + +struct pglist_data; +struct folio; +struct lruvec; +struct scan_control; +struct mem_cgroup; + +/* + * L2-lock parallel isolate scan: collapses the 512-bit outer L2 + * walk to an 8-word loop, word-ANDing the global (type, gen, tier) + * L2 with the per-memcg L2 to skip empty PFN ranges in one cycle + * each; surviving L2 bits are taken under a try_lock for exclusive + * PFN-range ownership before the inner producer extracts + * candidates via __ffs/blsr + the L1 word-AND with mbm. + * + * @target_memcg: non-NULL only for cgroup-targeted (child memcg) + * reclaim; NULL means root reclaim and the memcg word-AND is + * skipped (every Marie folio in scope). + */ +unsigned long marie_state_isolate_scan_l2lock(struct pglist_data *pgdat, + int type, int max_zone, + unsigned int tier, + struct mem_cgroup *target_memcg, + struct folio **batch, + unsigned long batch_size, + unsigned long nr_to_scan); + +/* + * Per-PFN-array reclaim driver. Walks (type, tier) via + * marie_state_isolate_scan_l2lock, claims each candidate via + * try_get + test_clear_lru, hands the resulting folio_list to + * shrink_folio_list, and putbacks any survivors. Sole reclaim + * driver in PFN-only Marie.
+ */ +unsigned int marie_state_shrink_lruvec(struct lruvec *lruvec, + struct scan_control *sc); + +/* + * Marie type-pick return codes for marie_swap_pick_type(). + * + * MARIE_PICK_FILE_STRICT swappiness=0: FILE only, no ANON fallback; + * caller proceeds to OOM if FILE + * is depleted. + * MARIE_PICK_ANON_STRICT swappiness=MAX: ANON only, no FILE fallback. + * MARIE_PICK_FILE_THEN_ANON swappiness=1: FILE first; ANON engages + * ONLY when skip_file is set + * (clean_min_ratio breached). + * Per-call transient FILE + * failures do not promote to + * ANON -- the floor itself is + * the sole depletion signal. + * MARIE_PICK_ANON_FIRST Proportional regime (s=2..199), bias picks + * ANON. SINGLE type per call -- scanning the + * other side would dissolve the s:(MAX-s) + * page-flow ratio. Bias gets updated from + * this call's outcome, possibly flipping the + * pick for the next shrink_lruvec call. + * MARIE_PICK_FILE_FIRST Symmetric to ANON_FIRST: bias picks FILE, + * single type per call. + */ +enum marie_pick_kind { + MARIE_PICK_FILE_STRICT, + MARIE_PICK_ANON_STRICT, + MARIE_PICK_FILE_THEN_ANON, + MARIE_PICK_ANON_FIRST, + MARIE_PICK_FILE_FIRST, +}; + +/* + * Resolve the type-pick policy for one shrink_lruvec invocation. + * + * Pure read of the controller state: looks at @swappiness to detect + * the {0, 1, MAX_SWAPPINESS} special values, otherwise reads + * mlv->swap_bias sign to pick the primary type for the proportional + * regime. Does not modify any state. + * + * @mlv may be NULL when the lruvec has no marie_lruvec yet (alloc + * failure path); the helper falls back to MARIE_PICK_ANON_FIRST so + * the caller iterates both types in the legacy order without any + * bias tracking. + * + * NOTE(review): MARIE_PICK_ANON_FIRST is documented above as scanning a + * SINGLE type per call, which does not match "iterates both types" + * here; confirm which behaviour the caller implements for NULL @mlv. + */ +enum marie_pick_kind marie_swap_pick_type(struct marie_lruvec *mlv, + u8 swappiness); + +/* + * Apply the bias-controller update for one ATTEMPTED pick.
+ * + * nr_reclaimed > 0 -> bias += sign * nr_reclaimed * weight + * Page-flow proportional: long-run + * pages(anon):pages(file) -> s:(MAX-s) even + * when per-pick batches differ between types. + * + * nr_reclaimed == 0 -> no-op (bias unchanged) + * Failure carries no back-pressure. The + * picked side stays the picked side + * indefinitely under sustained failure; + * anon is not surrendered just because file + * is transiently or persistently stuck. + * + * sign = -1 for picked=ANON (push toward FILE) + * +1 for picked=FILE (push toward ANON) + * weight = MAX_SWAPPINESS - s for picked=ANON + * = s for picked=FILE + * + * Bypassed entirely under special-value swappiness (0, 1, MAX), + * where the pick is deterministic and the bias is not consulted; + * also a no-op when @mlv is NULL (lruvec alloc-failure path). + * + * Caller MUST only invoke when the pick was actually attempted; + * do NOT call when an external override (skip_file from + * clean_min_ratio) blocked the picked type before the scan ran. + */ +void marie_swap_bias_update(struct marie_lruvec *mlv, + int picked_type, + unsigned long nr_reclaimed, + u8 swappiness); + +/* + * Per-memcg L1/L2 bitmap pair. Allocated for every non-root memcg + * at memcg create; install/del maintain (type, gen, tier)-agnostic + * occupancy at PFN granularity (L1) and 32 MiB-range granularity + * (L2 via per-bit l2_count refcounter). Scan AND's both planes + * with the global (type, gen, tier) L1/L2 to restrict iteration + * to (type, gen, tier) ∩ memcg at source. 
+ */ +void marie_memcg_bitmap_free(struct mem_cgroup *memcg); +void marie_memcg_bitmap_set(struct mem_cgroup *memcg, unsigned long pfn); +void marie_memcg_bitmap_clear(struct mem_cgroup *memcg, unsigned long pfn); +unsigned long *marie_memcg_bitmap_get(struct mem_cgroup *memcg); +unsigned long *marie_memcg_bitmap_get_l2(struct mem_cgroup *memcg); +void marie_memcg_bitmap_merge(struct mem_cgroup *parent, + struct mem_cgroup *child); + +/* + * Saturating tier increment for a Marie-tracked folio's per-PFN byte. + * + * Non-saturated bump (tier < MAX) is a best-effort race-tolerant + * WRITE_ONCE; losing a bump to a concurrent racer is benign because + * tier is a hotness hint, not a correctness primitive. + * + * Saturated bump (tier == MAX) is in-place promote: the folio's GEN + * field is CAS-moved to atomic_read(&marie_head_gen[type]) with TIER + * reset to 0. The CAS guards against concurrent del and against + * another walker promoting the same PFN. + * + * Skips quietly if the folio is not (or no longer) tracked, or if a + * saturated folio is already encoded on the head gen. + */ +void marie_state_inc_tier(unsigned long pfn); + +/* + * marie_state_move_to_gen - relocate a tracked PFN to (@target_gen, + * @target_tier) in the per-PFN byte, with matched bitmap / occupied + * counter updates on both source and destination (gen, type) planes. + * + * Single point of policy for any operation that needs to move a folio + * between gens. Two callers are described in design.h: + * - walker tier saturate (section 7): + * marie_state_move_to_gen(pfn, head, 0) + * - shrink_folio_list residue putback (section 13): + * marie_state_move_to_gen(pfn, (head + 2) & 3, max(prev, w)) + * The demote paths in core.c (marie_folio_demote, lru_marie_lazyfree) + * also call it, with (oldest occupied gen, 0). + * NOTE(review): no mm/lru_marie/design.h is added alongside this file; + * confirm the design.h references are still valid. + * + * The state-byte cmpxchg defeats races against del / another + * concurrent move. The bitmap / counter shuffle uses "new first, then + * old" ordering so the folio remains visible to scan on at least one + * (gen, type) plane throughout the transition.
+ * + * No-op if the folio is no longer tracked or already encodes the + * target (gen, tier). + */ +void marie_state_move_to_gen(unsigned long pfn, u8 target_gen, u8 target_tier); + +struct folio; +/* + * marie_state_drop_pfn - wipe every per-PFN tracking artifact + * (state byte, (type, gen, tier) L1 bit, occupancy counter, per- + * memcg L1/L2/l2_count, global L2 range counter with bulk L2 clear + * on 0) for @folio. Shared by the normal evict path + * (marie_evict_locked) and the enable=0 drain path + * (marie_drain_one_lruvec) so disable->enable cycles never leave + * ghost per-PFN state behind. No-op when the byte is not TRACKED. + */ +void marie_state_drop_pfn(struct folio *folio); + + +/* --- per-lruvec residency state and install/evict surface --- */ +#ifdef CONFIG_LRU_MARIE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include /* SWAP_CLUSTER_MAX, ANON_AND_FILE */ +#include +#include + +struct folio; +struct lruvec; +struct mem_cgroup; +struct marie_lruvec; +struct marie_gen; + +/* + * --------------------------------------------------------------------- + * Per-folio state inspection (internal) + * --------------------------------------------------------------------- + * + * Reads of the per-PFN state byte are lock-free (READ_ONCE). Writes + * go through state.c helpers (marie_state_inc_tier, + * marie_state_move_to_gen, marie_state_drop_pfn); the byte is the + * single source of truth for Marie's per-folio state. + * + * folio->lru is not interpreted as part of Marie's state -- folios + * are never linked from a Marie-owned list. It exists only so legacy + * LRU can attach drained folios via lruvec->lists[lru] (handed off by + * marie_drain_pfn_locked when Marie is disabled). + */ + +/** + * folio_marie_test_tracked - is @folio claimed by Marie? 
+ * + * Reads the per-PFN state byte (the single source of truth in the + * per-PFN paradigm). folio->flags carries no Marie state. + */ +static inline bool folio_marie_test_tracked(const struct folio *folio) +{ + unsigned long pfn = folio_pfn((struct folio *)folio); + + if (!marie_state || pfn >= marie_state_size) + return false; + return READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED; +} + +/* + * folio_marie_get_tier is declared in <linux/lru_marie.h> + * so callers outside mm/lru_marie/ (e.g. mm/vmscan.c + * folio_check_references) can read tier without including this + * private header. + * + * Tier bumps go through marie_state_inc_tier (defined in state.c) -- the + * per-PFN state byte is the only place tier lives. + */ + +/* + * marie_folio_lruvec_rcu - RCU-bracketed folio_lruvec() for Marie hot paths. + * + * folio_lruvec() reaches obj_cgroup_memcg() which has a lockdep predicate + * requiring rcu_read_lock or cgroup_mutex. Marie's drain and walker paths + * run under mlv->lock (which disables preemption) but NOT under + * rcu_read_lock(); preempt-disable does not satisfy the lockdep + * predicate. The brief RCU bracket avoids the WARN trip; the returned + * pointer is used only for equality comparison or as an xarray key, + * never dereferenced after rcu_read_unlock(). + */ +static inline struct lruvec *marie_folio_lruvec_rcu(struct folio *folio) +{ + struct lruvec *lv; + + rcu_read_lock(); + lv = folio_lruvec(folio); + rcu_read_unlock(); + return lv; +} + +/* + * marie_update_lru_size - Marie counterpart to legacy update_lru_size(). + * + * Updates the shared global vmstat / zone counters AND the per-memcg + * mz->lru_zone_size, exactly mirroring the legacy update_lru_size() + * wrapper (__update_lru_size + mem_cgroup_update_lru_size). Marie + * credits mz->lru_zone_size at install and debits it at evict, so a + * Marie-tracked folio is counted in mz the same way a legacy/MGLRU + * folio is. lruvec_lru_size() therefore reads mz directly with no + * Marie-specific summing.
+ * + * Unified accounting (was: Marie owned a private mlv->marie_lru_zone_size + * and skipped mz): keeping mz authoritative for Marie folios too means + * any del path -- Marie's own evict, or a legacy lruvec_del_folio that + * fires on a folio whose TRACKED bit was already cleared -- finds the + * matching +nr Marie credited at install. The old split made + * "added via Marie (no mz +1), del'd via legacy (mz -1)" an + * mz->lru_zone_size underflow; crediting mz at install closes that + * asymmetry structurally. mlv->marie_lru_zone_size survives only as + * Marie's internal per-bucket tally (reparent transfer); it no longer + * feeds size reads. + * + * Marie<->legacy list transitions (drain to legacy, adopt from legacy) + * are mz-neutral: the folio stays counted in mz across the list_move, + * so those paths must NOT re-credit/re-debit mz (see + * marie_drain_pfn_locked). + * + * Caller MUST hold lruvec->lru_lock. mod_lruvec_state's per-CPU fold + * and __mod_zone_page_state's per-zone counter are documented as + * lru_lock-protected against concurrent updaters of the same lruvec. + */ +static inline void marie_update_lru_size(struct lruvec *lruvec, + enum lru_list lru, + enum zone_type zid, + long nr_pages) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); +#ifdef CONFIG_MEMCG + /* Unified with legacy: Marie folios are counted in mz too. */ + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); +#endif +} + +/* + * Global folio counter, lives in mm/lru_marie/core.c for stats_show; + * the install/evict helpers in state.c percpu_counter_add it during + * Marie's TRACKED 0<->1 transitions. 
+ */ +extern struct percpu_counter marie_nr_folios; + +/* + * marie_pc_add - Marie-private percpu_counter add that elides the + * outer preempt_disable / preempt_enable bracket of + * percpu_counter_add_batch() while preserving its IRQ safety. + * + * percpu_counter_add_batch() wraps the whole body in + * preempt_disable/enable. Under DEBUG_PREEMPT that bracket shows up in + * perf under 16-thread memhog as ~4 % of total CPU (preempt_count_add + + * check_preemption_disabled). We drop it because the individual + * this_cpu_* primitives used here are each self-contained: this_cpu_add + * is a single atomic RMW (one instruction on x86), and the slow-path + * fbc->lock section takes raw_spin_lock_irqsave, so correctness does + * not depend on the caller's preempt or IRQ state. + * + * IRQ safety is MANDATORY, not optional: not every caller holds + * lru_lock. The reclaim isolate path (marie_evict_counters_only) and + * the survivor putback in marie_state_shrink_lruvec update the GLOBAL + * marie_nr_folios counter with IRQs ENABLED (preempt_disable only). + * The same counter is also bumped from IRQ/softirq context when a + * Marie-tracked LRU folio's last reference is dropped + * (folio_put -> __page_cache_release -> lruvec_del_folio -> + * lru_marie_del_folio -> marie_evict_locked). If the flush path used a + * plain raw_spin_lock, a softirq landing on the CPU that already holds + * fbc->lock would spin forever on it -> hard lockup. Hence + * raw_spin_lock_irqsave below, exactly as percpu_counter_add_batch does. + * + * The fast path uses this_cpu_add (atomic against same-CPU IRQ + * reentrancy); the earlier __this_cpu_read + __this_cpu_write pair was + * a non-atomic RMW that could lose an IRQ-context update. 
+ */ +static inline void marie_pc_add(struct percpu_counter *fbc, s64 amount) +{ + s64 count = this_cpu_read(*fbc->counters) + amount; + + if (unlikely(abs(count) >= percpu_counter_batch)) { + unsigned long flags; + + raw_spin_lock_irqsave(&fbc->lock, flags); + count = __this_cpu_read(*fbc->counters) + amount; + fbc->count += count; + __this_cpu_sub(*fbc->counters, count - amount); + raw_spin_unlock_irqrestore(&fbc->lock, flags); + } else { + this_cpu_add(*fbc->counters, amount); + } +} + +/* + * --------------------------------------------------------------------- + * Install / evict — per-folio TRACKED 0 <-> 1 with lru_lock held + * --------------------------------------------------------------------- + * + * Marie's per-folio state is one bit: TRACKED in marie_state[pfn]. + * Synchronous install/evict helpers own all the bookkeeping (per-PFN + * state byte, global per-(type, gen, tier) and per-memcg bitmaps, + * per-mlv / global percpu_counters, lru_size mirror, PG_active / + * PG_lru hygiene): + * + * marie_folio_install: TRACKED 0 -> 1 + * unified fresh install for both small + * folios and THP; declared in + * pfn_install.h + * marie_state_publish_at_gen: TRACKED stays, (gen, tier) refreshed + * reclaim survivor putback + * marie_evict_locked: TRACKED 1 -> 0 + * called from marie_del_folio_locked + * + * folio_marie_test_tracked() is the lock-free state inspector: it + * reads marie_state[pfn] & MARIE_PFN_TRACKED, returning whether + * Marie owns @folio. The binary state is checked directly at each + * callsite -- no intermediate dispatch machinery. + */ +bool marie_evict_locked(struct marie_lruvec *mlv, struct folio *folio); + +/* + * Reclaim isolate path: counters-only decrement at claim time. The per- + * PFN state byte's TRACKED bit intentionally stays set throughout + * shrink_folio_list so marie_folio_install's TRACKED early-out blocks any + * concurrent install from setting PG_lru on a folio currently in the + * reclaim list. 
The scan-bitmap slot + gen_occupied ARE retired here (an + * isolated folio is no longer a scan candidate); the TRACKED byte is + * wiped later -- at the buddy free hook (marie_state_drop_pfn_at_free) + * for a reclaimed folio, or re-published by marie_state_publish_at_gen at + * putback for a survivor. See state.c body for the full rationale. + */ +void marie_evict_counters_only(struct folio *folio); + +/* + * Canonical per-PFN state teardown invoked from + * mm/page_alloc.c::free_pages_prepare at every page's buddy handoff. + * Wipes the per-PFN state byte / bitmap / gen_occupied slot whenever + * the byte still carries TRACKED. No-op on already-cleared state. + * Counters are NOT touched (they were balanced upstream by Marie's + * del path or by marie_evict_counters_only). + * + * Lock-free; safe from any context. + */ +void marie_state_drop_pfn_at_free(unsigned long pfn); + +/* marie_folio_install lives in pfn_install.h. */ + +/* Lazy lookup-or-allocate of the Marie state for @lv. */ +struct marie_lruvec *marie_get_lruvec(struct lruvec *lv); + +/* + * Adaptive batch threshold. Returns the per-call page accumulator cap, + * lerped between MARIE_PFN_BATCH_FLOOR (low pressure, + * sc->priority == DEF_PRIORITY) and MARIE_PFN_SHRINK_BATCH (max + * pressure, sc->priority == 0). Defined in state.c as + * marie_pfn_batch_threshold; this declaration is the public name. + */ +struct scan_control; +unsigned long marie_adaptive_batch_threshold(struct scan_control *sc); + +/** + * marie_del_folio_locked - lru_marie_del_folio body. + * @mlv: residency set + * @folio: folio to remove (any Marie-tracked state) + * + * Universal external-removal handler called from lru_marie_del_folio when + * lruvec_del_folio fires from outside Marie (compaction, lru_activate + * batch drain, __page_cache_release after the last folio_put). If the + * folio is TRACKED, calls marie_evict_locked to run the full eviction + * (per-PFN state wipe + counter decrements + lru_size mirror). 
If the + * folio is no longer TRACKED, returns true defensively (treated as + * "Marie already removed it"). + * + * Returns true iff @folio was tracked (the caller can fall through to + * its remaining bookkeeping). The full counter wind-down -- including + * the single marie_nr_folios -1 -- is owned by marie_evict_locked via + * marie_account_evict; the caller adds no decrement of its own. + */ +bool marie_del_folio_locked(struct marie_lruvec *mlv, struct folio *folio); + +/* + * Walker hot-path tier promotion result codes. Kept for ABI; the + * current implementation always returns MARIE_TIER_INC_OK because + * marie_state_inc_tier handles saturation in-place (synchronous + * move-to-head_gen). The walker discards the return value. + */ +enum marie_tier_inc_result { + MARIE_TIER_INC_OK = 0, + MARIE_TIER_INC_SATURATED, /* reserved, no longer produced */ + MARIE_TIER_INC_FAILED, /* reserved, no longer produced */ +}; + +/** + * marie_tier_inc - walker-side hot path: bump @folio's tier. + * @mlv: residency set (unused by the current implementation; kept + * for ABI compatibility) + * @folio: subject folio (must be Marie-tracked or the call is a no-op) + * + * Thin wrapper around marie_state_inc_tier(folio_pfn(folio)). The + * per-PFN saturate path is synchronous -- when tier == MAX the helper + * calls marie_state_move_to_gen(pfn, head, 0) directly, so there is no + * deferred promote queue and no enqueue allocation that could fail. + * The single caller (walker.c marie_walk_pmd_range) discards the + * return value. + */ +enum marie_tier_inc_result marie_tier_inc(struct marie_lruvec *mlv, + struct folio *folio); + +/* + * Tier count: alias to the per-PFN state byte's tier field width. + * Tier lives entirely in marie_state[pfn]'s MARIE_PFN_TIER field + * (see state.h: MARIE_PFN_NR_TIERS / MARIE_PFN_TIER_MAX). The + * aliases keep call sites (overflow buffer sizing, tier-loop bounds) + * readable without rewriting them all. 
+ * + * Tier 0 = "never touched since added"; tier MARIE_TIER_MAX = saturated + * (further young hits trigger a sync promote to head_gen via + * marie_state_inc_tier). + */ +#define MARIE_NR_TIERS MARIE_PFN_NR_TIERS +#define MARIE_TIER_MAX MARIE_PFN_TIER_MAX + +/* + * Reclaim-side batch size — fallback compile-time constant used by + * a few non-hot-path call sites. The per-PFN scan path uses + * MARIE_PFN_FALLBACK_BATCH / MARIE_PFN_SHRINK_BATCH (see state.c). + */ +#define MARIE_ISOLATE_BATCH SWAP_CLUSTER_MAX + +/* + * Allocation-side aging trigger threshold (per head gen installs) + * lives in mm/lru_marie/core.c as marie_gen_growth_threshold and is + * runtime-tunable via /sys/kernel/mm/lru_marie/gen_growth_threshold. + * Default 8192 pages (= MARIE_ISOLATE_BATCH << 8, i.e. 32 MiB). + * marie_install_advance_hook combines this with a dynamic + * total_occupied / 8 leg to drive marie_try_advance_head. + */ + +/* + * --------------------------------------------------------------------- + * data structures + * --------------------------------------------------------------------- + * + * Per-type independence is fundamental: anon and file each have their + * own per-type lock and their own slice of the global per-(type, gen, + * tier) bitmap / counter arrays. vm.swappiness controls only the + * eviction proportion between types; aging on one type never forces + * work on the other. + * + * The per-PFN state byte carries the zone field, so per-zone filtering + * is part of the scan mask -- no per-zone data structure is needed + * (matching the existing NR_LRU_LISTS / zone semantics). + */ + +struct marie_type { + /* + * @type_lock serialises per-type operations that need to be + * mutually exclusive across CPUs (drain, reparent, fill-from- + * legacy). Hot install/del do not take it -- they update the + * per-PFN state byte and the unified bitmap lock-free. 
+ * + * Cross-type sections take both types' locks in canonical order + * (anon first, file second) via the marie_both_mlv guard. + * + * @type: 0 = anon, 1 = file. Set once at marie_type_init time so + * scoped_guard(marie_type_lock, ...) can recover the type index + * (needed for the per-CPU drain-depth counter) from a bare + * struct marie_type * without an extra argument. + * + * @nr_pages is the total page count for this type on this lruvec, + * read by stats / pick callers. percpu_counter so per-folio writes + * hit the local CPU's diff (no global cache line bouncing) and + * only flush to the global every percpu_counter_batch additions. + */ + spinlock_t type_lock; + int type; + struct percpu_counter nr_pages; +}; + +struct marie_lruvec { + /* + * ---- CL0: small hot read fields ---- + * + * Layout intent: pack the few small fields the per-fault and + * per-shrink-batch paths touch into one cacheline at the top, so + * that a fault hitting marie_folio_install or the + * swap-bias pick path pulls @lruvec / @swap_bias / @nid in a + * single line read. The bulky @types[] subtree follows, keeping + * the small hot fields off the same cachelines as the per-type + * locks that another CPU may be hammering. + * + * @lruvec: back-pointer to the legacy lruvec. Read in every path + * that needs lv->lru_lock — install, del, + * marie_state_shrink_lruvec, walker pass. Read-only after + * marie_alloc_lruvec. + * + * @nid: node id. Read-only after init; consumed by walker / + * shrink stat paths. + * + * @swap_bias: signed bias counter that drives anon-vs-file pick in + * marie_state_shrink_lruvec under proportional swappiness (2..199). + * Sign decides the type to scan (>=0 -> ANON, <0 -> FILE). 
+ * + * Update rule for one ATTEMPTED pick: + * + * nr_reclaimed > 0 -> bias += sign * nr_reclaimed * weight + * (page-flow proportional accumulation) + * nr_reclaimed = 0 -> bias unchanged + * (failure carries NO back-pressure) + * + * where sign = -1 for picked=ANON, +1 for picked=FILE, and + * weight = MAX_SWAPPINESS - s for ANON, s for FILE. + * + * Long-run page-flow under healthy operation: + * pages(anon) : pages(file) = s : (MAX_SWAPPINESS - s) + * + * Stubborn protection under failure: zero-reclaim cycles leave + * the bias untouched, so the picked side stays the picked side + * indefinitely. Low-swappiness configurations on ZRAM systems + * depend on this -- anon's working set must remain resident + * even when file is transiently or persistently stuck on + * dirty / locked / writeback / depleted state. If file truly + * cannot be reclaimed, the caller escalates priority or OOM + * intervenes; the controller does not surrender protection. + * + * Reset to 0 by lru_marie_swappiness_changed() on sysctl write + * so stale bias from a previous swappiness regime does not + * steer the first picks under the new value. + * + * Special-value swappiness (0, 1, MAX_SWAPPINESS) bypasses the + * bias entirely at pick time; the field is not consulted, and + * the update path is short-circuited so the value never drifts. + * No CAP is applied -- per-cycle delta is bounded by batch_max + * (~8192) * MAX_SWAPPINESS (200) ~ 1.6e6, far below S64_MAX. + */ + struct lruvec *lruvec; + int nid; + bool offline; /* set under lv->lru_lock at css_offline */ + atomic64_t swap_bias; + + /* + * ---- Per-type subtrees ---- + * + * types[0] = anon, types[1] = file. Each marie_type carries its + * own per-type exclusive lock (type_lock) and its own + * nr_pages percpu_counter. Cross-type sections acquire both + * locks in canonical order via marie_both_mlv. 
+ */ + struct marie_type types[ANON_AND_FILE]; + + /* + * ---- Hot write band: per-(lru, zone) Marie page counters ---- + * + * Per-(lru, zone) count of Marie-tracked pages on this lruvec, + * kept as Marie's internal tally. It does not feed size reads: + * Marie also credits/debits the legacy mz->lru_zone_size at + * install/evict (see marie_update_lru_size()), so + * lruvec_lru_size() reads mz directly with no Marie-specific + * summing. + * + * Update sites (paired ±nr): marie_folio_install (install), + * marie_state_publish_at_gen + survivor putback (reclaim + * survivor at non-head gen), marie_evict_locked (evict), + * marie_reparent_locked (reparent), + * marie_fill_one_lruvec (legacy->Marie transition), + * marie_drain_one_lruvec (Marie->legacy transition). + * + * percpu_counter handles concurrent updates without atomic on + * every write: per-folio +/- nr lands in this CPU's local diff, + * and only the periodic flush to the global s64 (every + * percpu_counter_batch operations) touches a shared cacheline. + * + * percpu_counter so per-folio writes hit the local CPU's diff + * instead of contending on the (lru, zone) cell across all CPUs. + * The intra-CL false-sharing concern that justified the previous + * atomic_long array is dissolved: each percpu_counter's hot per-CPU + * storage is allocated separately, the struct itself only holds + * the spinlock + s64 global which is touched only on batch flush. + */ + struct percpu_counter marie_lru_zone_size[NR_LRU_LISTS][MAX_NR_ZONES]; + + /* + * Deferred legacy mz->lru_zone_size delta from the LOCK-FREE isolate + * paths (marie_account_{install,evict}_isolate). mz->lru_zone_size is a + * non-atomic, lru_lock-protected counter; the isolate paths hold no + * lru_lock, so they MUST NOT RMW it directly (concurrent reclaimers lose + * updates and drift mz negative -- the underflow root cause).
Instead + * they accumulate here atomically, and the next LOCKED install/evict on + * the same (lru, zone) drains it into mz under lru_lock + * (marie_mz_drain_locked). The per-CPU-safe shadow (marie_lru_zone_size) + * stays authoritative for the Marie count; mz lags by the un-drained + * pending only, and is reconciled exactly, never raced. + */ + atomic_long_t mz_pending[NR_LRU_LISTS][MAX_NR_ZONES]; + + /* + * ---- Cold: only walked at memcg teardown ---- + * + * @memcg: only used by lru_marie_exit_memcg to enumerate every + * marie_lruvec under a dying memcg for reparent. Never read on + * the per-fault or per-shrink path. + */ + struct mem_cgroup *memcg; +}; + +/* lifecycle (called from mm/lru_marie/core.c xa lookup path) */ +struct marie_lruvec *marie_alloc_lruvec(struct lruvec *lv, gfp_t gfp); +void marie_free_lruvec(struct marie_lruvec *mlv); + +/* + * Per-type re-entrant-drain detection. Caller (lru_marie_del_folio in + * mm/lru_marie/core.c) uses marie_in_drain_type(folio's type) to detect "we + * are already inside a per-type-locked drain for this folio's type on + * this CPU" and skip the scoped_guard re-acquire. The depth counters + * are per-CPU statics inside the ADT, mutated by the scoped_guard + * lock/unlock body (S5 / per-CPU encapsulation). + */ +bool marie_in_drain_type(int type); +void marie_drain_enter_type(int type); +void marie_drain_exit_type(int type); + +/* + * --------------------------------------------------------------------- + * drain helpers + * --------------------------------------------------------------------- + * + * No promote-queue or per-CPU staging drain remains: every install / + * evict / tier bump is synchronous (install_local / install_locked + * publish per-PFN state inline, evict_locked wipes it inline, + * marie_state_inc_tier handles saturation via marie_state_move_to_gen + * directly). 
+ * + * The remaining drain entry, marie_drain_one_lruvec (in core.c), is + * only the enable/disable transition: it walks the per-(type, gen, + * tier) bitmap and hands every TRACKED folio back to the legacy LRU. + */ + +/** + * marie_reparent_locked - merge @child_mlv's per-memcg tracking + * into @parent_mlv's via L2-pruned + * bitmap OR + atomic counter transfer. + * @child_mlv: source residency set (a memcg being reparented) + * @parent_mlv: destination residency set (parent memcg's lruvec mlv); may + * be NULL, in which case @child's per-memcg bitmap and + * counters are zeroed and folios fall back to global + * tracking (no per-memcg filter). + * + * Per-folio iteration free: the per-PFN state byte (gen / tier / type / + * zone) and the global (type, gen, tier) bitmaps are memcg-agnostic + * and stay in place. Only the per-memcg L1/L2 bitmap merges into + * @parent and the per-type / per-(lru, zone) counters move via + * atomic_long_xchg. Cost scales with the number of populated L2 ranges + * in @child's per-memcg bitmap, not with the tracked folio count. + * + * Returns 0: marie_nr_folios is unchanged because the per-PFN state + * bytes remain set. Caller need not adjust the global folio counter. + * + * Caller MUST hold @child_mlv's both type_locks (marie_both_mlv + * scoped_guard) plus both lruvecs' lru_lock with IRQs off (the memcg + * offline path's reparent_locks). + */ +long marie_reparent_locked(struct marie_lruvec *child_mlv, + struct marie_lruvec *parent_mlv); + +/* + * scoped_guard(marie_type_lock, &mlv->types[type]) — per-type lock acquisition. + * + * Equivalent to the handwritten dance: + * + * spin_lock_irqsave(&t->type_lock, flags); + * marie_drain_enter_type(t->type); + * ... critical section touching mlv->types[t->type] ... 
+ * marie_drain_exit_type(t->type); + * spin_unlock_irqrestore(&t->type_lock, flags); + * + * The cleanup attribute on the guard variable makes the unlock + + * depth-counter pair a structural property of the scope, not a + * discipline the caller must remember on every early return / goto. + * + * Re-entry inside the scope is handled by the per-CPU per-type + * marie_drain_depth contract — drain helpers' folio_put recursion that + * lands in lru_marie_del_folio observes marie_in_drain_type(folio's + * type) > 0 and skips the spin_lock_irqsave for that type only. Recursion + * involving the *other* type lands on a depth-0 counter and proceeds to + * take the corresponding per-type lock as usual (the outer guard holds + * only one type's lock, so this is not a self-deadlock). + */ +DEFINE_LOCK_GUARD_1(marie_type_lock, struct marie_type, + /* lock */ ({ + spin_lock_irqsave(&_T->lock->type_lock, _T->flags); + marie_drain_enter_type(_T->lock->type); + }), + /* unlock */ ({ + marie_drain_exit_type(_T->lock->type); + spin_unlock_irqrestore(&_T->lock->type_lock, _T->flags); + }), + unsigned long flags +) + +/* + * scoped_guard(marie_both_mlv, mlv) — take both types' locks. + * + * Used by cross-type call sites: drain_pending, drain_one_lruvec, + * fill_one_lruvec, walker pass-end visit-counter bump, residency + * reparent. Putback_batch and isolate_folios use marie_type_lock + * (single-type variant) since their input list is type-pure. + * + * Canonical order: anon (type 0) first, file (type 1) nested with + * SINGLE_DEPTH_NESTING to tell lockdep the two locks are different + * instances despite sharing a lock class. Every cross-type site uses + * this same guard, so the lock order is uniform and AB-BA is + * structurally impossible. + * + * Both per-type drain-depth counters are incremented so that any + * recursive lru_marie_del_folio (regardless of folio type) sees the + * depth > 0 fast path for its own type and avoids re-locking. 
+ */ +DEFINE_LOCK_GUARD_1(marie_both_mlv, struct marie_lruvec, + /* lock */ ({ + spin_lock_irqsave(&_T->lock->types[0].type_lock, _T->flags); + spin_lock_nested(&_T->lock->types[1].type_lock, + SINGLE_DEPTH_NESTING); + marie_drain_enter_type(0); + marie_drain_enter_type(1); + }), + /* unlock */ ({ + marie_drain_exit_type(1); + marie_drain_exit_type(0); + spin_unlock(&_T->lock->types[1].type_lock); + spin_unlock_irqrestore(&_T->lock->types[0].type_lock, _T->flags); + }), + unsigned long flags +) + +/** + * marie_counters_init - one-shot init for per-mlv internals. + * + * Called from marie_init() (subsys_initcall in mm/lru_marie/core.c). + * Currently just initialises the global marie_nr_folios percpu_counter + * (the per-CPU bucket pool, slab caches, and cpuhp callbacks that + * earlier revisions needed have all been retired together with the + * staging machinery). + * + * Returns 0 on success, negative errno on failure (in which case the + * caller propagates the error up to the initcall machinery). + */ +int marie_counters_init(void); + +/* + * --------------------------------------------------------------------- + * Cross-file glue (lifecycle xarray + walker entry points) + * --------------------------------------------------------------------- + * + * These declarations connect mm/lru_marie/core.c (dispatch / lifecycle) and + * mm/lru_marie/walker.c (PTE walker). They live here to keep mm/ + * private headers down to a single file. + */ + +/* + * Per-lruvec mlv lives in lv->marie_mlv (the single source of truth; + * lazy-allocated on first hit via cmpxchg, freed at memcg teardown via + * marie_drop_lruvec()). No side xarray, no RCU. + */ + +/* + * marie_walk_pgdat - run one walker pass for @pgdat. + * + * Called from lru_marie_age_node() (kswapd hook) and + * lru_marie_shrink_lruvec() (direct-reclaim hook). Internally + * rate-limited per pgdat via a jiffies deadline so calling on every + * reclaim/kswapd cycle is fine. 
+ */ +void marie_walk_pgdat(struct pglist_data *pgdat); + +/* + * marie_walker_init - one-shot init for the walker subsystem. + * + * Initialises per-pgdat bloom-filter spinlocks. Bitmaps themselves + * are lazily allocated on first Producer hit. Called from + * marie_init(). + */ +void marie_walker_init(void); + + +#endif /* CONFIG_LRU_MARIE */ +#endif /* _MM_LRU_MARIE_STATE_H */ diff --git a/mm/lru_marie/version.h b/mm/lru_marie/version.h new file mode 100644 index 0000000000..d52a5ba340 --- /dev/null +++ b/mm/lru_marie/version.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_LRU_MARIE_VERSION_H +#define _MM_LRU_MARIE_VERSION_H + +/* + * Marie LRU — version identifiers. + * + * Kept in mm/ rather than include/linux/lru_marie.h so that bumping + * MARIE_VERSION (the only string that changes from one release to the + * next) does not invalidate the ccache entry for every translation + * unit that includes <linux/lru_marie.h> (mm/mm.h, mm/mm_inline.h, + * mm/vmscan.c, mm/swap.c, mm/rmap.c, mm/memcontrol.c, etc.). Only + * Marie's own .c files include this header, so a version bump rebuilds + * just mm/lru_marie/*.o. + */ + +#define MARIE_PROGNAME "Marie LRU" +#define MARIE_AUTHOR "Masahito Suzuki" + +#define MARIE_VERSION "0.3.5" + +#endif /* _MM_LRU_MARIE_VERSION_H */ diff --git a/mm/lru_marie/walker.c b/mm/lru_marie/walker.c new file mode 100644 index 0000000000..846d3787d4 --- /dev/null +++ b/mm/lru_marie/walker.c @@ -0,0 +1,961 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lru_marie/walker.c — Marie's SIMD-accelerated PTE walker. + * + * The walker is Marie's hot signal harvester.
Per pgdat, + * rate-limited via marie_walker_interval() (HZ/30..HZ depending on + * pressure), marie_walk_pgdat() snapshots the running mm_struct's, + * walks each page table to PMD granularity, and at every PMD invokes + * lru_marie_simd_young_pte_mask_raw() to extract the young-bit bitmap + * of the entire 512-PTE page in one SIMD pass (AVX-512F / AVX2 / SSE2 + * on x86; scalar fallback on arm64 and other arches via the generic + * variant -- NEON acceleration is a future optimisation, see + * mm/lru_marie/simd_generic.c). + * The FPU bracket around the SIMD call is held across MARIE_FPU_BATCH + * consecutive bloom-hit PMDs and flushed by a scoped_guard around the + * per-mm walk_page_range, amortising kernel_fpu_begin/end overhead. + * For each PTE flagged young, the walker bumps the corresponding + * folio's tier via marie_tier_inc() (thin wrapper around + * marie_state_inc_tier on the per-PFN byte). Folios that saturate to + * MARIE_TIER_MAX trigger an in-place synchronous promote + * (marie_state_move_to_gen to head_gen at tier 0) inside the same + * helper -- no per-mlv promote queue, no pass-end drain. + * + * A per-pgdat bloom filter (marie_bloom_*) feeds back from + * lru_marie_look_around() (rmap-side, called from + * folio_referenced_one()) to the walker: rmap flags PMDs whose target + * folio was young, the walker reads that bitmap and skips PMDs the + * rmap path has not flagged. The bloom is double-buffered (active / + * inactive) and rotated at pass end so the walker reads the feedback + * accumulated during the previous reclaim window. + * + * Bloom is the *only* coupling between rmap and the walker. + * lru_marie_look_around() does NOT promote (no tier++, no + * PG_referenced) on the surrounding folios; the walker handles tier++ + * via young-bit detection on bloom-hit PMDs. This split keeps the + * rmap path PTL-bounded and lock-free, while the walker pays the + * SIMD scan + tier++ cost only for hot PMDs. 
+ */ + +#define pr_fmt(fmt) "lru_marie: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" /* folio_pte_batch_flags / FPB_MERGE_YOUNG_DIRTY */ +#include "state.h" +#include "simd.h" + +/* + * --------------------------------------------------------------------- + * look-around (rmap-side opportunistic PMD scan) + * --------------------------------------------------------------------- + * + * lru_marie_look_around() is called from rmap.c::folio_referenced_one() while + * the rmap caller already holds the page table lock for the target + * folio's PTE. We piggyback on that PTL to scan up to + * MARIE_LOOK_AROUND_BATCH PTEs of the surrounding PMD and clear young bits + * found there in batch — what would otherwise cost one rmap walk per + * neighbouring folio amortises into a single PMD pass, and subsequent + * folio_referenced() calls on those folios get a more accurate "young + * since last reclaim cycle" answer. + * + * Crucially, we do NOT call folio_set_referenced() on the surrounding + * folios. Doing so would cascade into a reclaim-side promote and + * starve reclaim under fault-heavy workloads (memhog, browser tab + * churn) where every recently-faulted PTE has its young bit set — + * see the comment above the test_and_clear loop below. + * + * Returns true iff the target folio's own PTE(s) were young. That's the + * value folio_referenced_one() folds into its referenced count, exactly + * mirroring what test_and_clear_young_ptes_notify() would have returned + * from the bare clear_flush_young_ptes_notify branch. + * + * Lock contract: caller holds the PTL and (via rmap_walk) one of the + * anon_vma / i_mmap rwsems. We DO NOT take any Marie lock here. 
+ * Promotion of folios with an external hotness signal happens out-of-
+ * band: folio_mark_accessed -> lru_marie_mark_accessed bumps the
+ * per-PFN tier (marie_state_inc_tier), and tier saturation triggers a
+ * synchronous marie_state_move_to_gen(pfn, head, 0) on the same path
+ * -- both operations are lock-free byte writes.  No new lock-ordering
+ * relationship between rmap and Marie state is introduced.
+ *
+ * NOTE: the "no Marie lock" rule has one visible exception --
+ * marie_bloom_set()'s lazy-allocation slow path takes the per-pgdat
+ * bloom spinlock (irqsave) while the PTL is held.
+ *
+ * @pvmw: page_vma_mapped_walk state of the rmap caller; ->pte,
+ *        ->address, ->vma, ->pfn and ->ptl are read here.
+ * @nr:   PTE count handed to test_and_clear_young_ptes_notify() for the
+ *        target mapping.  The parameter is then reused as the loop
+ *        stride (1, or the folio_pte_batch_flags() result for large
+ *        folios).
+ *
+ * NOTE(review): the local counter i is advanced by the loop but never
+ * read.
+ */
+#define MARIE_LOOK_AROUND_BATCH	BITS_PER_LONG	/* PTEs scanned per call */
+
+/*
+ * As of 7.1-rc1 both test_and_clear_young_ptes_notify() (mm/internal.h)
+ * and lazy_mmu_mode_enable()/disable() (include/linux/pgtable.h) are
+ * upstream, so the per-PTE emulation shim and the arch_*_lazy_mmu_mode
+ * aliases that the 6.12/6.18/7.0 backports carried are no longer needed
+ * here -- we call the upstream APIs directly.
+ */
+
+/* Forward decl: bloom Producer used by look_around.  Definition lives in
+ * the walker helpers section alongside the walker-side Consumer. */
+static void marie_bloom_set(int nid, unsigned long pmd_addr);
+
+bool lru_marie_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
+{
+	pte_t *pte = pvmw->pte;
+	unsigned long addr = pvmw->address;
+	unsigned long start, end;
+	struct vm_area_struct *vma = pvmw->vma;
+	struct folio *target = pfn_folio(pvmw->pfn);
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat = folio_pgdat(target);
+	int i;
+
+	lockdep_assert_held(pvmw->ptl);
+
+	/* Always clear the target folio's own young bit and propagate the
+	 * result to the caller, regardless of whether we go on to scan the
+	 * surrounding PMD.  A not-young target ends the call here: no bloom
+	 * feed and no neighbour scan. */
+	if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr))
+		return false;
+
+	/*
+	 * Producer: feed the per-pgdat bloom.  The target folio was young,
+	 * so this PMD has at least one hot PTE worth visiting on the next
+	 * walker pass.  This is the *only* rmap-side signal Marie gives the
+	 * walker -- look_around does NOT promote (no tier++, no
+	 * PG_referenced) on the surrounding folios; the walker handles
+	 * tier++ via young-bit detection on bloom-hit PMDs.
+	 */
+	marie_bloom_set(pgdat->node_id, addr & PMD_MASK);
+
+	/* If the PTL is contended skip the surrounding scan — somebody else
+	 * is waiting and we shouldn't extend our hold time. */
+	if (spin_is_contended(pvmw->ptl))
+		return true;
+
+	/* PFN-mapped VMAs don't carry struct folio backings on every PTE;
+	 * skip them rather than feed garbage to pfn_folio(). */
+	if (vma->vm_flags & VM_SPECIAL)
+		return true;
+
+	/* Compute a PMD-bounded surrounding range centred on @addr.  We
+	 * scan at most MARIE_LOOK_AROUND_BATCH PTEs and never cross either
+	 * the PMD or the VMA boundary.  A one-page range holds only the
+	 * target PTE, so there is nothing left to scan. */
+	start = max(addr & PMD_MASK, vma->vm_start);
+	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+
+	if (end - start == PAGE_SIZE)
+		return true;
+
+	if (end - start > MARIE_LOOK_AROUND_BATCH * PAGE_SIZE) {
+		if (addr - start < MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2)
+			end = start + MARIE_LOOK_AROUND_BATCH * PAGE_SIZE;
+		else if (end - addr < MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2)
+			start = end - MARIE_LOOK_AROUND_BATCH * PAGE_SIZE;
+		else {
+			start = addr - MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2;
+			end = addr + MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2;
+		}
+	}
+
+	memcg = get_mem_cgroup_from_folio(target);
+
+	lazy_mmu_mode_enable();
+
+	pte -= (addr - start) / PAGE_SIZE;
+
+	for (i = 0, addr = start; addr != end;
+	     i += nr, pte += nr, addr += nr * PAGE_SIZE) {
+		unsigned long pfn;
+		pte_t ptent = ptep_get(pte);
+		struct folio *folio;
+
+		nr = 1;
+
+		/* Inline minimal get_pte_pfn — vmscan.c's version is
+		 * MGLRU-static and we only need a subset of its checks. */
+		if (!pte_present(ptent))
+			continue;
+		if (pte_special(ptent))
+			continue;
+		pfn = pte_pfn(ptent);
+		if (is_zero_pfn(pfn))
+			continue;
+		if (!pfn_valid(pfn))
+			continue;
+		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+			continue;
+
+		folio = pfn_folio(pfn);
+		if (folio_nid(folio) != pgdat->node_id)
+			continue;
+
+		rcu_read_lock();
+		if (folio_memcg(folio) != memcg)
+			folio = NULL;
+		rcu_read_unlock();
+		if (!folio)
+			continue;
+
+		if (folio_test_large(folio)) {
+			const unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+			/*
+			 * folio_pte_batch_flags() (signature as carried since
+			 * the 6.18 backport) takes a pointer to a COPY of the
+			 * first pte (ptentp) and the FPB_* flag set directly.
+			 * FPB_MERGE_YOUNG_DIRTY merges across young/dirty
+			 * differences, matching the young-bit-agnostic
+			 * neighbour batching this look-around wants.  max_nr
+			 * caps the batch at the remaining scan window, so the
+			 * stride can never step past @end.
+			 */
+			nr = folio_pte_batch_flags(folio, NULL, pte, &ptent,
+						   max_nr, FPB_MERGE_YOUNG_DIRTY);
+		}
+
+		/* The target folio's young bit was already cleared above and
+		 * its referenced status will be re-derived by the caller from
+		 * our return value — don't double-clear it here. */
+		if (folio == target)
+			continue;
+
+		/*
+		 * Clear young bits across the surrounding PMD in batch.  We
+		 * deliberately do NOT touch any tier / PG_referenced state on
+		 * the neighbours here: under a fault-heavy allocator (memhog,
+		 * browser tab churn) every recently-faulted PTE has its young
+		 * bit set, and amplifying that into a hot signal on
+		 * ~MARIE_LOOK_AROUND_BATCH neighbours per rmap call cascades
+		 * through promote-in-place and starves the reclaim path of
+		 * evictable folios.  The only signal look_around emits for the
+		 * neighbours is the per-pgdat bloom (set above) — that tells
+		 * the next walker pass "this PMD had at least one hot PTE",
+		 * and the walker itself does per-PTE tier++ from young-bit
+		 * detection, preserving per-folio cardinality in the "hot"
+		 * signal that drives MARIE_TIER promotions.
+		 */
+		test_and_clear_young_ptes_notify(vma, addr, pte, nr);
+	}
+
+	lazy_mmu_mode_disable();
+	mem_cgroup_put(memcg);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(lru_marie_look_around);
+
+/*
+ * ---------------------------------------------------------------------
+ * Walker helpers: adaptive walker rate, per-pgdat state
+ * ---------------------------------------------------------------------
+ */
+
+/*
+ * Adaptive walker rate.  High pressure -> short interval
+ * (frequent walks -> fresh tier signal); idle -> long interval (don't
+ * burn CPU).  Returns jiffies until the next walker pass for this pgdat.
+ *
+ * Watermarks come from this node's ZONE_NORMAL when it is populated;
+ * otherwise from the node's first populated zone.  A node with no
+ * populated zone at all gets the idle interval.
+ *
+ * The free-page count is compared against min / low / high in that
+ * order, selecting the critical / low / normal interval; at or above
+ * the high watermark the idle interval is returned.
+ *
+ * NOTE(review): @free is the node-wide NR_FREE_PAGES sum
+ * (sum_zone_node_page_state) while the watermarks belong to a single
+ * zone, so on multi-zone nodes free pages of the other zones count
+ * towards that one zone's thresholds -- confirm this is intended.
+ *
+ * All four stage intervals are runtime-tunable via
+ * /sys/kernel/mm/lru_marie/walker_interval_{critical,low,normal,idle}_ms;
+ * defaults preserve the original HZ/30, HZ/10, HZ/4, HZ cadence.
+ */
+static unsigned long marie_walker_interval(struct pglist_data *pgdat)
+{
+	struct zone *zone = NULL;
+	unsigned long free, high, low, min;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *z = &pgdat->node_zones[zid];
+
+		if (!populated_zone(z))
+			continue;
+		if (zid == ZONE_NORMAL) {
+			zone = z;
+			break;
+		}
+		if (!zone)
+			zone = z;
+	}
+	if (!zone)
+		return READ_ONCE(marie_walker_interval_idle);
+
+	free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+	high = high_wmark_pages(zone);
+	low = low_wmark_pages(zone);
+	min = min_wmark_pages(zone);
+
+	if (free < min)
+		return READ_ONCE(marie_walker_interval_critical);
+	if (free < low)
+		return READ_ONCE(marie_walker_interval_low);
+	if (free < high)
+		return READ_ONCE(marie_walker_interval_normal);
+	return READ_ONCE(marie_walker_interval_idle);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Bloom filter -- rmap → walker forward feedback
+ * 
---------------------------------------------------------------------
+ *
+ * Per-pgdat probabilistic set of "PMDs the rmap path saw young in since
+ * the last walker pass."  Keyed by PMD index (>>PMD_SHIFT), m=1<<15
+ * (4 KiB per filter, 8 KiB per pgdat), k=2.  marie_bloom_keys() derives
+ * both keys from a single 30-bit hash_long() of the PMD index: the low
+ * 15 bits become key[0], the next 15 bits key[1].
+ *
+ * Producer: lru_marie_look_around() (rmap-side, runs under PTL during eviction
+ * folio_referenced walks).  Sets bits in @inactive.
+ *
+ * Consumer: marie_walk_pmd_range() (walker hot path).  Tests bits in
+ * @active before it takes the PTL.  Bloom miss -> skip the PMD's SIMD
+ * scan.
+ *
+ * Pass-end (marie_walk_pgdat): swap active/inactive under @lock and clear
+ * the new inactive.  The walker therefore reads the rmap feedback that
+ * accumulated during the previous reclaim window.
+ *
+ * @warmed_up is sticky-true after the first Producer write per pgdat
+ * and is readable through marie_bloom_warmed().  It was introduced as
+ * the force_scan kill-switch (scan every PMD until rmap has fed the
+ * filter at least once).
+ * NOTE(review): marie_walk_pgdat() in this file hard-codes
+ * force_scan = false and nothing in this file calls
+ * marie_bloom_warmed(), so that fallback is currently not wired up:
+ * while @active is NULL marie_bloom_test() returns false and the
+ * walker skips every PMD.
+ *
+ * Lazy alloc with GFP_ATOMIC -- look_around runs under PTL, so any
+ * sleeping alloc would deadlock.  Allocation failure leaves @inactive
+ * NULL; the next look_around call retries.
+ */
+#define MARIE_BLOOM_SHIFT	15
+#define MARIE_BLOOM_SIZE	(1U << MARIE_BLOOM_SHIFT)	/* 32K bits */
+
+struct marie_bloom {
+	spinlock_t lock;		/* serialises swap + alloc */
+	unsigned long *active;		/* read by walker */
+	unsigned long *inactive;	/* written by look_around */
+	bool warmed_up;			/* sticky: true after first Producer set */
+};
+
+static struct marie_bloom marie_blooms[MAX_NUMNODES];
+
+static inline void marie_bloom_keys(unsigned long pmd_addr, int *key)
+{
+	u32 hash = hash_long(pmd_addr >> PMD_SHIFT, MARIE_BLOOM_SHIFT * 2);
+
+	key[0] = hash & (MARIE_BLOOM_SIZE - 1);
+	key[1] = (hash >> MARIE_BLOOM_SHIFT) & (MARIE_BLOOM_SIZE - 1);
+}
+
+static unsigned long *marie_bloom_alloc_atomic(void)
+{
+	return bitmap_zalloc(MARIE_BLOOM_SIZE, GFP_ATOMIC);
+}
+
+/*
+ * Producer: feed @pmd_addr into pgdat @nid's inactive bloom.  Idempotent.
+ * Bitops are lock-free; only the lazy-alloc slow path takes b->lock.
+ * An out-of-range @nid is ignored.
+ *
+ * Fast path: @inactive already exists, so both key bits are set without
+ * the lock (test_bit first, so an already-set bit costs no atomic
+ * write) and @warmed_up is latched.
+ *
+ * NOTE(review): because the fast path does not take b->lock, a bit set
+ * concurrently with marie_bloom_swap() can land in the filter that is
+ * being zeroed or has just become @active -- presumably tolerated since
+ * the filter is advisory; confirm.
+ */
+static void marie_bloom_set(int nid, unsigned long pmd_addr)
+{
+	struct marie_bloom *b;
+	unsigned long *filter;
+	unsigned long flags;
+	int key[2];
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;
+	b = &marie_blooms[nid];
+
+	marie_bloom_keys(pmd_addr, key);
+
+	filter = READ_ONCE(b->inactive);
+	if (filter) {
+		if (!test_bit(key[0], filter))
+			set_bit(key[0], filter);
+		if (!test_bit(key[1], filter))
+			set_bit(key[1], filter);
+		if (!READ_ONCE(b->warmed_up))
+			WRITE_ONCE(b->warmed_up, true);
+		return;
+	}
+
+	/* Slow path: lazy allocate both bitmaps.  Only a missing @inactive
+	 * aborts; a missing @active is retried on a later slow-path call. */
+	spin_lock_irqsave(&b->lock, flags);
+	if (!b->inactive)
+		b->inactive = marie_bloom_alloc_atomic();
+	if (!b->active)
+		b->active = marie_bloom_alloc_atomic();
+	if (!b->inactive) {
+		spin_unlock_irqrestore(&b->lock, flags);
+		return;	/* OOM: retried by the next Producer call */
+	}
+	filter = b->inactive;
+	if (!test_bit(key[0], filter))
+		set_bit(key[0], filter);
+	if (!test_bit(key[1], filter))
+		set_bit(key[1], filter);
+	b->warmed_up = true;
+	spin_unlock_irqrestore(&b->lock, flags);
+}
+
+/*
+ * Consumer: walker hot path.  Returns true iff both key bits of
+ * @pmd_addr are set in pgdat @nid's active bloom (false positives are
+ * possible, as with any bloom filter).  NULL active or an out-of-range
+ * @nid -> false.  Lock-free: only READ_ONCE() and test_bit().
+ */
+static bool marie_bloom_test(int nid, unsigned long pmd_addr)
+{
+	unsigned long *filter;
+	int key[2];
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return false;
+
+	filter = READ_ONCE(marie_blooms[nid].active);
+	if (!filter)
+		return false;
+
+	marie_bloom_keys(pmd_addr, key);
+	return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+/*
+ * Pass-end: swap active <- inactive, clear new inactive.  Called from
+ * marie_walk_pgdat under no other lock.  Either pointer may be NULL
+ * (allocation failed or never attempted); the swap still happens and
+ * only a non-NULL new inactive is zeroed.
+ */
+static void marie_bloom_swap(int nid)
+{
+	struct marie_bloom *b;
+	unsigned long *tmp;
+	unsigned long flags;
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;
+	b = &marie_blooms[nid];
+
+	spin_lock_irqsave(&b->lock, flags);
+	tmp = b->active;
+	b->active = b->inactive;
+	b->inactive = tmp;
+	if (b->inactive)
+		bitmap_zero(b->inactive, MARIE_BLOOM_SIZE);
+	spin_unlock_irqrestore(&b->lock, flags);
+}
+
+static inline bool marie_bloom_warmed(int nid)
+{
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return false;
+	return READ_ONCE(marie_blooms[nid].warmed_up);
+}
+
+/*
+ * Per-CPU walk context: a preallocated mm snapshot buffer so the walker
+ * doesn't kmalloc inside its hot entry path (the walker can be entered
+ * from direct reclaim, where allocator recursion is disallowed).
+ * + * Ownership is established by marie_walker_busy below: the pass owner + * pins to its CPU via migrate_disable() and claims the per-CPU ctx + * with this_cpu_cmpxchg(marie_walker_busy, 0, 1). A preempted-and- + * resumed reclaimer that reaches marie_walk_pgdat on the same CPU will + * find the flag set and bail, preventing concurrent reuse of the + * snapshot buffer. The walker pass itself stays preemptible so + * cond_resched() inside marie_walk_pmd_range remains effective. + * + * marie_walker_next[] lives in the walker section below alongside the + * rest of the walker state. + */ +#define MARIE_WALK_MAX_MMS 256 + +struct marie_walk_ctx { + struct mm_struct *mms[MARIE_WALK_MAX_MMS]; + int n_mms; +}; + +static DEFINE_PER_CPU(struct marie_walk_ctx, marie_walker_ctx); +static DEFINE_PER_CPU(unsigned int, marie_walker_busy); + +/* + * --------------------------------------------------------------------- + * Walker -- SIMD + adaptive + per-pgdat + * --------------------------------------------------------------------- + * + * Per pgdat, rate-limited via marie_walker_interval (HZ/30 .. HZ). + * Each PMD scans young bits in batch via lru_marie_simd_young_pte_mask_raw + * (AVX-512F / AVX2 / SSE2 on x86; scalar fallback on arm64 and other + * arches), with an enclosing FPU bracket batched across MARIE_FPU_BATCH + * scans. Cross-node folios are filtered out so each pgdat owns its + * work cleanly. + * + * Walker tier promotion is synchronous: marie_tier_inc + * (a thin wrapper around marie_state_inc_tier) handles both the + * non-saturated bump and the saturate -> in-place promote inside + * the same call, so there is no per-CPU promote queue and no + * pass-end promote drain. 
+ * + * Lock contract: + * per-PMD: holds the existing pte_offset_map_lock ptl + * per-PTE body: lock-free -- marie_state_inc_tier mutates only the + * per-PFN state byte + * walker_visits: lock-free atomic_inc on the global + * marie_gen_walker_visits[gen][type] counter (read as a + * >= 1 boolean; reset in marie_try_advance_head) + * bloom rotation: per-pgdat marie_blooms[nid].lock (irqsave), taken only + * for lazy alloc / pass-end swap + * per-pgdat deadline: cmpxchg on marie_walker_next[nid] + * + * Lock ordering: the walker takes the pte ptl and, under it, at most the + * per-pgdat bloom lock (a leaf). It takes NO lru_lock and NO per-type + * lock anywhere, so it does not participate in -- and cannot invert -- + * Marie's lru_lock -> type_lock hierarchy. + */ + +/* + * Per-pgdat walker deadline (jiffies). One pass per pgdat per + * marie_walker_interval(pgdat) is allowed; concurrent reclaimers / + * kswapd cycles atomic-cmpxchg to claim the slot. + * + * MARIE_WALK_MAX_MMS bounds the per-pass task snapshot (see the per-CPU + * marie_walker_ctx definition earlier in this file). + */ +static atomic_long_t marie_walker_next[MAX_NUMNODES]; + +/* + * MARIE_FPU_BATCH — number of consecutive bloom-hit PMDs scanned under + * a single FPU bracket before flushing. + * + * Trade-off: larger batch amortises kernel_fpu_begin/end (~100 ns each + * on x86 xsave/xrstor) across more scans, but extends the preempt- + * disabled window proportionally because the bitmap iteration runs + * inside the same PTL window with FPU still held. With per-PMD total + * cost ~1-25 µs (dominated by marie_state_inc_tier on set bits), a + * batch of 16 gives a worst-case preempt window of ~400 µs -- well + * within tolerance and acceptable for desktop-grade preemption. + * + * Bloom misses do NOT advance the counter (the FPU bracket has not been + * opened on those PMDs), so the batch is purely "PMDs we actually + * scanned in a row". 
+ */ +#define MARIE_FPU_BATCH 16 + +/* + * Per-walk FPU batch state. Lives inside marie_walk_arg so it is reset + * automatically each marie_walk_pgdat() invocation; the scoped_guard + * around walk_page_range() in marie_walk_one_mm() flushes any partial + * batch when the per-mm walk exits, guaranteeing FPU is released + * before mmap_read_unlock and before the next mm starts. + */ +struct marie_fpu_batch { + unsigned int count; /* scans in current bracket */ + bool held; /* FPU bracket currently open */ +}; + +static inline void marie_fpu_batch_open(struct marie_fpu_batch *b) +{ + if (!b->held) { + lru_marie_simd_batch_begin(); + b->held = true; + } +} + +static inline void marie_fpu_batch_step(struct marie_fpu_batch *b) +{ + if (++b->count >= MARIE_FPU_BATCH) { + lru_marie_simd_batch_end(); + b->held = false; + b->count = 0; + } +} + +static inline void marie_fpu_batch_flush(struct marie_fpu_batch *b) +{ + if (b->held) { + lru_marie_simd_batch_end(); + b->held = false; + b->count = 0; + } +} + +/* + * scoped_guard(marie_fpu_batch, &arg->fpu) — entry is a no-op (the + * walker opens the bracket lazily on the first bloom-hit PMD); exit + * flushes any in-flight bracket. Wrapped around walk_page_range() in + * marie_walk_one_mm() so an early return / fault from the underlying + * mm walker still releases FPU before mmap_read_unlock. 
+ */ +DEFINE_GUARD(marie_fpu_batch, struct marie_fpu_batch *, + (void)_T, + marie_fpu_batch_flush(_T)) + +struct marie_walk_arg { + struct pglist_data *pgdat; + struct marie_lruvec *mlv_anon_cache; /* one-deep lruvec cache */ + struct marie_lruvec *mlv_file_cache; + bool force_scan; /* bypass bloom gate */ + struct marie_fpu_batch fpu; /* per-walk FPU bracket state */ +}; + +static int marie_walk_pmd_range(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct marie_walk_arg *arg = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *pte_table, *orig_pte; + spinlock_t *ptl; + unsigned long pmd_addr; + unsigned long bitmap[MARIE_SIMD_PTE_BITMAP_LONGS] = { 0 }; + int bit; + + if (!vma) + return 0; + + pmd_addr = start & PMD_MASK; + + /* + * Bloom gate (Consumer side of rmap → walker forward feedback). + * Skip PMDs the rmap path has not flagged as recently-young; the + * SIMD scan + tier++ work is paid only for hot PMDs. force_scan + * bypasses the gate during cold-boot / freshly-online pgdats where + * the bloom has yet to be primed by look_around. Returning before + * pte_offset_map_lock() avoids the PTL cost on misses too. + */ + if (!arg->force_scan && + !marie_bloom_test(arg->pgdat->node_id, pmd_addr)) + return 0; + + /* + * pte_offset_map_lock returns pte_base + pte_index(start), which may + * not be at the start of the page table. The SIMD kernel must receive + * the page-table base (index 0) so that: + * (a) the 512-entry scan does not walk past the end of the page, and + * (b) bit N in the output bitmap corresponds to pte_base[N], making + * "pte_base + bit" the correct per-entry pointer in the loop. + * + * Keep orig_pte (= pte_base + pte_index(start)) for pte_unmap_unlock. + */ + pte_table = pte_offset_map_lock(walk->mm, pmd, start, &ptl); + if (!pte_table) + return 0; + orig_pte = pte_table; + + /* + * Open the FPU bracket lazily on the first bloom-hit PMD of this + * batch and run the raw scan inside it. 
marie_fpu_batch_step() + * closes the bracket once we've accumulated MARIE_FPU_BATCH + * scans; subsequent bloom hits reopen for the next batch. The + * scoped_guard around walk_page_range() in marie_walk_one_mm() + * flushes any partial batch on walker exit. + */ + marie_fpu_batch_open(&arg->fpu); + lru_marie_simd_young_pte_mask_raw(pte_table - pte_index(start), bitmap); + marie_fpu_batch_step(&arg->fpu); + + for_each_set_bit(bit, bitmap, 512) { + unsigned long addr = pmd_addr + bit * PAGE_SIZE; + pte_t *pte = orig_pte - pte_index(start) + bit; + pte_t ptent; + unsigned long pfn; + struct folio *folio; + struct marie_lruvec *mlv; + struct lruvec *lv; + int type; + int next_bit; + + /* Peek ahead at the next set bit and prefetch its + * struct page into L1. The body chain below does multiple + * folio-struct accesses (folio_pgdat, marie_test_tracked, + * folio_is_file_lru, folio_lruvec, marie_state_inc_tier). + * Sparse bitmap iteration defeats the hardware prefetcher, + * so an explicit lookahead hides struct page L2/L3 latency. */ + next_bit = find_next_bit(bitmap, 512, bit + 1); + if (next_bit < 512) { + pte_t next_ptent = ptep_get(orig_pte - pte_index(start) + + next_bit); + unsigned long next_pfn = pte_pfn(next_ptent); + + if (pte_present(next_ptent) && pfn_valid(next_pfn)) + __builtin_prefetch(pfn_to_page(next_pfn), 0, 3); + } + + /* Only process PTEs within the [start, end) walk range. */ + if (addr < start || addr >= end) + continue; + + ptent = ptep_get(pte); + if (!pte_present(ptent) || pte_special(ptent)) + continue; + + pfn = pte_pfn(ptent); + if (is_zero_pfn(pfn) || !pfn_valid(pfn)) + continue; + + folio = pfn_folio(pfn); + + /* Skip cross-node folios -- this pass is per pgdat. */ + if (folio_pgdat(folio) != arg->pgdat) + continue; + + /* Lock-free pre-filter: act only on folios Marie is currently + * tracking. 
folio->lru is no longer a Marie-state signal -- + * tracked folios sit on a self-loop (post install/flush) or + * on legacy lruvec->lists[lru] (post drain); only the per-PFN + * TRACKED bit identifies "Marie has a live tier value for + * this folio." */ + if (!folio_marie_test_tracked(folio)) + continue; + + if (!ptep_test_and_clear_young(vma, addr, pte)) + continue; + + type = folio_is_file_lru(folio); + lv = marie_folio_lruvec_rcu(folio); + + /* One-deep cache: most consecutive PTEs in an mm walk hit + * the same lruvec. Avoid the xa_load on every PTE. */ + if (type == 0 && arg->mlv_anon_cache && + arg->mlv_anon_cache->lruvec == lv) { + mlv = arg->mlv_anon_cache; + } else if (type == 1 && arg->mlv_file_cache && + arg->mlv_file_cache->lruvec == lv) { + mlv = arg->mlv_file_cache; + } else { + /* + * lv->marie_mlv is authoritative (no side xarray) and + * stays valid for the rest of this pass without RCU: + * the folios being walked are mapped in this mmget'd mm + * and therefore charged, which pins their memcg -> + * lruvec -> mlv alive (mlv is freed only at + * mem_cgroup_free). + */ + mlv = READ_ONCE(lv->marie_mlv); + if (!mlv) + continue; + if (type == 0) + arg->mlv_anon_cache = mlv; + else + arg->mlv_file_cache = mlv; + } + + /* + * Walker tier++: marie_tier_inc is a thin + * wrapper over marie_state_inc_tier on the per-PFN byte; + * non-saturated bumps are a lock-free WRITE_ONCE and the + * saturate path triggers a synchronous in-place promote + * (marie_state_move_to_gen to head_gen at tier 0). No + * per-tier counter to update -- tier lives entirely in + * marie_state[pfn] and the per-(type, gen, tier) bitmap + * is moved alongside the byte by the helper itself. 
+ */ + (void)marie_tier_inc(mlv, folio); + } + + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + return 0; +} + +static const struct mm_walk_ops marie_walk_ops = { + .pmd_entry = marie_walk_pmd_range, + .walk_lock = PGWALK_RDLOCK, +}; + +static void marie_walk_one_mm(struct mm_struct *mm, struct marie_walk_arg *arg) +{ + if (!mmap_read_trylock(mm)) + return; + scoped_guard(marie_fpu_batch, &arg->fpu) + walk_page_range(mm, 0, TASK_SIZE, &marie_walk_ops, arg); + mmap_read_unlock(mm); +} + +/** + * marie_walk_pgdat - run one walker pass for @pgdat. + * + * Atomically claims the per-pgdat deadline; concurrent reclaimers / + * kswapd cycles either advance the deadline or no-op. The walker + * snapshots running mm_struct's via for_each_process under RCU, + * walks each via the SIMD pmd_entry handler, then drains both + * per-CPU queues for every mlv on this pgdat. + * + * Safe from any context that allows brief sleeping (cond_resched in + * the inner walk). + */ +void marie_walk_pgdat(struct pglist_data *pgdat) +{ + int nid = pgdat->node_id; + unsigned long deadline; + struct marie_walk_ctx *ctx; + struct task_struct *p; + struct marie_walk_arg arg = { + .pgdat = pgdat, + /* + * force_scan disabled: the cold-bloom force_scan was the + * dominant kswapd startup latency under fault-burst (full + * PMD scan = ~45 ms on memhog 2.5 GB). The walker's role is + * purely tier promotion; folio_check_references' Marie gate + * (vmscan.c) is independent of walker state because + * lru_marie_mark_accessed funnels external access into tier rather + * than PG_referenced, so reclaim functions correctly even + * with an unprimed bloom. Bloom is warmed lazily by + * lru_marie_look_around during the first reclaim cycle's rmap walk. + */ + .force_scan = false, + }; + int i; + + if (nid >= MAX_NUMNODES) + return; /* defensive */ + + /* Atomic test-and-claim deadline for this pgdat. 
*/ + deadline = (unsigned long)atomic_long_read(&marie_walker_next[nid]); + if (time_before(jiffies, deadline)) + return; + if ((unsigned long)atomic_long_cmpxchg(&marie_walker_next[nid], + (long)deadline, + (long)(jiffies + marie_walker_interval(pgdat))) != deadline) + return; /* lost race to another reclaimer */ + + /* + * Pin to this CPU and reentrancy-claim its per-CPU walker ctx. + * The walker iterates up to MARIE_WALK_MAX_MMS mm_struct's per pass + * and walks each up to TASK_SIZE; running the entire pass with + * preempt_disable() makes cond_resched() inside marie_walk_pmd_range + * a no-op and starves the rest of the system to RCU stall under + * sustained memory pressure (observed as desktop stutter then + * freeze on real hardware). migrate_disable() keeps us on the + * CPU whose marie_walker_ctx we own, while marie_walker_busy stops a + * preempted-and-resumed reclaimer from reaching marie_walk_pgdat + * for a different pgdat on the same CPU and clobbering the + * in-flight snapshot. + */ + migrate_disable(); + if (this_cpu_cmpxchg(marie_walker_busy, 0, 1) != 0) { + migrate_enable(); + return; + } + + ctx = this_cpu_ptr(&marie_walker_ctx); + ctx->n_mms = 0; + + rcu_read_lock(); + for_each_process(p) { + struct mm_struct *mm = READ_ONCE(p->mm); + + if (!mm || ctx->n_mms >= MARIE_WALK_MAX_MMS) + continue; + if (!mmget_not_zero(mm)) + continue; + ctx->mms[ctx->n_mms++] = mm; + } + rcu_read_unlock(); + + /* + * Walk preemptibly. FPU bracket is held across MARIE_FPU_BATCH + * consecutive bloom-hit PMDs (see marie_fpu_batch_* helpers above) + * and flushed by the scoped_guard around walk_page_range() inside + * marie_walk_one_mm(), so the per-mm walk always exits with FPU + * released. 
Whole-pass FPU scope is avoided because the bitmap + * iteration runs inside the bracket and would extend the + * preempt-disabled window by the full iteration (~100 ms on memhog); + * the batched-per-PMD scope keeps preempt windows bounded at + * MARIE_FPU_BATCH x per-PMD time. + */ + for (i = 0; i < ctx->n_mms; i++) { + marie_walk_one_mm(ctx->mms[i], &arg); + /* + * mmput_async, not mmput: if our mmget_not_zero above pinned the + * last reference (the owning task exited mid-walk), a plain mmput + * here drops to zero and enters __mmput -> exit_mmap, which takes + * mm->mmap_lock. marie_walk_pgdat runs from kswapd's balance_pgdat + * with fs_reclaim held; taking mmap_lock under fs_reclaim closes + * the cycle against the execve path that takes mmap_lock then + * allocates (fs_reclaim) via mas_alloc_nodes. Caught by lockdep + * as a circular dependency and reproduced as a desktop hang under + * memory pressure with concurrent fork/exec. MGLRU solves the + * same problem the same way in iterate_mm_list (mm/vmscan.c). + */ + mmput_async(ctx->mms[i]); + } + + /* + * Pass-end housekeeping: mark every (gen, type) as walker-visited. + * + * marie_gen_walker_visits is a GLOBAL [gen][type] counter read only + * as a >= 1 boolean -- marie_state_shrink_lruvec gates + * ignore_references on "walker has visited this gen at least once", + * and the slot is reset to 0 in marie_try_advance_head on recycle. + * A single bump per pass is therefore equivalent to the former + * per-mlv bump, and there is no per-mlv state to touch here: walker + * tier saturate is materialised inline by marie_state_inc_tier + * during the per-PMD walk, so no promote-queue drain accumulates. + * + * This replaces the former xa_for_each(&marie_lruvec_xa) -- the only + * full traversal of that xarray, and the rcu_read_lock it ran under. + * mlv lifetime is now tied to the lruvec/memcg (lv->marie_mlv), so + * neither the traversal nor marie_drop_lruvec's synchronize_rcu() is + * needed. 
+ */ + { + int t, g; + + for (t = 0; t < ANON_AND_FILE; t++) + for (g = 0; g < MARIE_PFN_NR_GENS; g++) + atomic_inc(&marie_gen_walker_visits[g][t]); + } + + /* + * Pass-end bloom rotation: the inactive filter has accumulated + * Producer (look_around) feedback during this reclaim window; + * promote it to active so the next pass scans those PMDs. The + * old active is recycled as the new inactive, cleared of stale + * bits. + */ + marie_bloom_swap(nid); + + /* Release the per-CPU ctx claim before allowing migration. */ + this_cpu_write(marie_walker_busy, 0); + migrate_enable(); +} + +/** + * lru_marie_age_node - kswapd's pre-reclaim aging hook. + * + * MGLRU's `lru_gen_age_node()` analogue. Called from kswapd_age_node() + * before direct reclaim machinery runs, so the gen ring has fresh + * hot/cold ordering by the time pressure builds. Delegates to the + * per-pgdat walker; rate-limiting is internal so calling on every + * kswapd cycle is fine. + */ +void lru_marie_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + marie_walk_pgdat(pgdat); +} +EXPORT_SYMBOL_GPL(lru_marie_age_node); + +/** + * marie_walker_init - one-shot init for the walker. + * + * Initialises per-pgdat bloom-filter spinlocks. Bitmaps themselves + * are lazily allocated by marie_bloom_set() on first Producer hit + * (under PTL, GFP_ATOMIC). Called from marie_init() in mm/lru_marie/core.c. 
+ */ +void marie_walker_init(void) +{ + int nid; + + for (nid = 0; nid < MAX_NUMNODES; nid++) + spin_lock_init(&marie_blooms[nid].lock); +} diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 433bba9dfe..bed1c07fe4 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "swap.h" @@ -2000,6 +2001,18 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, } else WRITE_ONCE(vm_swappiness, val); +#ifdef CONFIG_LRU_MARIE + /* + * Notify Marie so its per-lruvec swap_bias controllers reset to + * neutral under the new value. lru_marie_swappiness_changed + * walks every lruvec once -- the reset is unconditional, not + * scoped to @memcg, because the controller's only state is the + * bias counter and an extra reset on unaffected lruvecs is + * harmless. See lru_marie.h. + */ + lru_marie_swappiness_changed(); +#endif + return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 749c128b4f..9f8f52c8ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -292,6 +293,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg) lru_reparent_memcg(memcg, parent, nid); } +#ifdef CONFIG_LRU_MARIE + /* + * Marie tracks folios in its own per-PFN / per-mlv structures, + * independent of MGLRU and legacy, so its reparent must run + * regardless of which branch above fired (the lru_gen branch + * never calls lru_reparent_memcg, where the helper is otherwise + * documented to live). reparent_locks() above holds objcg_lock + * plus both lruvecs' lru_lock with IRQs disabled, exactly + * matching lru_marie_reparent_lruvec's contract; the call + * short-circuits when lru_marie is off or the child has no mlv. + * + * This is mandatory on 7.1: cgroup_rstat_exit() now runs before + * css_free(), so a Marie counter update on the child after this + * point (e.g. 
from lru_marie_exit_memcg) would hit a freed + * css_rstat_cpu and crash in css_rstat_updated(). Draining the + * child's Marie state into the parent here prevents that. + */ + lru_marie_reparent_lruvec(mem_cgroup_lruvec(memcg, NODE_DATA(nid)), + mem_cgroup_lruvec(parent, NODE_DATA(nid))); +#endif + objcg = __memcg_reparent_objcgs(memcg, parent, nid); reparent_unlocks(memcg, parent, nid); @@ -3967,6 +3989,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { lru_gen_exit_memcg(memcg); +#ifdef CONFIG_LRU_MARIE + lru_marie_exit_memcg(memcg); +#endif memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -4040,6 +4065,15 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) memcg->deferred_split_queue.split_queue_len = 0; #endif lru_gen_init_memcg(memcg); +#ifdef CONFIG_LRU_MARIE + /* + * Per-memcg bitmap (G) alloc: no-op unless lru_marie.memcg_bitmap=1 + * is set on the kernel cmdline. Alloc failure is non-fatal -- the + * scan path falls back to per-candidate folio_memcg() lookup + * (hybrid E semantics). 
+ */ + (void)lru_marie_memcg_alloc(memcg); +#endif return memcg; fail: mem_cgroup_private_id_remove(memcg); @@ -4199,6 +4233,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); + lru_marie_offline_memcg(memcg); drain_all_stock(memcg); diff --git a/mm/mm_init.c b/mm/mm_init.c index f9f8e1af92..5855f9ef89 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1395,6 +1395,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_init_kcompactd(pgdat); init_waitqueue_head(&pgdat->kswapd_wait); +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) + init_waitqueue_head(&pgdat->kcompmari_wait); +#endif init_waitqueue_head(&pgdat->pfmemalloc_wait); for (i = 0; i < NR_VMSCAN_THROTTLE; i++) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23c7298d3b..1978c03b10 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -1314,6 +1315,28 @@ __always_inline bool __free_pages_prepare(struct page *page, trace_mm_page_free(page, order); kmsan_free_page(page, order); +#ifdef CONFIG_LRU_MARIE + /* + * Wipe Marie's per-PFN state at the buddy handoff. Marie's reclaim + * isolate path intentionally leaves marie_state[pfn]'s TRACKED bit + * set across shrink_folio_list (so install_local's TRACKED early- + * out keeps blocking concurrent installs on the in-flight folio); + * this hook is the canonical point at which that stale bit must + * disappear so the next allocation at this PFN starts clean. No-op + * when TRACKED is already 0 (normal Marie del path cleared it). + * Only the head PFN ever carries TRACKED for compound folios. 
+ * + * Gated on marie_state_ready() (latched at marie_state[] alloc), + * NOT lru_marie_enabled(): a Marie disable transition flips the + * enable key false while marie_drain is still walking the bitmaps, + * and freed pages in that window must still have their stale + * TRACKED bits wiped here -- otherwise the drain walk dereferences + * a re-allocated folio's poisoned list head and oopses. + */ + if (marie_state_ready()) + lru_marie_free_page_hook(page_to_pfn(page)); +#endif + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); @@ -4595,6 +4618,45 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, if (*no_progress_loops > MAX_RECLAIM_RETRIES) goto out; +#ifdef CONFIG_LRU_MARIE + /* + * Marie swap-backend-failure OOM trigger. + * + * Catches "get_nr_swap_pages() > 0 but writes still fail" — primarily + * ZRAM/zswap zs_malloc starvation when free RAM cannot satisfy the + * compression buffer, but also disk swap I/O errors. In this state + * can_reclaim_anon_pages() still reports true (slots appear free), so + * the pick driver keeps attempting anon swapout that never completes; + * left to run it grinds the file working set down to the clean_min_ratio + * floor before the no-progress path finally OOMs. Trip OOM as soon as + * the backend has rejected more than MAX_SWAP_WRITE_FAIL_RETRIES writes + * during this allocation, well before that grind. The threshold + * tolerates a handful of transient failures (concurrent ZRAM ops, brief + * retry windows). + * + * The free+swap exhaustion case needs no early watermark here: the pick + * driver and the legacy drain both withhold file reclaim at the + * clean_min_ratio floor (marie_file_floor_protect), so once file is at + * the floor and anon is unreclaimable reclaim returns no progress and + * the stock no_progress_loops path OOMs promptly — mirroring le9uo's + * reliance on the no-progress path at any floor size. 
+ * + * Skipped for reserve / OOM-victim allocations (ALLOC_OOM, + * ALLOC_NO_WATERMARKS, tsk_is_oom_victim): those contexts exist to let a + * dying system make forward progress. MGLRU/Legacy builds + * (lru_marie_enabled()=false) keep vanilla retry semantics so this does + * not leak into baseline comparisons. + */ + if (lru_marie_enabled() && + likely(!(alloc_flags & (ALLOC_OOM | ALLOC_NO_WATERMARKS))) && + likely(!tsk_is_oom_victim(current))) { + long swap_fail_delta = atomic_long_read(&nr_swap_write_failed) - + ac->initial_swap_write_failed; + + if (swap_fail_delta > MAX_SWAP_WRITE_FAIL_RETRIES) + goto out; + } +#endif /* * Keep reclaiming pages while there is a chance this will lead @@ -4718,6 +4780,20 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, WARN_ON_ONCE(current->flags & PF_MEMALLOC); } +#ifdef CONFIG_LRU_MARIE + /* + * Snapshot the global swap-write-fail counter at the start of this + * allocation. should_reclaim_retry compares against this baseline so + * "swap backend rejected N writes since I started trying" can short- + * circuit the MAX_RECLAIM_RETRIES wait. See include/linux/swap.h. + * + * Snapshot unconditionally under CONFIG_LRU_MARIE so the field is in a + * defined state even if the lru_marie_enabled() gate flips between + * here and should_reclaim_retry's read. + */ + ac->initial_swap_write_failed = atomic_long_read(&nr_swap_write_failed); +#endif + restart: compaction_retries = 0; no_progress_loops = 0; diff --git a/mm/page_io.c b/mm/page_io.c index 70cea9e24d..3c958fbda1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,8 +25,19 @@ #include #include #include +#include +#include #include "swap.h" +#ifdef CONFIG_LRU_MARIE +/* + * Counter consumed by the early-OOM gate in + * mm/page_alloc.c:should_reclaim_retry. Declared in include/linux/swap.h. + * Marie-only: omitted entirely under CONFIG_LRU_MARIE=n. 
+ */ +atomic_long_t nr_swap_write_failed = ATOMIC_LONG_INIT(0); +#endif + static void __end_swap_bio_write(struct bio *bio) { struct folio *folio = bio_first_folio_all(bio); @@ -39,7 +50,21 @@ static void __end_swap_bio_write(struct bio *bio) * very quickly. * * Also clear PG_reclaim to avoid folio_rotate_reclaimable() + * + * Bump nr_swap_write_failed so the early-OOM gate in + * should_reclaim_retry can short-circuit the + * MAX_RECLAIM_RETRIES wait when the swap backend (most + * commonly ZRAM/zswap zs_malloc, or a real disk error) has + * stopped accepting writes — anon reclaim is doomed in that + * state regardless of get_nr_swap_pages() reporting free + * entries. Marie-only signal; vanilla MGLRU/Legacy builds + * (lru_marie_enabled()=false) skip the counter bump so the + * baseline allocator sees vanilla retry behaviour. */ +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + atomic_long_inc(&nr_swap_write_failed); +#endif folio_mark_dirty(folio); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), @@ -233,6 +258,120 @@ static void swap_zeromap_folio_clear(struct folio *folio) } } +/* + * do_swapout() - Write a folio to swap space + * @folio: The folio to write out + * + * This function writes the folio to swap space, either using zswap or + * synchronous write. It ensures that the folio is unlocked and the + * reference count is decremented after the operation. 
+ */ +static inline void do_swapout(struct folio *folio, struct swap_iocb **swap_plug) +{ + if (zswap_store(folio)) { + count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); + folio_unlock(folio); + } else + __swap_writepage(folio, swap_plug); /* Implies folio_unlock(folio) */ + + /* Drop the reference the enqueuer took for the folio's time in the fifo */ + folio_put(folio); +} + +#ifdef CONFIG_LRU_MARIE +/* + * kcompmari_store() - Off-load folio compression to kcompmari + * @folio: The folio to compress + * + * This function attempts to off-load the compression of the folio to + * kcompmari. If kcompmari is not available or the folio cannot be + * off-loaded, it returns false so the caller writes synchronously. + * + * Returns true if the folio was successfully queued for compression, + * false otherwise. + */ +static bool kcompmari_store(struct folio *folio, struct swap_iocb **swap_plug) +{ + pg_data_t *pgdat = NODE_DATA(numa_node_id()); + unsigned int ret; + struct folio *head = NULL; + + /* Only kswapd can use kcompmari */ + if (!current_is_kswapd()) + return false; + + /* Mode 0, or mode 1 with Marie off — short-circuit on the static branches. */ + if (!kcompmari_active()) + return false; + + /* kthread must be running */ + if (unlikely(!pgdat->kcompmari)) + return false; + + /* We can only off-load anon folios */ + if (!folio_test_anon(folio)) + return false; + + /* zswap writeback disabled for this memcg: decline, so the synchronous + * path can return AOP_WRITEPAGE_ACTIVATE. folio_memcg -> obj_cgroup_memcg + * requires RCU read-side held to keep objcg from being freed by a + * concurrent memcg teardown (lockdep_assert_once in obj_cgroup_memcg).
*/ + { + bool zswap_wb_ok; + + rcu_read_lock(); + zswap_wb_ok = mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)); + rcu_read_unlock(); + if (!zswap_wb_ok) + return false; + } + + /* Swap device must be sync-efficient */ + if (!zswap_is_enabled() && + !data_race(__swap_entry_to_info(folio->swap)->flags & SWP_SYNCHRONOUS_IO)) + return false; + + /* + * The kfifo backing storage is sized at KCOMPMARI_FIFO_SIZE (the + * compile-time max). The effective queue depth is |vm_kcompmari|; + * when current depth meets or exceeds that, treat the queue as + * full and swap out the head folio synchronously to make space. + */ + scoped_guard(spinlock_irqsave, &pgdat->kcompmari_fifo_lock) + if (kfifo_len(&pgdat->kcompmari_fifo) >= + abs(READ_ONCE(vm_kcompmari)) * sizeof(struct folio *) && + unlikely(!kfifo_out(&pgdat->kcompmari_fifo, + &head, sizeof(folio)))) + return false; + + /* Increment the folio reference count to avoid it being freed */ + folio_get(folio); + + /* Enqueue the folio for compression */ + ret = kfifo_in(&pgdat->kcompmari_fifo, &folio, sizeof(folio)); + if (likely(ret)) + /* We successfully enqueued the folio. wake up kcompmari */ + wake_up_interruptible(&pgdat->kcompmari_wait); + else + /* Enqueue failed, so we must cancel the reference count */ + folio_put(folio); + + /* If we had to swap out the head folio, do it now. + * This will block until the folio is written out. + */ + if (head) + do_swapout(head, swap_plug); + + return ret; +} +#else /* !CONFIG_LRU_MARIE */ +static inline bool kcompmari_store(struct folio *folio, + struct swap_iocb **swap_plug) +{ + return false; +} +#endif + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. 
@@ -272,6 +411,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) */ swap_zeromap_folio_clear(folio); + /* + * Compression within zswap and zram might block rmap, unmap + * of both file and anon pages, try to do compression async + * if possible + */ + if (kcompmari_store(folio, swap_plug)) + return 0; + if (zswap_store(folio)) { count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; @@ -292,6 +439,46 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) return ret; } +#ifdef CONFIG_LRU_MARIE +/* + * kcompmari() - Kernel thread for compressing folios + * @p: Pointer to pg_data_t structure + * + * This function runs in a kernel thread and sleeps until folios are + * queued for compression or kthread_stop() is called. It drains the queue + * by calling do_swapout() on each folio, which writes it to swap space. + */ +int kcompmari(void *p) +{ + pg_data_t *pgdat = (pg_data_t *)p; + struct folio *folio; + + /* kcompmari runs with PF_MEMALLOC and PF_KSWAPD flags set to + * allow it to allocate memory for compression without being + * restricted by the current memory allocation context. + * Also PF_KSWAPD prevents Intel Graphics driver from crashing + * the system in i915_gem_shrinker.c:i915_gem_shrinker_scan() + */ + current->flags |= PF_MEMALLOC | PF_KSWAPD; + + while (!kthread_should_stop()) { + wait_event_interruptible(pgdat->kcompmari_wait, + kthread_should_stop() || !kfifo_is_empty(&pgdat->kcompmari_fifo)); + + while (kfifo_out_locked(&pgdat->kcompmari_fifo, + &folio, sizeof(folio), &pgdat->kcompmari_fifo_lock)) + /* + * kcompmari is async reclaim writeback; pass a NULL + * swap_plug so __swap_writepage submits each folio's + * bio immediately rather than batching it on a plug + * the caller would have to unplug.
+ */ + do_swapout(folio, NULL); + } + return 0; +} +#endif /* CONFIG_LRU_MARIE */ + static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/rmap.c b/mm/rmap.c index 78b7fb5f36..0cfaf5bcce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -981,6 +982,11 @@ static bool folio_referenced_one(struct folio *folio, if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) { if (lru_gen_look_around(&pvmw, nr)) referenced++; +#ifdef CONFIG_LRU_MARIE + } else if (lru_marie_enabled() && pvmw.pte) { + if (lru_marie_look_around(&pvmw, nr)) + referenced++; +#endif } else if (pvmw.pte) { if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; diff --git a/mm/swap.c b/mm/swap.c index 5cc44f0de9..5696ddebbd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" @@ -73,11 +74,39 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { - if (folio_test_lru(folio)) { - folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); - lruvec_del_folio(*lruvecp, folio); - __folio_clear_lru_flags(folio); + /* + * PG_lru is the "on an LRU list, still holding +nr LRU accounting" + * signal. A folio that Marie's reclaim isolate already claimed has + * PG_lru clear and its Marie counters already wound down + * (marie_account_evict_isolate); only its per-PFN TRACKED byte stays + * set, until the buddy handoff (marie_state_drop_pfn_at_free). Gating + * the Marie del path on PG_lru -- not TRACKED alone -- keeps such an + * isolated folio from being evict-accounted a SECOND time here: that + * double count (a TRACKED-only gate over-firing on the isolate path) + * drove marie_nr_folios and the per-mlv scan counters negative, which + * in turn fed a runaway reclaim scan. 
Legacy del is already + * PG_lru-gated, so an off-LRU folio was always a no-op here anyway. + * + * For an on-LRU folio, TRACKED then selects Marie del over legacy + * del. Both debit mz->lru_zone_size now (marie_update_lru_size is + * unified with legacy update_lru_size), so the choice is about the + * LIST, not the count: a TRACKED folio sits on a Marie self-loop, so + * legacy lruvec_del_folio's list_del would corrupt it -- it must go + * through lru_marie_release_folio, which unlinks the self-loop and + * debits mz. See lru_marie_release_folio's contract in + * . + */ + if (!folio_test_lru(folio)) + return; +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) { + lru_marie_release_folio(folio, lruvecp, flagsp); + return; } +#endif + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); + __folio_clear_lru_flags(folio); } /* @@ -171,6 +200,27 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + /* + * lru_add's move_fn routes through lruvec_add_folio -> + * lru_marie_add_folio, which (on success) installs the folio + * into Marie and sets PG_lru itself. Marie's reclaim isolate + * path claims folios with a lock-free folio_test_clear_lru and + * does NOT hold lru_lock, so it can clear PG_lru in the window + * between the install above and this trailing folio_set_lru. + * Re-setting PG_lru here would then stamp PG_lru back onto a + * folio the isolate path already owns and is about to free, + * tripping "Bad page state |lru|" at free_unref_folios. When + * the folio is Marie-tracked, install already published PG_lru; + * skip the redundant (and racy) re-set. The non-lru_add move_fns + * (rotate/activate/deactivate/lazyfree) are gated away from + * Marie folios at their swap.c entry points, so they never + * reach here for a tracked folio. 
+ */ + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) + continue; +#endif + folio_set_lru(folio); } @@ -215,6 +265,44 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) if (folio_test_unevictable(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* + * This rotate-batch move_fn can run in hardirq: the lru_move_tail + * batch is flushed from folio_end_writeback() in the block-completion + * IRQ (e.g. nvme_irq -> blk_mq_end_request_batch). Marie's + * lruvec_del_folio / lruvec_add_folio_tail hooks must not run there: + * they assert !in_hardirq(), and lru_marie_add_folio() would ADOPT the + * folio into Marie (which never credits mz->lru_zone_size) right after + * the legacy del already did mz -nr, underflowing mz->lru_zone_size. + * + * Under Marie, handle the rotate without those hooks: + * - a tracked folio does not sit on the legacy list and ages by gen + * rotation, so rotate-to-tail is a no-op -- skip it; + * - a non-tracked folio is on the legacy lruvec list (mz-accounted), + * so rotate it with pure legacy list ops (this mirrors + * lruvec_del_folio + lruvec_add_folio_tail for an evictable, + * non-tracked folio, minus the Marie/lru_gen hooks). 
+ */ + if (lru_marie_enabled()) { + long nr_pages = folio_nr_pages(folio); + int zid = folio_zonenum(folio); + enum lru_list lru; + + if (lru_marie_test_tracked(folio)) + return; + + lru = folio_lru_list(folio); + list_del(&folio->lru); + update_lru_size(lruvec, lru, zid, -nr_pages); + folio_clear_active(folio); + lru = folio_lru_list(folio); + update_lru_size(lruvec, lru, zid, nr_pages); + list_add_tail(&folio->lru, &lruvec->lists[lru]); + __count_vm_events(PGROTATED, nr_pages); + return; + } +#endif + lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); @@ -234,6 +322,14 @@ void folio_rotate_reclaimable(struct folio *folio) folio_test_unevictable(folio) || !folio_test_lru(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Marie folios bypass legacy LRU lists; apply the rotate on the + * per-PFN state (demote toward prompt reclaim) instead of queueing + * the legacy lru_move_tail batch. See lru_marie_rotate(). */ + if (lru_marie_rotate(folio)) + return; +#endif + folio_batch_add_and_move(folio, lru_move_tail); } @@ -304,6 +400,12 @@ void lru_note_cost_refault(struct folio *folio) folio_nr_pages(folio), 0); } +/* + * lru_marie_orphan_add() (the non-adopting legacy add for untracked orphans + * inside a del+add move_fn) lives in mm/lru_marie/core.c so vmscan.c's reclaim + * putback can share it; declared in . + */ + static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); @@ -311,10 +413,24 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio) if (folio_test_active(folio) || folio_test_unevictable(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* + * Tracked Marie folios are never on legacy lists (the swap.c entry + * gates divert them); guard defensively, and route the untracked + * orphan's re-add away from lru_marie_add_folio()'s adopt path. 
+ */ + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) + return; +#endif lruvec_del_folio(lruvec, folio); folio_set_active(folio); - lruvec_add_folio(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + lru_marie_orphan_add(lruvec, folio, false); + else +#endif + lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); @@ -336,6 +452,13 @@ void folio_activate(struct folio *folio) !folio_test_lru(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Marie folios bypass legacy LRU lists; apply the promote on the + * per-PFN state instead of queueing the legacy lru_activate batch. */ + if (lru_marie_activate(folio)) + return; +#endif + folio_batch_add_and_move(folio, lru_activate); } @@ -351,6 +474,15 @@ void folio_activate(struct folio *folio) if (!folio_test_clear_lru(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Re-publish the PG_lru this path speculatively cleared above; the + * promote happened on the per-PFN state in lru_marie_activate(). */ + if (lru_marie_activate(folio)) { + folio_set_lru(folio); + return; + } +#endif + lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); lruvec_unlock_irq(lruvec); @@ -466,6 +598,32 @@ void folio_mark_accessed(struct folio *folio) lru_gen_inc_refs(folio); return; } +#ifdef CONFIG_LRU_MARIE + /* + * Marie: do NOT feed the explicit access signal into the tier. + * + * Tier has no decay -- marie_state_inc_tier only ever raises it + * (reset to 0 happens solely on the saturate->head promotion), and + * survivor re-publish preserves it (target_tier = max(prev, w)). So + * a per-access bump, which fires on essentially every read / pagecache + * hit / fault (filemap_read, __filemap_get_folio, shmem, gup, ...), + * is unbounded and monotonic: folios pin at hot_votes >= 1 in + * folio_check_references (permanent KEEP) or churn on the head-gen + * promotion treadmill. 
Under memory pressure that starves reclaim -- + * file stalls above the clean_min_ratio floor and anon is never + * swapped -- and the machine OOMs with swap free. + * + * Marie's hotness instead comes from the (rate-limited, kswapd-driven) + * walker young-bit scan plus the rmap young bits read at reclaim time + * in folio_check_references; both self-pace and clear, so they do not + * accumulate. Routing folio_mark_accessed here can be revisited only + * once tier gains a decay/aging mechanism. lru_marie_mark_accessed() + * is kept (dormant) for that future. Return without falling through to + * the legacy activate path, which would re-tier via lruvec_add_folio. + */ + if (lru_marie_enabled()) + return; +#endif if (!folio_test_referenced(folio)) { folio_set_referenced(folio); @@ -510,9 +668,24 @@ void folio_add_lru(struct folio *folio) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ +#ifdef CONFIG_LRU_MARIE + /* + * Marie bypass: Marie tracks folios via per-PFN state bytes, + * not on legacy/MGLRU lists, and does not use PG_active. If + * we set it here, the folio enters Marie with PG_active=1; + * later marie_state_shrink_lruvec -> shrink_folio_list trips + * VM_BUG_ON_FOLIO(folio_test_active(folio), folio) in + * mm/vmscan.c. Skip the MGLRU fault hint when Marie owns + * the LRU. (See also defensive clear in lru_marie_add_folio.) 
+ */ + if (!lru_marie_enabled() && lru_gen_enabled() && !folio_test_unevictable(folio) && + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) + folio_set_active(folio); +#else if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); +#endif folio_batch_add_and_move(folio, lru_add); } @@ -569,6 +742,11 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) if (folio_mapped(folio)) return; +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) + return; +#endif + lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); @@ -580,14 +758,24 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) * race window is _really_ small and it's not a critical * problem. */ - lruvec_add_folio(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + lru_marie_orphan_add(lruvec, folio, false); + else +#endif + lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. 
*/ - lruvec_add_folio_tail(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + lru_marie_orphan_add(lruvec, folio, true); + else +#endif + lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } @@ -605,10 +793,20 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) + return; +#endif + lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); - lruvec_add_folio(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + lru_marie_orphan_add(lruvec, folio, false); + else +#endif + lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); @@ -622,6 +820,11 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled() && lru_marie_test_tracked(folio)) + return; +#endif + lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) @@ -634,7 +837,12 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) * anonymous folios */ folio_clear_swapbacked(folio); - lruvec_add_folio(lruvec, folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) + lru_marie_orphan_add(lruvec, folio, false); + else +#endif + lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); @@ -698,6 +906,13 @@ void deactivate_file_folio(struct folio *folio) if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Marie folios bypass legacy LRU lists; apply the demote on the + * per-PFN state instead of queueing the legacy batch. 
*/ + if (lru_marie_deactivate(folio)) + return; +#endif + folio_batch_add_and_move(folio, lru_deactivate_file); } @@ -717,6 +932,13 @@ void folio_deactivate(struct folio *folio) if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Marie folios bypass legacy LRU lists; apply the demote on the + * per-PFN state instead of queueing the legacy batch. */ + if (lru_marie_deactivate(folio)) + return; +#endif + folio_batch_add_and_move(folio, lru_deactivate); } @@ -734,6 +956,14 @@ void folio_mark_lazyfree(struct folio *folio) folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; +#ifdef CONFIG_LRU_MARIE + /* Marie folios bypass legacy LRU lists; lru_marie_lazyfree() clears + * PG_swapbacked synchronously (MADV_FREE: free-without-writeback) and + * demotes on the per-PFN state instead of queueing the legacy batch. */ + if (lru_marie_lazyfree(folio)) + return; +#endif + folio_batch_add_and_move(folio, lru_lazyfree); } diff --git a/mm/swap.h b/mm/swap.h index a77016f242..7f499ac84c 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -227,6 +227,10 @@ static inline void swap_read_unplug(struct swap_iocb *plug) void swap_write_unplug(struct swap_iocb *sio); int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); +#ifdef CONFIG_LRU_MARIE +/* CONFIG_SWAP is implied by this branch (mm/swap.h's CONFIG_SWAP gate). */ +int kcompmari(void *p); +#endif /* linux/mm/swap_state.c */ extern struct address_space swap_space __read_mostly; diff --git a/mm/vmscan.c b/mm/vmscan.c index bd1b1aa125..3f09c89868 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,36 @@ struct scan_control { struct reclaim_state reclaim_state; }; +#ifdef CONFIG_LRU_MARIE +/* + * Accessors for out-of-tree reclaim readers (mm/lru_marie). 
Declared + * in mm/internal.h with the struct kept private here. Trivial + * field reads + one helper for the "target reached" comparison and + * one for the reclaimed-count update. EXPORT_SYMBOL_GPL is + * unnecessary -- Marie is built into vmlinux when CONFIG_LRU_MARIE + * is on, never a module. + */ +int sc_priority(const struct scan_control *sc) +{ + return sc->priority; +} + +int sc_reclaim_idx(const struct scan_control *sc) +{ + return sc->reclaim_idx; +} + +bool sc_reclaim_target_reached(const struct scan_control *sc) +{ + return sc->nr_reclaimed >= sc->nr_to_reclaim; +} + +void sc_add_reclaimed(struct scan_control *sc, unsigned long nr) +{ + sc->nr_reclaimed += nr; +} +#endif + #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ @@ -197,8 +228,19 @@ struct scan_control { /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. + * + * CONFIG_LRU_MARIE=y: default 1. Under Marie's per-PFN reclaim + * driver this maps to MARIE_PICK_FILE_THEN_ANON -- anon is fully + * protected until clean_min_ratio is breached, matching the + * ZRAM-era assumption that file pagecache is cheaper to refault + * than anon is to swap. See the storage-tier rationale at the top + * of mm/lru_marie/core.c. */ +#ifdef CONFIG_LRU_MARIE +int vm_swappiness = 1; +#else int vm_swappiness = 60; +#endif #ifdef CONFIG_MEMCG @@ -397,10 +439,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone struct zone *zone; for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { - if (!mem_cgroup_disabled()) + if (!mem_cgroup_disabled()) { + /* + * mz->lru_zone_size accounts Marie-tracked folios too + * (marie_update_lru_size credits/debits mz like legacy + * update_lru_size), so no Marie-specific summing here. 
+ */ size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); - else + } else { size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } } return size; } @@ -458,6 +506,45 @@ static int reclaimer_offset(struct scan_control *sc) return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } +#ifdef CONFIG_LRU_MARIE +/* + * Wrapper used by mm/lru_marie, which sees @sc but not the static + * reclaimer_offset() above. 7.0's reclaimer_offset() already takes + * @sc, so just forward it. + */ +int vmscan_reclaimer_offset(struct scan_control *sc) +{ + return reclaimer_offset(sc); +} + +/* + * cgroup_reclaim() is static above and struct scan_control is private + * to vmscan.c. Marie needs the same predicate to gate its PGSCAN_* / + * PGSTEAL_* event accounting (cgroup-scoped reclaim must not bump the + * global vm events). Expose it as an sc_* accessor matching the + * pattern already used for sc_priority / sc_reclaim_idx etc. + */ +bool sc_cgroup_reclaim(const struct scan_control *sc) +{ + return cgroup_reclaim((struct scan_control *)sc); +} + +/* + * can_reclaim_anon_pages() is static above. Marie's pick driver needs + * the same predicate: when anon cannot be reclaimed at all (no free + * swap slots, cgroup swap limit hit, no demotion target) the + * swappiness/bias controller is meaningless -- every ANON pick + * reclaims nothing -- so Marie must force FILE reclaim, mirroring + * get_scan_count()'s "!can_reclaim_anon_pages -> SCAN_FILE" forcing. + * Expose it as a vmscan_* wrapper; struct scan_control stays private. + */ +bool vmscan_can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, + struct scan_control *sc) +{ + return can_reclaim_anon_pages(memcg, nid, sc); +} +#endif + /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. 
We need to propagate that into the address_space for a subsequent @@ -885,7 +972,11 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; +#ifdef CONFIG_LRU_MARIE + if (!lru_marie_enabled() && lru_gen_enabled() && !lru_gen_switching()) { +#else if (lru_gen_enabled() && !lru_gen_switching()) { +#endif if (!referenced_ptes) return FOLIOREF_RECLAIM; @@ -894,6 +985,30 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_folio = folio_test_clear_referenced(folio); +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) { + unsigned int tier = folio_marie_get_tier(folio); + int hot_votes; + /* One vote each: tier > 0, any referencing PTE, referenced flag (cleared above). */ + hot_votes = (tier > 0) + (referenced_ptes > 0) + !!referenced_folio; + + if (hot_votes >= 2 || referenced_ptes > 1) + return FOLIOREF_ACTIVATE; + + if (referenced_ptes > 0 && (vm_flags & VM_EXEC) && + folio_is_file_lru(folio)) + return FOLIOREF_ACTIVATE; + /* The only vote is the referenced flag, on a file folio. */ + if (hot_votes == 1 && referenced_folio && folio_is_file_lru(folio)) + return FOLIOREF_RECLAIM_CLEAN; + /* Any remaining single vote: keep. */ + if (hot_votes >= 1) + return FOLIOREF_KEEP; + + return FOLIOREF_RECLAIM; + } +#endif + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1053,9 +1168,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) } /* - * shrink_folio_list() returns the number of reclaimed pages + * shrink_folio_list() returns the number of reclaimed pages. + * + * Exposed via mm/internal.h so that mm/lru_marie can drive its own + * isolate->shrink->putback loop without duplicating the per-folio + * reclaim machinery.
+ */ -static unsigned int shrink_folio_list(struct list_head *folio_list, +unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references, struct mem_cgroup *memcg) @@ -1916,7 +2035,34 @@ static unsigned int move_folios_to_lru(struct list_head *list) continue; } - lruvec_add_folio(lruvec, folio); + /* + * All folios were isolated from the same lruvec (and isolation + * inhibits memcg migration). + */ + VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); +#ifdef CONFIG_LRU_MARIE + /* + * Legacy reclaim putback. Under Marie this is reached for the + * untracked orphans that legacy shrink_{in,}active_list isolates + * off legacy lists (e.g. workingset-refault activations routed + * through folio_activate). Routing their re-add through + * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT them + * into Marie -- the exact adopt asymmetry fixed for swap.c's + * move_fns. Mirror that fix here: a tracked folio must never + * have been on a legacy list (WARN; hand it back to Marie, whose + * install early-out re-asserts ownership), and an untracked + * orphan gets a pure non-adopting legacy add. + */ + if (lru_marie_enabled()) { + if (unlikely(lru_marie_test_tracked(folio))) { + VM_WARN_ON_ONCE_FOLIO(1, folio); + lruvec_add_folio(lruvec, folio); + } else { + lru_marie_orphan_add(lruvec, folio, false); + } + } else /* pairs with the lruvec_add_folio() after #endif */ +#endif + lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; if (folio_test_active(folio)) @@ -5307,6 +5453,55 @@ static bool drain_evictable(struct lruvec *lruvec) return true; } +/* + * lru_gen_fill_lruvec - hand off legacy LRU residue to MGLRU. + * + * Move every folio currently on lruvec->lists[lru] into lrugen via + * the canonical lru_gen_add_folio path.
Symmetric counterpart to + * lru_gen_drain_lruvec below; exported so external LRU drivers + * (mm/lru_marie) can call it after their own drain pass to keep + * MGLRU's state_is_valid invariant ("lrugen enabled => legacy + * lists empty") intact across enable/disable cycles of the other + * driver. + * + * Caller must hold @lruvec->lru_lock with IRQs disabled. The + * helper internally releases and reacquires it across the cond_resched + * between MAX_LRU_BATCH-sized passes, matching the locking pattern + * lru_gen_change_state itself uses. + */ +void lru_gen_fill_lruvec(struct lruvec *lruvec) +{ + while (!fill_evictable(lruvec)) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } +} +EXPORT_SYMBOL_GPL(lru_gen_fill_lruvec); + +/* + * lru_gen_drain_lruvec - evacuate lrugen via the canonical add path. + * + * Inverse of lru_gen_fill_lruvec: empty lrugen by removing every + * folio via lru_gen_del_folio and re-adding via lruvec_add_folio. + * With another LRU driver's gate on (e.g. Marie), the re-add routes + * through that driver's install path -- which both saves Marie from + * reimplementing MGLRU's accounting and gives the folio Marie's + * canonical per-PFN install for free. With no other driver active + * the folios fall through to lruvec->lists[lru]. + * + * Caller must hold @lruvec->lru_lock, IRQs off; dropped/retaken around cond_resched().
+ */ +void lru_gen_drain_lruvec(struct lruvec *lruvec) +{ + while (!drain_evictable(lruvec)) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } +} +EXPORT_SYMBOL_GPL(lru_gen_drain_lruvec); + static void lru_gen_change_state(bool enabled) { static DEFINE_MUTEX(state_mutex); @@ -5827,7 +6022,15 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled = lru_gen_enabled(); + /* + * lrugen->enabled mirrors the raw MGLRU core key, not the + * Marie-masked lru_gen_enabled() view: it must stay true when MGLRU + * is configured-on even while Marie masks MGLRU off, so the + * Marie-disable ownership handoff (lru_gen_fill_lruvec -> + * fill_evictable -> lru_gen_add_folio, which bails on !lrugen->enabled) + * can migrate folios back onto lrugen. + */ + lrugen->enabled = lru_gen_core_enabled(); for (i = 0; i <= MIN_NR_GENS + 1; i++) lrugen->timestamps[i] = jiffies; @@ -5927,7 +6130,23 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; +#ifdef CONFIG_LRU_MARIE + unsigned int marie_drain_mask = MARIE_DRAIN_ANON | MARIE_DRAIN_FILE; +#endif +#ifdef CONFIG_LRU_MARIE + if (lru_marie_enabled()) { + marie_drain_mask = lru_marie_shrink_lruvec(lruvec, sc); + /* + * Fall through to the legacy reclaim path below to drain orphan + * folios (failed Marie install, drain/reparent handoffs) that + * landed on lruvec->lists; the else below skips the MGLRU branch. The + * drain is constrained by marie_drain_mask below so it touches + * only the type(s) Marie scanned. Common case: the lists are + * empty and this is a cheap no-op.
+ */ + } else /* pairs with the MGLRU check after #endif */ +#endif if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); @@ -5938,6 +6157,29 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) get_scan_count(lruvec, sc, nr); +#ifdef CONFIG_LRU_MARIE + /* + * Constrain the legacy orphan drain to the type(s) Marie's pick driver + * actually scanned in this call (marie_drain_mask). Stock get_scan_count's + * policy (SCAN_EQUAL at sc->priority==0, SCAN_ANON on file_is_tiny) + * ignores Marie's swappiness / clean_min_ratio / ANON_STRICT decisions + * and would otherwise cut the protected type behind the driver's back + * (e.g. evicting file at vm.swappiness=200, or swapping at swappiness=0). + * Zero the nr[] of any type Marie did not scan. Marie-only; the + * legacy/MGLRU nr[] is left byte-identical. + */ + if (lru_marie_enabled()) { + if (!(marie_drain_mask & MARIE_DRAIN_FILE)) { + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + } + if (!(marie_drain_mask & MARIE_DRAIN_ANON)) { + nr[LRU_ACTIVE_ANON] = 0; + nr[LRU_INACTIVE_ANON] = 0; + } + } +#endif + /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -6193,14 +6435,32 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { +#ifdef CONFIG_LRU_MARIE + /* + * MGLRU's root-reclaim shortcut bypasses shrink_node_memcgs entirely, + * which is where shrink_lruvec -- and therefore Marie -- gets invoked. + * When lru_marie_enabled() that bypass would leave kswapd walking empty + * MGLRU gens (since folios live in Marie gens) and never touching Marie + * at all. Gate the shortcut on !lru_marie_enabled() so kswapd takes the + * standard shrink_node_memcgs path under Marie.
+ */ + if (!lru_marie_enabled() && + (lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); if (!lru_gen_switching()) return; + } +#else + if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { + memset(&sc->nr, 0, sizeof(sc->nr)); + lru_gen_shrink_node(pgdat, sc); + if (!lru_gen_switching()) + return; } +#endif target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -6469,6 +6729,17 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; +#ifdef CONFIG_LRU_MARIE + /* + * Marie has no equivalent of legacy refault tracking yet, and the + * legacy WORKINGSET_* counters don't reflect Marie state -- skip the + * snapshot to avoid feeding MGLRU/legacy-tuned heuristics with stale + * data. + */ + if (lru_marie_enabled()) + return; +#endif + if (lru_gen_enabled() && !lru_gen_switching()) return; @@ -6859,6 +7130,21 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) struct mem_cgroup *memcg; struct lruvec *lruvec; +#ifdef CONFIG_LRU_MARIE + /* + * Marie: drive proactive aging from kswapd's pre-reclaim hook so the + * gen ring has accurate hot/cold ordering by the time direct reclaim + * picks the tail. lru_marie_age_node() walks running tasks' PTEs + * (rate-limited internally); the early return then skips the legacy + * active-list deactivation below -- legacy lists only hold + * mempool-failure orphans under Marie and aging them is not worthwhile.
+ */ + if (lru_marie_enabled()) { + lru_marie_age_node(pgdat, sc); + return; + } +#endif + if (lru_gen_enabled() || lru_gen_switching()) { lru_gen_age_node(pgdat, sc); @@ -7632,6 +7918,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) + int ret; +#endif pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { @@ -7645,7 +7934,32 @@ void __meminit kswapd_run(int nid) } else { wake_up_process(pgdat->kswapd); } +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) + ret = kfifo_alloc(&pgdat->kcompmari_fifo, + KCOMPMARI_FIFO_SIZE * sizeof(struct folio *), + GFP_KERNEL); + if (ret) { + pr_err("%s: fail to kfifo_alloc\n", __func__); + goto out; + } + /* fifo is allocated from here on; freed again below if the thread cannot be created */ + pr_info("kcompmari (forked from kcompressd-unofficial by Masahito Suzuki, originally Kcompressd by Qun-Wei Lin from MediaTek)\n"); + spin_lock_init(&pgdat->kcompmari_fifo_lock); + pgdat->kcompmari = kthread_create_on_node(kcompmari, pgdat, nid, + "kcompmari%d", nid); + if (IS_ERR(pgdat->kcompmari)) { + pr_err("Failed to start kcompmari on node %d,ret=%ld\n", + nid, PTR_ERR(pgdat->kcompmari)); + pgdat->kcompmari = NULL; + kfifo_free(&pgdat->kcompmari_fifo); + } else { + wake_up_process(pgdat->kcompmari); + } +#endif } +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) +out: +#endif pgdat_kswapd_unlock(pgdat); } @@ -7664,16 +7978,60 @@ void __meminit kswapd_stop(int nid) kthread_stop(kswapd); pgdat->kswapd = NULL; } +#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP) + if (pgdat->kcompmari) { + kthread_stop(pgdat->kcompmari); + pgdat->kcompmari = NULL; + kfifo_free(&pgdat->kcompmari_fifo); + } +#endif pgdat_kswapd_unlock(pgdat); } +#ifdef CONFIG_LRU_MARIE +/* + * vm.swappiness write notifier for the Marie LRU controller.
Calls + * the default proc_dointvec_minmax to perform range-checked storage + * into vm_swappiness, then, on a successful write, notifies Marie so + * it can reset every per-lruvec swap_bias counter. The notification + * is skipped on read or on validation failure -- only a successful + * write triggers the controller reset. + * + * Note: we always notify on a successful write even when the new + * value equals the old one. The cost is one xa walk; the alternative + * (snapshot+compare) would require atomicity guarantees that + * proc_dointvec_minmax does not provide, and gives no practical + * benefit since reset-to-zero is idempotent. + */ +static int marie_swappiness_sysctl_handler(const struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (write && !ret) + lru_marie_swappiness_changed(); + return ret; +} +#endif + static const struct ctl_table vmscan_sysctl_table[] = { { .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, +#ifdef CONFIG_LRU_MARIE + /* + * Marie wraps the default minmax handler so that a sysctl + * write resets every per-lruvec swap_bias counter to zero. + * See mm/lru_marie/state.c::marie_swap_bias_update for the + * controller this notification clears. + */ + .proc_handler = marie_swappiness_sysctl_handler, +#else .proc_handler = proc_dointvec_minmax, +#endif .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, -- 2.34.1