From 6078869bd7dbdacb076d36a8fc8e3ee592131413 Mon Sep 17 00:00:00 2001
From: Masahito S <firelzrd@gmail.com>
Date: Thu, 11 Jun 2026 09:08:40 +0900
Subject: [PATCH] linux7.1-rc5-lru_marie-0.3.5

---
 include/linux/lru_marie.h      |  494 ++++++
 include/linux/mm_inline.h      |  104 +-
 include/linux/mmzone.h         |   30 +
 include/linux/swap.h           |   18 +
 mm/Kconfig                     |   30 +
 mm/Makefile                    |    1 +
 mm/huge_memory.c               |   26 +-
 mm/internal.h                  |   55 +
 mm/lru_marie/Makefile          |   19 +
 mm/lru_marie/account.h         |  176 ++
 mm/lru_marie/bitmap.c          |  166 ++
 mm/lru_marie/bitmap.h          |  228 +++
 mm/lru_marie/core.c            | 1982 +++++++++++++++++++++++
 mm/lru_marie/drain_scope.h     |   87 +
 mm/lru_marie/pfn_install.h     |   92 ++
 mm/lru_marie/prefetch.h        |  111 ++
 mm/lru_marie/simd.h            |   99 ++
 mm/lru_marie/simd_generic.c    |   46 +
 mm/lru_marie/simd_x86.c        |  167 ++
 mm/lru_marie/simd_x86_avx2.S   |  214 +++
 mm/lru_marie/simd_x86_avx512.S |  199 +++
 mm/lru_marie/simd_x86_sse2.S   |  215 +++
 mm/lru_marie/state.c           | 2745 ++++++++++++++++++++++++++++++++
 mm/lru_marie/state.h           | 1335 ++++++++++++++++
 mm/lru_marie/version.h         |   22 +
 mm/lru_marie/walker.c          |  961 +++++++++++
 mm/memcontrol-v1.c             |   13 +
 mm/memcontrol.c                |   35 +
 mm/mm_init.c                   |    3 +
 mm/page_alloc.c                |   76 +
 mm/page_io.c                   |  187 +++
 mm/rmap.c                      |    6 +
 mm/swap.c                      |  248 ++-
 mm/swap.h                      |    4 +
 mm/vmscan.c                    |  372 ++++-
 35 files changed, 10546 insertions(+), 20 deletions(-)
 create mode 100644 include/linux/lru_marie.h
 create mode 100644 mm/lru_marie/Makefile
 create mode 100644 mm/lru_marie/account.h
 create mode 100644 mm/lru_marie/bitmap.c
 create mode 100644 mm/lru_marie/bitmap.h
 create mode 100644 mm/lru_marie/core.c
 create mode 100644 mm/lru_marie/drain_scope.h
 create mode 100644 mm/lru_marie/pfn_install.h
 create mode 100644 mm/lru_marie/prefetch.h
 create mode 100644 mm/lru_marie/simd.h
 create mode 100644 mm/lru_marie/simd_generic.c
 create mode 100644 mm/lru_marie/simd_x86.c
 create mode 100644 mm/lru_marie/simd_x86_avx2.S
 create mode 100644 mm/lru_marie/simd_x86_avx512.S
 create mode 100644 mm/lru_marie/simd_x86_sse2.S
 create mode 100644 mm/lru_marie/state.c
 create mode 100644 mm/lru_marie/state.h
 create mode 100644 mm/lru_marie/version.h
 create mode 100644 mm/lru_marie/walker.c

diff --git a/include/linux/lru_marie.h b/include/linux/lru_marie.h
new file mode 100644
index 0000000000..7ef003b89f
--- /dev/null
+++ b/include/linux/lru_marie.h
@@ -0,0 +1,494 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LRU_MARIE_H
+#define _LINUX_LRU_MARIE_H
+
+/*
+ * Marie LRU — public API.
+ *
+ * Marie represents each folio's reclaim state as a single byte in a
+ * flat per-PFN array allocated once, at first enable (normally boot;
+ * see mm/lru_marie/state.h for the byte layout). install / delete /
+ * aging are single byte writes — no allocation in any fault-path
+ * operation, no per-folio linked-list linkage. isolate batches the
+ * array with a persistent cursor and SIMD scan.
+ *
+ * The vmscan core invokes Marie only when lru_marie_enabled() is true;
+ * otherwise the in-tree reclaim paths run unchanged.
+ *
+ * Enabled by default; opt out at boot with `lru_marie=0` on the kernel
+ * command line, or at runtime via /sys/kernel/mm/lru_marie/enabled
+ * (echo 1 / echo 0).
+ *
+ * This header exposes only the thin dispatch surface that mm/vmscan.c,
+ * mm/swap.c, mm/memcontrol.c, etc. need to know about. Everything
+ * else lives inside mm/lru_marie/.
+ */
+
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/types.h>
+
+struct folio;
+struct lruvec;
+struct mem_cgroup;
+struct page_vma_mapped_walk;
+struct pglist_data;
+struct scan_control;
+
+#ifdef CONFIG_LRU_MARIE
+
+DECLARE_STATIC_KEY_TRUE(lru_marie_enabled_key);
+
+/**
+ * lru_marie_enabled - is the Marie reclaim path currently active?
+ *
+ * Inlined static-branch check.  Default is enabled; with jump labels,
+ * static_branch_likely() on a true key is a patched no-op that falls
+ * straight through to the Marie-side code, and becomes a jump only
+ * once the key is disabled. So the MGLRU/Legacy dispatch costs nothing
+ * extra when Marie is enabled (the common case).
+ */
+static inline bool lru_marie_enabled(void)
+{
+	return static_branch_likely(&lru_marie_enabled_key);
+}
+
+DECLARE_STATIC_KEY_FALSE(marie_state_ready_key);
+
+/**
+ * marie_state_ready - has the per-PFN marie_state[] array been allocated?
+ *
+ * Distinct from lru_marie_enabled(): the enable key is the runtime
+ * reclaim-policy toggle (flipped by the sysfs knob and during the
+ * disable/enable transition), whereas this key latches true once
+ * marie_state[] is allocated at first-enable and NEVER flips back --
+ * the array is never freed for the kernel's lifetime.
+ *
+ * The page-free hook (lru_marie_free_page_hook) must gate on THIS key,
+ * not on lru_marie_enabled(): during a disable transition the enable
+ * key is already false while marie_drain walks the bitmaps, but freed
+ * pages still carry stale TRACKED bits that must be wiped at the buddy
+ * handoff so the drain walk never dereferences a re-allocated folio's
+ * poisoned list head. Gating the hook on the enable key would skip
+ * exactly that window.
+ */
+static inline bool marie_state_ready(void)
+{
+	return static_branch_unlikely(&marie_state_ready_key);
+}
+
+/**
+ * lru_marie_mark_accessed - Marie's hot-signal entry point for folio_mark_accessed.
+ *
+ * Bumps @folio's Marie tier in the per-PFN marie_state[] byte. Tier is
+ * the canonical hotness signal in Marie: the walker bumps tier on
+ * young-bit hits, and this helper lets external "user just touched"
+ * callers (folio_mark_accessed) feed the same channel. When tier is at
+ * MARIE_TIER_MAX the helper triggers a synchronous in-place promote
+ * (marie_state_move_to_gen to head_gen at tier 0) inside
+ * marie_state_inc_tier. Calling this from the user access hot path
+ * therefore costs at most one byte write plus a possible single CAS;
+ * no slab alloc, no enqueue.
+ *
+ * Why not folio_set_referenced(): Marie's tier-based gen rotation already
+ * encodes "recently accessed". Setting PG_referenced in addition produced
+ * a double-counting hot signal that the reclaim path had to reconcile,
+ * and the reconciliation rule (any of {PG_referenced, PG_active} treated
+ * as promote-in-place during reclaim) starved kswapd reclaim under
+ * fault-burst workloads.
+ */
+void lru_marie_mark_accessed(struct folio *folio);
+
+/**
+ * folio_marie_get_tier - return @folio's Marie hotness tier (0..3).
+ *
+ * Reads the per-PFN marie_state[folio_pfn(folio)] byte's MARIE_PFN_TIER
+ * field. Returns 0 if Marie is disabled, the PFN is out of range, or
+ * the folio is not Marie-tracked.
+ */
+unsigned int folio_marie_get_tier(const struct folio *folio);
+
+/**
+ * lru_marie_test_tracked - is @folio currently tracked by Marie?
+ *
+ * Reads the per-PFN marie_state[folio_pfn(folio)] byte's TRACKED bit.
+ * Returns false if Marie is disabled, the PFN is out of range, or the
+ * folio is not Marie-tracked.
+ *
+ * Used by mm/swap.c per-cpu folio_batch entry points (rotate / activate
+ * / deactivate / lazyfree) to skip queueing Marie folios: those paths
+ * do legacy lruvec_del_folio + lruvec_add_folio, whose list_del/list_add
+ * assume the folio is on a legacy lruvec list. Marie folios sit on a
+ * self-loop (folio->lru points at itself), not on a legacy list, so a
+ * legacy del/add would corrupt the list. (mz->lru_zone_size is balanced
+ * for Marie folios now -- marie_update_lru_size credits it at install --
+ * so the hazard is list corruption, not count underflow.)
+ */
+bool lru_marie_test_tracked(const struct folio *folio);
+
+/*
+ * Per-cpu folio_batch LRU-op interface. mm/swap.c's folio_activate /
+ * folio_deactivate / deactivate_file_folio / folio_rotate_reclaimable /
+ * folio_mark_lazyfree each call the matching hook below; a true return
+ * means Marie owns the folio and has applied the operation directly on
+ * its per-PFN state, so the caller must NOT queue the folio onto the
+ * legacy per-cpu folio_batch (which assumes legacy-LRU list/mz invariants
+ * Marie folios break). A false return (Marie disabled or folio untracked)
+ * lets the caller fall through to the legacy folio_batch path unchanged.
+ *
+ * This mirrors lru_marie_add_folio / lru_marie_del_folio's bool contract:
+ * the Marie-specific semantics live here in mm/lru_marie/, not as inline
+ * gates scattered across mm/swap.c.
+ *
+ * Marie-state equivalents:
+ *   deactivate / _file -> demote  (move to oldest gen, tier 0) [MADV_COLD]
+ *   lazyfree           -> clear swapbacked + demote            [MADV_FREE]
+ *   activate / rotate  -> no-op (skip the batch only)
+ *
+ * activate / rotate are reclaim-internal hints: Marie already decides
+ * hotness via its tier vote in folio_check_references and orders reclaim
+ * by gen aging, so promoting/rotating here would only fight reclaim (an
+ * activate-promote starves it under all-hot workloads). Only the explicit
+ * user madvise paths (deactivate=MADV_COLD, lazyfree=MADV_FREE) map to a
+ * real Marie-state change.
+ */
+bool lru_marie_activate(struct folio *folio);
+bool lru_marie_deactivate(struct folio *folio);
+bool lru_marie_rotate(struct folio *folio);
+bool lru_marie_lazyfree(struct folio *folio);
+
+/**
+ * lru_marie_free_page_hook - canonical per-PFN state teardown at buddy
+ *                            handoff.
+ *
+ * Invoked from mm/page_alloc.c::free_pages_prepare for every page about
+ * to enter the buddy allocator. When marie_state[pfn] still carries
+ * TRACKED -- which happens whenever the reclaim isolate path
+ * (marie_evict_counters_only) decremented counters but intentionally
+ * preserved the state byte so install_local's TRACKED early-out kept
+ * blocking concurrent installs during shrink_folio_list -- this wipes
+ * the byte, the global (type, gen, tier) bitmap bit, and the
+ * gen_occupied slot in one lock-free pass.
+ *
+ * After this hook the next install at the same PFN starts from a clean
+ * state byte regardless of how quickly the page is re-allocated; no
+ * deferred drop pass is needed at the reclaim caller side.
+ *
+ * Static-branch gated by marie_state_ready(), NOT lru_marie_enabled(),
+ * at the call site, so stale bytes are still wiped mid-disable.
+ */
+void lru_marie_free_page_hook(unsigned long pfn);
+
+/*
+ * Return value of lru_marie_shrink_lruvec(): a mask of the LRU type(s) the
+ * Marie pick driver actually scanned this call. shrink_lruvec's legacy orphan
+ * drain reclaims ONLY these types (it zeroes the nr[] of any unset type), so
+ * it never cuts a type Marie's swappiness / clean_min_ratio / ANON_STRICT
+ * policy protected -- unlike stock get_scan_count, which the legacy drain
+ * would otherwise follow blindly (SCAN_EQUAL at priority 0, etc.).
+ */
+#define MARIE_DRAIN_ANON	0x1u
+#define MARIE_DRAIN_FILE	0x2u
+
+/**
+ * lru_marie_shrink_lruvec - Marie's replacement for shrink_lruvec().
+ *
+ * Called from mm/vmscan.c shrink_lruvec() when lru_marie_enabled() is true.
+ * Updates sc->nr_reclaimed in place. Returns a MARIE_DRAIN_* mask of the
+ * type(s) it scanned so the caller's legacy orphan drain can mirror the pick
+ * policy instead of running stock get_scan_count's.
+ */
+unsigned int lru_marie_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc);
+
+/**
+ * lru_marie_exit_memcg - drop any per-lruvec Marie state held for @memcg.
+ *
+ * Called from mm/memcontrol.c during memcg teardown, alongside the
+ * MGLRU lru_gen_exit_memcg() hook.  Walks the memcg's lruvecs and
+ * drops the Marie state attached to each.  Always safe to call,
+ * even when lru_marie_enabled() is false (a memcg may have been allocated
+ * while Marie was on, then Marie was disabled, then the memcg dies).
+ */
+void lru_marie_exit_memcg(struct mem_cgroup *memcg);
+
+/**
+ * lru_marie_offline_memcg - drain all lruvecs for @memcg at css_offline time.
+ *
+ * Called before css_rstat_exit() frees rstat_cpu, so lru_gen_fill_lruvec()
+ * inside the drain can safely call mod_memcg_lruvec_state().  Also sets
+ * mlv->offline on each lruvec so subsequent lru_marie_add_folio calls
+ * bail out and legacy lists stay empty through css_free.
+ */
+void lru_marie_offline_memcg(struct mem_cgroup *memcg);
+
+/**
+ * lru_marie_memcg_alloc - allocate per-memcg occupancy bitmap for @memcg.
+ *
+ * Cmdline-gated (lru_marie.memcg_bitmap=1). When the gate is off this
+ * is a no-op; the scan path falls back to per-candidate folio_memcg()
+ * lookup. When the gate is on, allocates max_pfn bits keyed by memcg
+ * pointer in an xarray so the scan can AND the gen bitmap with the
+ * memcg bitmap, eliding the per-candidate folio cacheline touch on
+ * cgroup-targeted reclaim.
+ *
+ * Returns 0 on success or no-op, -ENOMEM on alloc failure (caller
+ * may continue -- scan falls back to folio_memcg lookup).
+ *
+ * Called from mm/memcontrol.c mem_cgroup_alloc, GFP_KERNEL context.
+ */
+int lru_marie_memcg_alloc(struct mem_cgroup *memcg);
+
+/**
+ * lru_marie_reparent_lruvec - migrate every Marie-tracked folio from @child_lv
+ *                       into @parent_lv at memcg reparenting.
+ * @child_lv:  child memcg's lruvec (memcg being torn down)
+ * @parent_lv: parent memcg's lruvec (recipient)
+ *
+ * NOTE: currently has NO in-tree caller. Marie handles memcg offline by
+ * draining (lru_marie_offline_memcg -> marie_drain_one_lruvec), not by
+ * reparenting. This is the reparent counterpart, kept for a future
+ * memcg-offline path that hands child folios to @parent instead of
+ * draining them.
+ *
+ * Lock contract for a future caller: it MUST hold both lruvecs' lru_lock
+ * with IRQs disabled, AND must acquire the two lru_locks in a
+ * deterministic global order (e.g. by lruvec address) so two concurrent
+ * reparents cannot deadlock A-B/B-A. This function itself only takes
+ * @child_mlv's per-type locks via the marie_both_mlv guard; the merge is
+ * a per-memcg bitmap OR + atomic counter transfer, with no per-folio
+ * iteration.
+ *
+ * Safe to call when lru_marie_enabled() is false or @child has no mlv yet —
+ * both cases short-circuit cleanly.
+ */
+void lru_marie_reparent_lruvec(struct lruvec *child_lv, struct lruvec *parent_lv);
+
+/**
+ * lru_marie_add_folio - try to register @folio with Marie.
+ * @lruvec:     the lruvec @folio is being added to
+ * @folio:      the folio
+ * @reclaiming: caller hint, unused at this stage
+ *
+ * Returns true if Marie took ownership of the folio (and the caller must
+ * skip the legacy lruvec list_add) — false if Marie declined (gate off,
+ * lruvec state unavailable, or allocation failed) and the caller should
+ * fall through to the existing MGLRU / Legacy path.
+ */
+bool lru_marie_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming);
+
+/**
+ * lru_marie_orphan_add - pure legacy LRU add for an untracked orphan.
+ * @lruvec: the lruvec @folio is being added to
+ * @folio:  an untracked (non-Marie) folio
+ * @tail:   add to the list tail rather than the head
+ *
+ * A del+add move_fn (swap.c: lru_activate / lru_deactivate{,_file} /
+ * lru_lazyfree) and the legacy reclaim putback (vmscan.c:
+ * move_folios_to_lru) run lruvec_del_folio() -- legacy del, mz -nr for an
+ * untracked folio -- and then add the folio back. Routing that add through
+ * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT the orphan into
+ * Marie; the install credits Marie's own bucket but the original -nr was a
+ * legacy debit, so the legacy mz->lru_zone_size drifts and a later del
+ * underflows ("marie underflow-del"). This does the +nr legacy leg only,
+ * never adopting. Callers MUST first bail on lru_marie_test_tracked()
+ * folios -- a tracked folio is Marie-owned and must never touch a legacy
+ * list.
+ */
+void lru_marie_orphan_add(struct lruvec *lruvec, struct folio *folio, bool tail);
+
+
+/**
+ * lru_marie_del_folio - try to remove @folio from Marie.
+ * @lruvec:     the lruvec @folio is being removed from
+ * @folio:      the folio
+ * @reclaiming: caller hint, unused at this stage
+ *
+ * Returns true iff @folio was tracked by Marie and has now been removed.
+ * Returns false if @folio was on the legacy LRU instead, in which case
+ * the caller continues with the legacy delete path.
+ */
+bool lru_marie_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming);
+
+/**
+ * lru_marie_release_folio - outer-level release for a TRACKED folio
+ *  reaching refcount 0 (called from __page_cache_release).
+ * @folio:    the folio being released
+ * @lruvecp:  caller's lruvec batch pointer (may be NULL or hold a lock)
+ * @flagsp:   caller's irqsave flags slot
+ *
+ * The dispatch contract: when Marie is enabled, upstream callers MUST
+ * gate by lru_marie_test_tracked() and call this helper for TRACKED
+ * folios INSTEAD OF the legacy folio_test_lru / lruvec_del_folio path.
+ * A TRACKED folio sits on a Marie self-loop, not on a legacy lruvec
+ * list, so legacy lruvec_del_folio's list_del would corrupt it; this
+ * helper unlinks the self-loop and debits mz instead. TRACKED is the
+ * single source of truth.
+ *
+ * Internally: relocks @lruvecp to @folio's lruvec with IRQs disabled,
+ * re-tests TRACKED under the lock, runs Marie's del (which debits mz
+ * itself), and clears PG_lru. If TRACKED was cleared between the
+ * caller's outer test and our lock acquisition (race with drain or
+ * evict), falls back to the legacy del so accounting stays coherent.
+ * Leaves the lock held in *@lruvecp for the caller's batch context.
+ */
+void lru_marie_release_folio(struct folio *folio, struct lruvec **lruvecp,
+			     unsigned long *flagsp);
+
+/**
+ * lru_marie_split_folio - install a freshly-split tail folio under Marie.
+ * @lruvec:     head folio's lruvec (caller holds lru_lock)
+ * @head:       THP head folio currently tracked by Marie
+ * @new_folio:  tail folio created by __split_huge_page
+ *
+ * Mirror of mm/huge_memory.c::lru_add_split_folio's
+ * "list_add_tail(&new_folio->lru, &folio->lru)" for the case where
+ * @head is Marie-tracked. Publishes the per-PFN state byte for the new
+ * folio so the dispatcher routes its eventual del through Marie;
+ * otherwise the new tail would be untracked, dispatcher del would fall
+ * to legacy update_lru_size, mlv->types[].nr_pages would not decrement,
+ * and reclaim heuristics would drift.
+ *
+ * Caller MUST verify lru_marie_enabled() && lru_marie_test_tracked(head)
+ * before calling. Caller holds @lruvec->lru_lock; the per-type lock is
+ * taken internally.
+ *
+ * Caller is responsible for folio_set_lru(@new_folio) AFTER this
+ * returns — the "state byte published before PG_lru" rule is preserved
+ * by the call ordering.
+ */
+void lru_marie_split_folio(struct lruvec *lruvec, struct folio *head,
+			 struct folio *new_folio);
+
+/**
+ * lru_marie_look_around - opportunistic PMD scan during rmap reference check.
+ * @pvmw: page-vma-mapped walk supplied by folio_referenced_one()
+ * @nr:   number of consecutive PTEs of the target folio at pvmw->address
+ *
+ * Called from rmap.c::folio_referenced_one() in the Marie branch with
+ * pvmw->ptl already held.  Clears the target folio's young bit (returning
+ * its previous state) and, while the PTL is hot, scans up to
+ * MARIE_LOOK_AROUND_BATCH PTEs of the surrounding PMD, clearing young bits
+ * found there too.  This batches what would otherwise be one rmap walk
+ * per neighbouring folio and improves the accuracy of subsequent
+ * folio_referenced() calls.  Returns true iff the target's own PTE(s)
+ * were young.
+ */
+bool lru_marie_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr);
+
+/**
+ * lru_marie_age_node - kswapd pre-reclaim aging hook for Marie.
+ * @pgdat: kswapd's pgdat
+ * @sc:    kswapd's scan_control
+ *
+ * Called from kswapd_age_node() when Marie owns the LRU.  Drives the
+ * proactive PTE walker (marie_walk_pgdat internally) so per-PFN tier
+ * encoding has accurate hot/cold ordering by the time direct reclaim
+ * picks the oldest gen. Internally rate-limited; safe to call from any
+ * kswapd cycle.
+ */
+void lru_marie_age_node(struct pglist_data *pgdat, struct scan_control *sc);
+
+/**
+ * lru_marie_swappiness_changed - notify Marie that a swappiness value
+ *                                has been written via sysctl or memcg.
+ *
+ * Resets every per-lruvec swap_bias counter to zero so the next
+ * reclaim cycle starts from a neutral state under the new swappiness.
+ * Stale bias accumulated under the previous value would otherwise
+ * steer the first several picks in the wrong direction, especially
+ * across transitions into or out of the special-value range
+ * {0, 1, MAX_SWAPPINESS}.
+ *
+ * Walks every lruvec via mem_cgroup_iter unconditionally (no per-memcg
+ * filtering): the extra resets on unaffected lruvecs are harmless under
+ * a controller whose only state is the bias counter, and the
+ * alternative -- per memcg / per cgroup-version classification -- adds
+ * cost without changing observable behaviour. Sysctl writes are
+ * human-rate and the walk is O(N_lruvecs * one atomic64_set).
+ *
+ * Safe to call from sysctl proc_handler context (BKL-free, may sleep
+ * on xa_lock contention but never under the writer's caller locks).
+ */
+void lru_marie_swappiness_changed(void);
+
+/*
+ * Runtime-tunable knobs exposed via /sys/kernel/mm/lru_marie/.
+ * Read with READ_ONCE; sysfs store writes with WRITE_ONCE. Hot-path
+ * snapshots are taken at the top of each loop iteration so a concurrent
+ * write only takes effect on the next pass.
+ */
+extern unsigned long marie_gen_growth_threshold;
+extern unsigned long marie_walker_interval_critical;	/* jiffies */
+extern unsigned long marie_walker_interval_low;		/* jiffies */
+extern unsigned long marie_walker_interval_normal;	/* jiffies */
+extern unsigned long marie_walker_interval_idle;	/* jiffies */
+
+#ifdef CONFIG_SWAP
+/*
+ * kcompmari mode (sysfs /sys/kernel/mm/lru_marie/kcompmari):
+ *   signed -100..+100, default +24.
+ *
+ *     0          — disabled (kthread fan-out off, swap_writeout inline)
+ *     +1..+100   — Marie-gated. |value| is the queue depth at which the
+ *                  producer treats the kfifo as full and falls back to
+ *                  synchronous writeout.
+ *     -1..-100   — force mode. Same queue-length semantics; runs even
+ *                  when Marie is off.
+ *
+ * Default +24 mirrors the queue length kcompressd-unofficial proved
+ * sound under sustained anon pressure. The producer reads vm_kcompmari
+ * directly to derive the queue depth; the on/off and Marie/force gates
+ * are encoded as a pair of static branches so the hot path in
+ * mm/page_io.c::kcompmari_store costs a single jump in the common case:
+ *
+ *   kcompmari_enabled_key — true when vm_kcompmari != 0  (default TRUE)
+ *   kcompmari_force_key   — true when vm_kcompmari  < 0  (default FALSE)
+ *
+ * The Marie-gated branch (positive value) reuses lru_marie_enabled_key
+ * directly, so no extra branch is paid when Marie is on.
+ */
+DECLARE_STATIC_KEY_TRUE(kcompmari_enabled_key);
+DECLARE_STATIC_KEY_FALSE(kcompmari_force_key);
+
+extern int vm_kcompmari;
+
+/**
+ * kcompmari_active - should kswapd off-load this swap-out to kcompmari?
+ *
+ * Default-on: the enabled_key starts TRUE for the +24 default. Setting
+ * vm_kcompmari to 0 flips it off; negative values force-on regardless
+ * of Marie; positive values gate on lru_marie_enabled_key.
+ */
+static inline bool kcompmari_active(void)
+{
+	if (!static_branch_likely(&kcompmari_enabled_key))
+		return false;
+	if (static_branch_unlikely(&kcompmari_force_key))
+		return true;
+	return lru_marie_enabled();
+}
+#endif /* CONFIG_SWAP */
+
+/*
+ * Marie's per-folio state lives entirely in the per-PFN byte
+ * marie_state[pfn] (declared in mm/lru_marie/state.h). Public callers
+ * reach Marie state via the dispatch surface above
+ * (lru_marie_add_folio / lru_marie_del_folio / lru_marie_shrink_lruvec /
+ * lru_marie_exit_memcg / lru_marie_look_around / lru_marie_age_node).
+ * folio->flags carries no Marie bits.
+ */
+
+#endif /* CONFIG_LRU_MARIE */
+
+/*
+ * CONFIG_LRU_MARIE=n: this header intentionally exposes NO inline
+ * shims. Every call site in mm/ is wrapped in #ifdef CONFIG_LRU_MARIE,
+ * so when Marie is off the kernel image contains no Marie symbols and
+ * no Marie calls at all. Refusing to provide no-ops here makes any
+ * stray, un-gated reference fail to compile loudly rather than silently
+ * disappearing into a return-false stub.
+ */
+
+#endif /* _LINUX_LRU_MARIE_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index a171070e15..06b86e03a2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -5,6 +5,7 @@
 #include <linux/atomic.h>
 #include <linux/huge_mm.h>
 #include <linux/mm_types.h>
+#include <linux/lru_marie.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
@@ -36,7 +37,22 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 {
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
+	/*
+	 * Marie's reclaim isolate path (marie_evict_counters_only) and
+	 * deferred post-reclaim teardown (marie_state_drop_pfn_after_reclaim
+	 * via marie_state_shrink_lruvec) intentionally run this without
+	 * lru_lock: install/evict serialise via marie_state[pfn]'s TRACKED
+	 * bit and folio_test_clear_lru, and the per-CPU vmstat helpers
+	 * called below are preempt-off-safe on their own. Skip the lockdep
+	 * assertion in that mode. Legacy / MGLRU paths still get full
+	 * coverage when lru_marie_enabled() is false.
+	 */
+#ifdef CONFIG_LRU_MARIE
+	if (!lru_marie_enabled())
+		lockdep_assert_held(&lruvec->lru_lock);
+#else
 	lockdep_assert_held(&lruvec->lru_lock);
+#endif
 	WARN_ON_ONCE(nr_pages != (int)nr_pages);
 
 	mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
@@ -104,14 +120,14 @@ static inline bool lru_gen_switching(void)
 	return static_branch_unlikely(&lru_switch);
 }
 #ifdef CONFIG_LRU_GEN_ENABLED
-static inline bool lru_gen_enabled(void)
+static inline bool lru_gen_core_enabled(void)
 {
 	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
 
 	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
 }
 #else
-static inline bool lru_gen_enabled(void)
+static inline bool lru_gen_core_enabled(void)
 {
 	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
 
@@ -119,11 +135,58 @@ static inline bool lru_gen_enabled(void)
 }
 #endif
 
+/*
+ * lru_gen_core_enabled() reports the raw MGLRU core static key. Almost no
+ * caller wants that directly -- they want lru_gen_enabled() below, which is
+ * additionally masked off whenever Marie is the active LRU manager.
+ *
+ * Marie and MGLRU are mutually exclusive at runtime. When Marie owns aging,
+ * every MGLRU code path must be inert: folio_mark_accessed() ->
+ * lru_gen_inc_refs(), the reclaim/aging dispatch, workingset refault, the
+ * rmap look-around, and so on. Reporting MGLRU as disabled here makes "both
+ * managers touch the same folio" structurally unrepresentable for every
+ * lru_gen_enabled() reader, so an MGLRU writer that forgets a
+ * !lru_marie_enabled() guard can no longer stamp LRU_GEN / LRU_REFS state
+ * onto a Marie-owned folio -- residue that would otherwise leak into
+ * PAGE_FLAGS_CHECK_AT_FREE (LRU_GEN_MASK) at the buddy handoff.
+ *
+ * The Marie<->MGLRU ownership handoff (mm/lru_marie enable/disable
+ * transition) must still observe the real key; it calls
+ * lru_gen_core_enabled() directly.
+ */
+static inline bool lru_gen_enabled(void)
+{
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled())
+		return false;
+#endif
+	return lru_gen_core_enabled();
+}
+
 static inline bool lru_gen_in_fault(void)
 {
 	return current->in_lru_fault;
 }
 
+/*
+ * Move lruvec contents between legacy lruvec->lists[lru] and
+ * lrugen->folios[gen][type][zone] using MGLRU's canonical add/del
+ * helpers. Exported for external LRU drivers (mm/lru_marie) that
+ * need to keep MGLRU's state_is_valid invariant intact across their
+ * own enable/disable transitions.
+ *
+ *   lru_gen_fill_lruvec  -- legacy lists  -> lrugen (MGLRU's normal
+ *                            enable-time fill, made callable)
+ *   lru_gen_drain_lruvec -- lrugen        -> lruvec_add_folio path
+ *                            (when another driver's gate is on the
+ *                            folios route into that driver directly)
+ *
+ * Caller holds lruvec->lru_lock irqsave; helper releases+reacquires
+ * across cond_resched.
+ */
+void lru_gen_fill_lruvec(struct lruvec *lruvec);
+void lru_gen_drain_lruvec(struct lruvec *lruvec);
+
 static inline int lru_gen_from_seq(unsigned long seq)
 {
 	return seq % MAX_NR_GENS;
@@ -312,6 +375,11 @@ static inline void folio_migrate_refs(struct folio *new, const struct folio *old
 }
 #else /* !CONFIG_LRU_GEN */
 
+static inline bool lru_gen_core_enabled(void)
+{
+	return false;
+}
+
 static inline bool lru_gen_enabled(void)
 {
 	return false;
@@ -322,6 +390,9 @@ static inline bool lru_gen_switching(void)
 	return false;
 }
 
+static inline void lru_gen_fill_lruvec(struct lruvec *lruvec) { }
+static inline void lru_gen_drain_lruvec(struct lruvec *lruvec) { }
+
 static inline bool lru_gen_in_fault(void)
 {
 	return false;
@@ -350,8 +421,23 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_add_folio(lruvec, folio, false))
+		return;
+
+	/*
+	 * If Marie is enabled but lru_marie_add_folio() declined the folio
+	 * (e.g. per-lruvec state unavailable or its allocation failed), skip
+	 * MGLRU and fall directly to the legacy LRU lists: shrink_lruvec runs
+	 * legacy reclaim alongside Marie specifically to drain these
+	 * orphans, but MGLRU is bypassed entirely when lru_marie_enabled().
+	 */
+	if (!lru_marie_enabled() && lru_gen_add_folio(lruvec, folio, false))
+		return;
+#else
 	if (lru_gen_add_folio(lruvec, folio, false))
 		return;
+#endif
 
 	update_lru_size(lruvec, lru, folio_zonenum(folio),
 			folio_nr_pages(folio));
@@ -366,8 +452,17 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_add_folio(lruvec, folio, true))
+		return;
+
+	/* See lruvec_add_folio() — Marie alloc failure falls to legacy, not MGLRU. */
+	if (!lru_marie_enabled() && lru_gen_add_folio(lruvec, folio, true))
+		return;
+#else
 	if (lru_gen_add_folio(lruvec, folio, true))
 		return;
+#endif
 
 	update_lru_size(lruvec, lru, folio_zonenum(folio),
 			folio_nr_pages(folio));
@@ -382,6 +477,11 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_del_folio(lruvec, folio, false))
+		return;
+#endif
+
 	if (lru_gen_del_folio(lruvec, folio, false))
 		return;
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21d..f2c4a1cd80 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,6 +24,7 @@
 #include <linux/local_lock.h>
 #include <linux/zswap.h>
 #include <linux/sizes.h>
+#include <linux/kfifo.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -783,6 +784,21 @@ struct lruvec {
 	struct pglist_data *pgdat;
 #endif
 	struct zswap_lruvec_state zswap_lruvec_state;
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Pointer to this lruvec's struct marie_lruvec — the single source
+	 * of truth (no side xarray). Lazily allocated on first use by
+	 * marie_get_lruvec() via cmpxchg, and cleared then freed by
+	 * marie_drop_lruvec() under lv->lru_lock when the lruvec's memcg is
+	 * torn down. Fault-path dispatchers (lru_marie_add_folio,
+	 * lru_marie_del_folio, etc.) read it with a single READ_ONCE.
+	 *
+	 * void * because struct marie_lruvec is internal to
+	 * mm/lru_marie/state.h. Its lifetime is tied to the lruvec/memcg, so
+	 * any caller holding a valid lruvec observes a live mlv without RCU.
+	 */
+	void				*marie_mlv;
+#endif
 };
 
 /* Isolate for asynchronous migration */
@@ -1529,6 +1545,20 @@ typedef struct pglist_data {
 
 	atomic_t kswapd_failures;	/* Number of 'reclaimed == 0' runs */
 
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+/*
+ * kfifo backing storage capacity (in folio* slots). The sysfs knob
+ * vm_kcompmari sets the effective queue length in [-100, +100]; this
+ * matches the upper bound. ~800 bytes per pgdat regardless of the
+ * currently configured depth.
+ */
+#define KCOMPMARI_FIFO_SIZE 100
+	wait_queue_head_t kcompmari_wait;
+	struct task_struct *kcompmari;
+	struct kfifo kcompmari_fifo;
+	spinlock_t kcompmari_fifo_lock;
+#endif
+
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_highest_zoneidx;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7a09df6977..cb9d9acecb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -421,6 +421,24 @@ extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
 extern atomic_t nr_rotate_swap;
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * linux/mm/page_io.c: monotonic counter incremented on every failed swap-out
+ * bio completion (bio->bi_status != 0). The early-OOM gate in
+ * mm/page_alloc.c:should_reclaim_retry consults the delta from
+ * alloc_context.initial_swap_write_failed to detect "swap backend has free
+ * entries but cannot actually write" — primarily ZRAM/zswap zs_malloc
+ * failures under combined RAM + swap pressure, but also disk swap I/O
+ * errors. Sustained delta > MAX_SWAP_WRITE_FAIL_RETRIES skips the standard
+ * MAX_RECLAIM_RETRIES wait and triggers OOM directly.
+ *
+ * Marie-only: only the Marie-gated path in should_reclaim_retry consumes
+ * this counter, so it is omitted entirely under CONFIG_LRU_MARIE=n to
+ * keep vanilla MGLRU/Legacy builds byte-identical.
+ */
+extern atomic_long_t nr_swap_write_failed;
+#endif
+
 /* Swap 50% full? Release swapcache more aggressively.. */
 static inline bool vm_swap_full(void)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6a..2914498cad 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1396,6 +1396,36 @@ config LRU_GEN_WALKS_MMU
 	depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG
 # }
 
+# Marie LRU {
+config LRU_MARIE
+	bool "Marie LRU"
+	default y
+	depends on MMU
+	help
+	  Marie LRU represents each folio's reclaim state as a single
+	  byte in a flat per-PFN array allocated once at boot. install,
+	  delete, and aging are all single byte writes — there is no
+	  allocation in any fault-path Marie operation, and no linked
+	  lists for the reclaim path to chase. isolate batches the array
+	  with SIMD plus a persistent cursor, so a 32-folio batch
+	  typically costs a few hundred PFNs of sequential read.
+
+	  The PTE young-bit walker is SIMD-accelerated and auto-dispatched
+	  at boot (AVX-512F > AVX2 > SSE2 on x86; scalar elsewhere); a
+	  bloom-filter forward feedback from rmap keeps walker cost on
+	  hot PMDs.
+
+	  Memory cost: ~1 byte per RAM PFN (4 MiB on 16 GiB, 16 MiB on
+	  64 GiB, capped at 4 GiB by the 32-bit PFN limit Marie requires).
+
+	  Enabled by default; disable at boot with lru_marie=0 on the
+	  kernel cmdline, or at runtime via /sys/kernel/mm/lru_marie/enabled.
+	  When disabled, vmscan falls through to the in-tree LRU paths
+	  unchanged.
+
+	  Say Y unless you understand what this is.
+# }
+
 config ARCH_SUPPORTS_PER_VMA_LOCK
        def_bool n
 
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab0824..a2162965c3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ ifdef CONFIG_MMU
 	obj-$(CONFIG_ADVISE_SYSCALLS)	+= madvise.o
 endif
 
+obj-$(CONFIG_LRU_MARIE)	+= lru_marie/
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019..9b6315ceef 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -6,6 +6,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mm.h>
+#include <linux/lru_marie.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
@@ -3562,10 +3563,31 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
 	} else {
 		/* head is still on lru (and we have it frozen) */
 		VM_WARN_ON(!folio_test_lru(folio));
-		if (folio_test_unevictable(folio))
+		if (folio_test_unevictable(folio)) {
 			new_folio->mlock_count = 0;
-		else
+		} else {
+#ifdef CONFIG_LRU_MARIE
+			/*
+			 * If Marie owns @folio (the head), the legacy
+			 * list_add_tail below would put the new tail on the
+			 * legacy LRU without a TRACKED state byte, leaving it
+			 * invisible to Marie's per-mlv bookkeeping
+			 * (mlv->types[].nr_pages, marie_nr_folios). Route
+			 * through Marie's split helper which sets TRACKED,
+			 * publishes the per-PFN state at the same gen as
+			 * @folio, and increments the folio counter. The
+			 * helper falls back to plain list_add_tail when
+			 * @folio is not Marie-tracked, so the static branch
+			 * is the only gate the !lru_marie_enabled() case sees.
+			 */
+			if (lru_marie_enabled())
+				lru_marie_split_folio(lruvec, folio, new_folio);
+			else
+				list_add_tail(&new_folio->lru, &folio->lru);
+#else
 			list_add_tail(&new_folio->lru, &folio->lru);
+#endif
+		}
 		folio_set_lru(new_folio);
 	}
 }
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e..9307a24661 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -625,12 +625,55 @@ extern unsigned long highest_memmap_pfn;
  */
 #define MAX_RECLAIM_RETRIES 16
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * Maximum number of swap-write failures (incremented by mm/page_io.c
+ * __end_swap_bio_write on bio->bi_status != 0) tolerated within a single
+ * __alloc_pages_slowpath attempt before the early-OOM gate gives up. Lets
+ * a handful of transient failures (concurrent ZRAM ops, brief retry
+ * windows) recover, but trips OOM well before MAX_RECLAIM_RETRIES on
+ * sustained backend rejection. Marie-only; omitted under
+ * CONFIG_LRU_MARIE=n.
+ */
+#define MAX_SWAP_WRITE_FAIL_RETRIES 16
+
+#endif
+
 /*
  * in mm/vmscan.c:
+ *
+ * struct scan_control is private to vmscan.c. LRU implementations
+ * that live outside vmscan.c (mm/lru_marie) read/update individual
+ * fields via the sc_* accessors declared below; the struct itself is
+ * opaque to everything outside vmscan.c.
  */
+struct scan_control;
+
 bool folio_isolate_lru(struct folio *folio);
 void folio_putback_lru(struct folio *folio);
+struct reclaim_stat;
+unsigned int shrink_folio_list(struct list_head *folio_list,
+		struct pglist_data *pgdat, struct scan_control *sc,
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg);
 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
+int vmscan_reclaimer_offset(struct scan_control *sc);
+bool vmscan_can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid,
+				   struct scan_control *sc);
+
+/*
+ * scan_control accessors -- read/update the few fields that readers
+ * outside vmscan.c need without exposing the struct layout. All defined in
+ * vmscan.c next to the struct definition; trivial enough that the
+ * compiler routinely inlines the body across LTO. Non-LTO builds
+ * pay one extra call per use, which lands only on cold paths
+ * (entry of marie_state_shrink_lruvec and the inner tier loop).
+ */
+int  sc_priority(const struct scan_control *sc);
+int  sc_reclaim_idx(const struct scan_control *sc);
+bool sc_reclaim_target_reached(const struct scan_control *sc);
+void sc_add_reclaimed(struct scan_control *sc, unsigned long nr);
+bool sc_cgroup_reclaim(const struct scan_control *sc);
 int user_proactive_reclaim(char *buf,
 			   struct mem_cgroup *memcg, pg_data_t *pgdat);
 
@@ -693,6 +736,18 @@ struct alloc_context {
 	 */
 	enum zone_type highest_zoneidx;
 	bool spread_dirty_pages;
+
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Snapshot of nr_swap_write_failed at the entry to
+	 * __alloc_pages_slowpath. should_reclaim_retry takes the delta to
+	 * decide whether the swap backend has rejected enough writes during
+	 * THIS allocation attempt to skip the rest of the reclaim retry
+	 * budget and OOM directly. See include/linux/swap.h for the
+	 * counter's contract. Marie-only; omitted under CONFIG_LRU_MARIE=n.
+	 */
+	long initial_swap_write_failed;
+#endif
 };
 
 /*
diff --git a/mm/lru_marie/Makefile b/mm/lru_marie/Makefile
new file mode 100644
index 0000000000..c1e9c9becf
--- /dev/null
+++ b/mm/lru_marie/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Marie LRU. Reached from mm/Makefile via obj-$(CONFIG_LRU_MARIE) += lru_marie/.
+# Public dispatch surface for the rest of the kernel is in
+# include/linux/lru_marie.h; everything below is Marie-private.
+
+obj-y	+= bitmap.o
+obj-y	+= core.o
+obj-y	+= state.o
+obj-y	+= walker.o
+
+ifdef CONFIG_X86
+obj-y	+= simd_x86.o
+obj-y	+= simd_x86_sse2.o
+obj-y	+= simd_x86_avx2.o
+obj-y	+= simd_x86_avx512.o
+else
+obj-y	+= simd_generic.o
+endif
diff --git a/mm/lru_marie/account.h b/mm/lru_marie/account.h
new file mode 100644
index 0000000000..ef1b7b7695
--- /dev/null
+++ b/mm/lru_marie/account.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_ACCOUNT_H
+#define _MM_LRU_MARIE_ACCOUNT_H
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+
+#include "state.h"
+
+/*
+ * Marie's four counter+vmstat updates that move together on every
+ * install/evict.  Until Step 3 of the abstraction plan these were
+ * hand-written at five sites (install, evict_locked, evict_counters_only,
+ * survivor install, survivor evict-on-free).  The drift hazard that
+ * cost us 9c6a93782 was: each site picked its own IRQ-state discipline
+ * because no helper enforced it.
+ *
+ *   marie_pc_add(&mlv->types[type].nr_pages,            +-nr)
+ *   marie_pc_add(&marie_nr_folios,                      +-1 )
+ *   marie_update_lru_size(lv, lru, zone,                +-nr)
+ *   marie_pc_add(&mlv->marie_lru_zone_size[lru][zone],  +-nr)
+ *
+ * marie_update_lru_size credits/debits the per-memcg mz->lru_zone_size
+ * (unified with legacy update_lru_size), so a Marie folio is counted in
+ * mz from install to evict exactly like a legacy/MGLRU folio.
+ * marie_lru_zone_size is Marie's own per-bucket tally, kept only for the
+ * reparent transfer; it no longer feeds lruvec_lru_size().
+ *
+ * Two contexts:
+ *
+ *   LOCKED   - caller holds lv->lru_lock with IRQs off.  Used by the
+ *              install / evict_locked / del_folio_locked /
+ *              fill / drain hot paths.  Helpers assert both held
+ *              conditions via lockdep.  mlv is non-NULL.
+ *
+ *   ISOLATE  - caller holds NOTHING (no lru_lock, IRQs on).  Used by
+ *              the reclaim isolate path and the survivor putback.
+ *              Helpers own local_irq_save/restore so the marie_pc_add
+ *              fast path and __mod_zone_page_state inside
+ *              marie_update_lru_size are safe against same-CPU
+ *              softirq reentrancy (the very property 9c6a93782
+ *              introduced).  mlv may be NULL: under reclaim a brand-
+ *              new lruvec may fail GFP_ATOMIC at marie_get_lruvec, and
+ *              teardown can race with the isolate.  In that case only
+ *              the global counters move; the per-mlv counters are
+ *              gone with the missing carrier (no leak -- mirrors the
+ *              pre-helper code in marie_evict_counters_only).
+ *
+ * The helpers do NOT touch folio flags, the per-PFN state byte, the
+ * scan bitmap, or memcg L1.  Those belong to pfn_install.h
+ * (marie_pfn_publish_inherit) and to marie_state_publish_at_gen /
+ * marie_state_drop_pfn.  Each layer keeps its own invariant.
+ */
+
+/*
+ * Drain any deferred legacy-mz delta the lock-free isolate paths accumulated
+ * for (@lru, @zone) into mz->lru_zone_size. MUST be called with @mlv's
+ * lru_lock held (the locked funnels below do), so the non-atomic mz RMW inside
+ * marie_update_lru_size is serialised. atomic_long_xchg claims the pending
+ * delta in one shot against concurrent lock-free accumulators.
+ */
+static inline void marie_mz_drain_locked(struct marie_lruvec *mlv,
+					 enum lru_list lru, int zone)
+{
+#ifdef CONFIG_MEMCG
+	long d = atomic_long_xchg(&mlv->mz_pending[lru][zone], 0);
+
+	/*
+	 * Apply ONLY the deferred mz->lru_zone_size delta -- the vmstat halves
+	 * (NR_LRU_BASE / NR_ZONE_LRU_BASE) were updated immediately and lock-free
+	 * in the isolate path (they are per-CPU safe). Re-running the full
+	 * marie_update_lru_size here would double-count vmstat.
+	 */
+	if (d)
+		mem_cgroup_update_lru_size(mlv->lruvec, lru, zone, d);
+#endif
+}
+
+static inline void marie_account_install(struct marie_lruvec *mlv,
+					 struct folio *f,
+					 enum lru_list lru, int zone)
+{
+	int type = folio_is_file_lru(f);
+	long nr = folio_nr_pages(f);
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	marie_pc_add(&mlv->types[type].nr_pages, nr);
+	marie_pc_add(&marie_nr_folios, 1);
+	/* Fold in any lock-free isolate deltas first, then our own, under lock. */
+	marie_mz_drain_locked(mlv, lru, zone);
+	marie_update_lru_size(mlv->lruvec, lru, zone, nr);
+	marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], nr);
+}
+
+static inline void marie_account_evict(struct marie_lruvec *mlv,
+				       struct folio *f,
+				       enum lru_list lru, int zone)
+{
+	int type = folio_is_file_lru(f);
+	long nr = folio_nr_pages(f);
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	marie_pc_add(&mlv->types[type].nr_pages, -nr);
+	marie_pc_add(&marie_nr_folios, -1);
+	marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], -nr);
+	/* Fold in any lock-free isolate deltas first, then our own, under lock. */
+	marie_mz_drain_locked(mlv, lru, zone);
+	marie_update_lru_size(mlv->lruvec, lru, zone, -nr);
+}
+
+static inline void marie_account_install_isolate(struct lruvec *lv,
+						 struct marie_lruvec *mlv,
+						 struct folio *f,
+						 enum lru_list lru, int zone)
+{
+	int type = folio_is_file_lru(f);
+	long nr = folio_nr_pages(f);
+	unsigned long flags;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	local_irq_save(flags);
+	marie_pc_add(&marie_nr_folios, 1);
+	/* vmstat halves are per-CPU safe lock-free; do them now. */
+	__update_lru_size(lv, lru, zone, nr);
+	if (likely(mlv)) {
+		marie_pc_add(&mlv->types[type].nr_pages, nr);
+		marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], nr);
+#ifdef CONFIG_MEMCG
+		/*
+		 * No lru_lock here -- DEFER only the non-atomic mz->lru_zone_size
+		 * RMW (the race that drifts mz negative) instead of doing it
+		 * lock-free. The next LOCKED op on this bucket drains it. If mlv
+		 * is NULL (offlining / GFP_ATOMIC fail) there is nowhere to defer
+		 * and no shadow either, so mz is left untouched -- mz and shadow
+		 * stay paired and the dying memcg is reparented.
+		 */
+		atomic_long_add(nr, &mlv->mz_pending[lru][zone]);
+#endif
+	}
+	local_irq_restore(flags);
+}
+
+static inline void marie_account_evict_isolate(struct lruvec *lv,
+					       struct marie_lruvec *mlv,
+					       struct folio *f,
+					       enum lru_list lru, int zone)
+{
+	int type = folio_is_file_lru(f);
+	long nr = folio_nr_pages(f);
+	unsigned long flags;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	local_irq_save(flags);
+	marie_pc_add(&marie_nr_folios, -1);
+	/* vmstat halves are per-CPU safe lock-free; do them now. */
+	__update_lru_size(lv, lru, zone, -nr);
+	if (likely(mlv)) {
+		marie_pc_add(&mlv->types[type].nr_pages, -nr);
+		marie_pc_add(&mlv->marie_lru_zone_size[lru][zone], -nr);
+#ifdef CONFIG_MEMCG
+		/* DEFER only the lock-free mz RMW; drained under lru_lock. See
+		 * marie_account_install_isolate for the mlv==NULL rationale. */
+		atomic_long_add(-nr, &mlv->mz_pending[lru][zone]);
+#endif
+	}
+	local_irq_restore(flags);
+}
+
+#endif /* _MM_LRU_MARIE_ACCOUNT_H */
diff --git a/mm/lru_marie/bitmap.c b/mm/lru_marie/bitmap.c
new file mode 100644
index 0000000000..da8f595a6b
--- /dev/null
+++ b/mm/lru_marie/bitmap.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hierarchical PFN bitmap operations. See bitmap.h for the design
+ * overview. Used by both the global per-(type, gen, tier) plane and
+ * the per-memcg plane.
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/cache.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "bitmap.h"
+#include "state.h"	/* max_pfn, marie_l2_shift, marie_pfn_to_l2_bit */
+
+/*
+ * 512 cacheline-aligned spinlocks, one per L2 bit. Each lock makes
+ * one concurrent scanner the exclusive owner of the PFN range
+ * covered by that L2 bit -- collisions never produce wasted
+ * candidate scan work, only a single try_lock failure that costs
+ * one atomic op. ____cacheline_aligned_in_smp prevents false sharing
+ * between adjacent locks while keeping UP-build footprint flat.
+ *
+ * 32 KiB total on SMP (64 B x 512). Shared across every marie_bitmap
+ * instance: the lock is over the PFN address space, not per-bitmap.
+ * Two scanners walking different (type, gen, tier) bitmaps in the
+ * same L2 range still serialise via the same lock, avoiding wasted
+ * parallel L1 fetches of the same physical cachelines.
+ *
+ * Trylock / unlock are static inline in bitmap.h; this file only
+ * holds the storage and the boot-time init.
+ */
+struct marie_bm_range_lock marie_bm_range_locks[MARIE_L2_BITS];
+
+void marie_bm_range_locks_init(void)
+{
+	int i;
+
+	for (i = 0; i < MARIE_L2_BITS; i++)
+		spin_lock_init(&marie_bm_range_locks[i].lock);
+}
+
+int marie_bm_init(struct marie_bitmap *bm)
+{
+	/* No PFNs to track: leave @bm->l1 untouched and report success. */
+	if (!max_pfn)
+		return 0;
+	/* One zeroed bit per PFN; kvcalloc() checks the size multiply. */
+	bm->l1 = kvcalloc(BITS_TO_LONGS(max_pfn), sizeof(unsigned long),
+			  GFP_KERNEL);
+	if (!bm->l1)
+		return -ENOMEM;
+	return 0;
+}
+
+void marie_bm_free(struct marie_bitmap *bm)
+{
+	if (!bm)
+		return;
+	kvfree(bm->l1);
+	bm->l1 = NULL;
+}
+
+/*
+ * marie_bm_set / marie_bm_clear / marie_bm_test are static inline in
+ * bitmap.h -- they sit on the install / del / promote hot path and
+ * out-of-lining costs measurable cycles per fault.
+ */
+
+/*
+ * Half-open [start_word, end_word) covering one L2 bit's worth of L1 words.
+ * Clipped to the actual l1 storage extent.
+ */
+static void marie_bm_l1_word_range(unsigned int l2bit,
+				   unsigned long *start_word,
+				   unsigned long *end_word)
+{
+	unsigned long pfns_per_l2 = 1UL << marie_l2_shift;
+	unsigned long start_pfn = (unsigned long)l2bit << marie_l2_shift;
+	unsigned long end_pfn = start_pfn + pfns_per_l2;
+	unsigned long max_words = BITS_TO_LONGS(max_pfn);
+
+	*start_word = start_pfn / BITS_PER_LONG;
+	*end_word = DIV_ROUND_UP(end_pfn, BITS_PER_LONG);
+	if (*end_word > max_words)
+		*end_word = max_words;
+}
+
+void marie_bm_drop_l2_range(struct marie_bitmap *bm, unsigned int l2bit)
+{
+	unsigned long start_word, end_word, wi;
+
+	if (!bm->l1)
+		return;
+	marie_bm_l1_word_range(l2bit, &start_word, &end_word);
+	for (wi = start_word; wi < end_word; wi++)
+		bm->l1[wi] = 0;
+	atomic_set(&bm->l2_count[l2bit], 0);
+	clear_bit(l2bit, bm->l2);
+}
+
+void marie_bm_reset(struct marie_bitmap *bm)
+{
+	int i;
+
+	if (!bm->l1)
+		return;
+	/* Byte count as in marie_bm_init(); bitmap_zero() nbits is 32-bit. */
+	memset(bm->l1, 0, BITS_TO_LONGS(max_pfn) * sizeof(unsigned long));
+	bitmap_zero(bm->l2, MARIE_L2_BITS);
+	for (i = 0; i < MARIE_L2_BITS; i++)
+		atomic_set(&bm->l2_count[i], 0);
+}
+
+void marie_bm_merge(struct marie_bitmap *dst, struct marie_bitmap *src)
+{
+	const int l2_words = BITS_TO_LONGS(MARIE_L2_BITS);
+	int lw;
+
+	if (!src || !src->l1)
+		return;
+
+	for (lw = 0; lw < l2_words; lw++) {
+		unsigned long w = src->l2[lw];
+
+		while (w) {
+			unsigned int b = __ffs(w);
+			unsigned int l2bit = lw * BITS_PER_LONG + b;
+			unsigned long start_word, end_word, wi;
+			int child_count, parent_new;
+
+			w &= w - 1;
+
+			marie_bm_l1_word_range(l2bit, &start_word, &end_word);
+
+			if (dst && dst->l1) {
+				for (wi = start_word; wi < end_word; wi++) {
+					unsigned long cw = src->l1[wi];
+
+					if (!cw)
+						continue;
+					dst->l1[wi] |= cw;
+					src->l1[wi] = 0;
+				}
+				child_count = atomic_xchg(&src->l2_count[l2bit], 0);
+				if (child_count <= 0)
+					continue;
+				parent_new = atomic_add_return(child_count,
+					&dst->l2_count[l2bit]);
+				if (parent_new == child_count)
+					set_bit(l2bit, dst->l2);
+			} else {
+				for (wi = start_word; wi < end_word; wi++)
+					src->l1[wi] = 0;
+				atomic_set(&src->l2_count[l2bit], 0);
+			}
+		}
+		src->l2[lw] = 0;
+	}
+}
diff --git a/mm/lru_marie/bitmap.h b/mm/lru_marie/bitmap.h
new file mode 100644
index 0000000000..1439771849
--- /dev/null
+++ b/mm/lru_marie/bitmap.h
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hierarchical PFN bitmap shared by Marie's global per-(type, gen, tier)
+ * planes and per-memcg plane.
+ *
+ * Two layers held by one struct:
+ *   L1: per-PFN bit, sized BITS_TO_LONGS(max_pfn). set_bit()/clear_bit()
+ *       (atomic). One word covers 64 PFNs.
+ *
+ *   L2: 512-bit summary, each bit covers (max_pfn / 512) PFNs (one
+ *       "L2 range", typically 32 MiB on an 8 GiB system). A companion
+ *       per-cell atomic_t refcount tracks how many L1 bits are set in
+ *       that range. The L2 bit is flipped on the 0 <-> 1 counter
+ *       boundary by a separate set_bit()/clear_bit() issued right
+ *       after the atomic_*_return, so a racing set and clear on one
+ *       range must be serialised by the caller (see locking below).
+ *
+ * Two consumers:
+ *   - Global plane:  one struct per (type, gen, tier), 16 instances
+ *                    total (marie_track_bm[type][gen][tier]).
+ *   - Memcg plane:   one struct per non-root memcg, wrapped by
+ *                    struct marie_memcg_bm.
+ *
+ * The same struct + the same set/clear/merge/iter operations are
+ * used by both consumers. No internal lock; producers serialise via
+ * the existing Marie lock hierarchy (lru_lock on the install/del
+ * side, reparent_locks on the merge side, marie_bm_range_locks[bit]
+ * trylock on the scanner side).
+ */
+#ifndef _MM_LRU_MARIE_BITMAP_H
+#define _MM_LRU_MARIE_BITMAP_H
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/cache.h>
+#include <linux/mm.h>		/* max_pfn */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+/*
+ * MARIE_L2_BITS sizes the L2 summary plane. Placed here so the
+ * struct can lay out its inline arrays without pulling in state.h.
+ */
+#define MARIE_L2_BITS		512
+
+/*
+ * PFN -> L2 bit shift, set at marie_state_init time so that
+ * (1 << marie_l2_shift) PFNs map to one L2 bit and 512 L2 bits cover
+ * the full max_pfn range. shift = ceil(log2(max_pfn / 512)).
+ */
+extern unsigned int marie_l2_shift;
+
+static inline unsigned int marie_pfn_to_l2_bit(unsigned long pfn)
+{
+	unsigned long b = pfn >> marie_l2_shift;	/* clamp before narrowing */
+
+	return b < MARIE_L2_BITS ? b : MARIE_L2_BITS - 1;
+}
+
+static inline unsigned long marie_l2_bit_pfn_start(unsigned int bit)
+{
+	return (unsigned long)bit << marie_l2_shift;
+}
+
+static inline unsigned long marie_l2_bit_pfn_end(unsigned int bit)
+{
+	return ((unsigned long)bit + 1) << marie_l2_shift;
+}
+
+struct marie_bitmap {
+	unsigned long	*l1;					/* BITS_TO_LONGS(max_pfn) words */
+	unsigned long	 l2[BITS_TO_LONGS(MARIE_L2_BITS)];	/* 64 B inline */
+	atomic_t	 l2_count[MARIE_L2_BITS];		/* 2 KiB inline */
+};
+
+/*
+ * marie_bm_init - allocate @bm->l1 sized for the system max_pfn.
+ * @bm->l2 and @bm->l2_count are zero-initialised by the caller (the
+ * struct itself is typically zero-allocated). Returns 0 on success,
+ * -ENOMEM on allocation failure.
+ */
+int marie_bm_init(struct marie_bitmap *bm);
+
+/* marie_bm_free - release @bm->l1 (no-op when never initialised). */
+void marie_bm_free(struct marie_bitmap *bm);
+
+/*
+ * marie_bm_set - mark @pfn tracked.
+ *
+ * Atomically sets the L1 bit at @pfn. The per-cell refcount is then
+ * atomic_inc_return'd; on the 0 -> 1 transition the L2 summary bit
+ * for @pfn's range is set. Idempotent w.r.t. already-set L1 bit if
+ * the same PFN is set twice (cell_count overcounts; balanced by a
+ * matching number of clears).
+ *
+ * static inline because this is a hot-path operation invoked at
+ * every install / promote / move; out-of-lining would add a function
+ * call + bound-check overhead per call.
+ */
+static inline void marie_bm_set(struct marie_bitmap *bm, unsigned long pfn)
+{
+	unsigned int l2bit;
+
+	if (!bm->l1 || pfn >= max_pfn)
+		return;
+	set_bit(pfn, bm->l1);
+	l2bit = marie_pfn_to_l2_bit(pfn);
+	if (atomic_inc_return(&bm->l2_count[l2bit]) == 1)
+		set_bit(l2bit, bm->l2);
+}
+
+/*
+ * marie_bm_clear - mark @pfn untracked.
+ *
+ * Atomically clears the L1 bit at @pfn. The per-cell refcount is
+ * atomic_dec_return'd; on the 1 -> 0 transition the L2 summary bit
+ * for @pfn's range is cleared.
+ */
+static inline void marie_bm_clear(struct marie_bitmap *bm, unsigned long pfn)
+{
+	unsigned int l2bit;
+
+	if (!bm->l1 || pfn >= max_pfn)
+		return;
+	clear_bit(pfn, bm->l1);
+	l2bit = marie_pfn_to_l2_bit(pfn);
+	if (atomic_dec_return(&bm->l2_count[l2bit]) == 0)
+		clear_bit(l2bit, bm->l2);
+}
+
+/*
+ * marie_bm_test - is @pfn tracked? Lock-free single-word read.
+ * Returns false when @bm->l1 is unallocated.
+ */
+static inline bool marie_bm_test(const struct marie_bitmap *bm,
+				 unsigned long pfn)
+{
+	if (!bm->l1 || pfn >= max_pfn)
+		return false;
+	return test_bit(pfn, bm->l1);
+}
+
+/*
+ * marie_bm_drop_l2_range - bulk-clear all L1 / L2 / counter state
+ * for the L2 range identified by @l2bit. Used when recycling one
+ * range of a bitmap (precise, touches the L1 words covered by the
+ * range as well).
+ *
+ * Caller must guarantee no concurrent set/clear on @bm for the
+ * affected PFN range (try_advance_head fences via head_gen cmpxchg).
+ */
+void marie_bm_drop_l2_range(struct marie_bitmap *bm, unsigned int l2bit);
+
+/*
+ * marie_bm_reset - reset @bm to fully empty: L1 cleared, L2 cleared,
+ * all l2_count cells zeroed.
+ *
+ * L1 must be cleared too: leaving stale L1 bits and resetting only
+ * L2 + l2_count would let a subsequent marie_bm_set(@pfn) on a
+ * different PFN in a stale-set L1 word leave that stale bit visible
+ * to the scanner (which now sees the just-set L2 bit and enters the
+ * range). Worse, a later marie_bm_clear() on the stale PFN would
+ * dec the l2_count below zero, corrupting the refcount invariant.
+ *
+ * Used by try_advance_head when recycling a (type, gen, tier) slot
+ * for the next ring cycle. Caller must fence subsequent installs
+ * (head_gen cmpxchg in try_advance_head's case) so no install can
+ * target @bm until the reset is visible.
+ */
+void marie_bm_reset(struct marie_bitmap *bm);
+
+/*
+ * marie_bm_merge - L2-pruned word-wise OR of @src into @dst.
+ *
+ * Walks @src->l2 sparsely (only set bits via __ffs unset-loop). For
+ * each populated L2 range, OR's the matching L1 word range into
+ * @dst and atomic_xchg's @src's l2_count contribution into @dst's
+ * (set_bit on @dst's L2 fires at the 0 -> N transition). The
+ * processed words / counters in @src are zeroed.
+ *
+ * @dst == NULL: @src is simply drained (l1 cleared, l2_count zeroed,
+ * l2 cleared) -- used by reparent when no parent bitmap exists.
+ *
+ * Cost scales with @src's populated L2 range count, not max_pfn.
+ *
+ * Caller must serialise against concurrent set/clear on either
+ * bitmap (reparent_locks on the memcg offline path is sufficient).
+ */
+void marie_bm_merge(struct marie_bitmap *dst, struct marie_bitmap *src);
+
+/*
+ * L2 range coordination locks: 512 spinlocks (one per L2 bit, ~32 KiB
+ * total), used by scanners to claim exclusive ownership of a PFN
+ * range for the duration of their L1 walk in that range.
+ *
+ * Shared by ALL marie_bitmap instances: the lock is over the PFN
+ * address space, not the bitmap instance. Two scanners walking
+ * different (type, gen, tier) bitmaps in the same L2 range still
+ * serialise via the same lock, avoiding wasted parallel L1 fetches
+ * of the same physical cachelines.
+ *
+ * The storage is exposed (rather than wrapped in opaque accessors)
+ * so the per-bit trylock / unlock can be static inline in this
+ * header -- scanners take them once per processed L2 bit, which is
+ * hot enough that a function-call wrapper costs measurable cycles.
+ * Callers must guarantee @l2bit < MARIE_L2_BITS (always true for
+ * indices produced by __ffs on an L2 word).
+ */
+struct marie_bm_range_lock {
+	spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
+extern struct marie_bm_range_lock marie_bm_range_locks[MARIE_L2_BITS];
+
+void marie_bm_range_locks_init(void);
+
+static inline bool marie_bm_range_trylock(unsigned int l2bit)
+{
+	return spin_trylock(&marie_bm_range_locks[l2bit].lock);
+}
+
+static inline void marie_bm_range_unlock(unsigned int l2bit)
+{
+	spin_unlock(&marie_bm_range_locks[l2bit].lock);
+}
+
+#endif	/* _MM_LRU_MARIE_BITMAP_H */
diff --git a/mm/lru_marie/core.c b/mm/lru_marie/core.c
new file mode 100644
index 0000000000..f376a63990
--- /dev/null
+++ b/mm/lru_marie/core.c
@@ -0,0 +1,1982 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/lru_marie/core.c — Marie LRU.
+ *
+ * Multi-graded Adaptive Reclaim & Independent Eviction (Marie)
+ *
+ * Architecture in one paragraph:
+ *   - per-PFN state byte (marie_state[pfn]) as the single source of
+ *     truth for every folio's (TRACKED, type, zone, gen, tier) tuple;
+ *     install / del / aging are single byte writes with no allocation
+ *     in any fault-path operation
+ *   - per-lruvec mlv carrying per-type locks and per-(lru, zone)
+ *     percpu counters; per-type counter writes hit the local CPU diff
+ *     so install/del do not bounce a shared cache line
+ *   - per-pgdat walker driven from kswapd, with rmap-fed bloom
+ *     feedback so PMD scans concentrate on hot regions
+ *   - cycling per-type gen ring (MARIE_PFN_NR_GENS = 4) encoded in
+ *     the per-PFN byte, advanced by install cadence
+ *     (marie_install_advance_hook) and by the reclaim-driven trigger
+ *     (occupied gen count < 2 at shrink_lruvec entry)
+ *   - SIMD PTE young-bit batch scan with boot-time AVX-512F / AVX2 /
+ *     SSE2 dispatch on x86; scalar fallback on arm64 and elsewhere
+ *
+ * Core types:
+ *
+ *   marie_state[]        — global per-PFN byte array (state.{h,c})
+ *   struct marie_type    — per-type slot inside marie_lruvec (anon /
+ *                          file): per-type lock + nr_pages counter
+ *   struct marie_lruvec  — Marie's per-lruvec state (anon + file types,
+ *                          per-(lru, zone) page count, swap_bias)
+ *
+ *   lv->marie_mlv        — per-lruvec pointer to struct marie_lruvec,
+ *                          the single source of truth (no side xarray;
+ *                          lifetime tied to the lruvec/memcg, no RCU)
+ *
+ *   marie_get_lruvec()   — lazy lookup/allocation (cmpxchg publish)
+ *   marie_drop_lruvec()  — remove and free one entry
+ *   lru_marie_exit_memcg()   — drop all entries belonging to a dying memcg
+ *
+ * Recommended userspace configuration:
+ *   - vm.swappiness = 1
+ *     swappiness historically encoded the relative IO cost of swap
+ *     vs. filesystem paging, on the assumption that file cache and
+ *     anon working set carry comparable "hotness" distributions and
+ *     comparable refault costs. That assumption was authored against
+ *     spinning-disk-era hardware and no longer matches modern
+ *     systems:
+ *
+ *       Storage type           File cache cost    Recommended
+ *       -------------------    ----------------   -------------
+ *       SSD+ZRAM (Modern)          Low            1 (Marie default)
+ *       HDD (Slow,Unresponsive)    High           Higher (60+)
+ *
+ *     On modern desktops with NVMe-class file storage, lost file
+ *     cache refaults in microseconds and is largely transparent to
+ *     the user. ZRAM-backed swap, by contrast, is "free in RAM" only
+ *     on the surface: every swapout/swapin pays compression CPU,
+ *     L1/L2/L3 cache pollution from the codec working set, and
+ *     blocks the calling context -- costs that are systematically
+ *     hidden in IO accounting but ergonomically very visible as UI
+ *     stutter and jank.
+ *
+ *     Worse, on a ZRAM-equipped system in normal steady state the
+ *     pagecache typically fills physical memory. Any proportional
+ *     anon eviction at that point disturbs the anon working set just
+ *     to make room for what is mostly cold pagecache anyway --
+ *     trading a transparent SSD refault on the file side for a
+ *     visible ZRAM hit on the anon side. The cart goes before the
+ *     horse.
+ *
+ *     swappiness = 1 captures the resulting policy precisely: anon
+ *     is fully protected until the file pagecache falls below the
+ *     clean_min_ratio floor, at which point swap engages as a true
+ *     last resort. Marie's per-PFN reclaim driver maps this onto
+ *     MARIE_PICK_FILE_THEN_ANON -- FILE scanned first, ANON engaged
+ *     ONLY when skip_file is set inside marie_state_shrink_lruvec
+ *     (i.e. the floor has been breached). Per-call transient FILE
+ *     failures (empty oldest gen, all shrink_folio_list rejects,
+ *     etc.) do NOT leak into ANON -- the clean_min_ratio floor is
+ *     the single depletion signal. The bias controller stays at
+ *     zero throughout, because swappiness=1 short-circuits the
+ *     proportional update path.
+ *
+ *     Higher values (s = 2..199) remain useful on slower-storage
+ *     systems where the file refault cost is no longer negligible;
+ *     Marie honours them via the stubborn proportional controller
+ *     in marie_swap_bias_update. s = 0 is a hard "never swap" --
+ *     reach OOM rather than touch anon. s = 200 is the symmetric
+ *     "anon only" override. Both are intentional user policy
+ *     overrides; clean_min_ratio does not punch through them.
+ *   - systemd-oomd OFF
+ *     systemd-oomd reacts to PSI before Marie's clean_min_ratio
+ *     floor + no-progress OOM path has a chance to stabilise
+ *     reclaim. With Marie engaged the kernel-side OOM gate is more
+ *     accurate, and userspace OOMD ends up killing tasks Marie
+ *     would have rescued. Disable it (or leave its swap thresholds
+ *     at 100%) for predictable behaviour.
+ */
+
+#define pr_fmt(fmt) "lru_marie: " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/jump_label.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/llist.h>
+#include <linux/memblock.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmzone.h>
+#include <linux/pagewalk.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rmap.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/lru_marie.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>
+#include <linux/sysfs.h>
+#include <linux/vmstat.h>
+#include <linux/workqueue.h>
+#include <linux/writeback.h>
+#include <linux/xarray.h>
+
+#include "../internal.h"	/* struct scan_control, shrink_folio_list */
+#include "drain_scope.h"
+#include "pfn_install.h"
+#include "simd.h"
+#include "state.h"
+#include "version.h"
+
+DEFINE_STATIC_KEY_TRUE(lru_marie_enabled_key);
+EXPORT_SYMBOL_GPL(lru_marie_enabled_key);
+
+/*
+ * Marie indexes its per-PFN state array by raw PFN. The implementation
+ * caps max_pfn at 2^32 (= 4 KiB pages × 2^32 = 16 TiB of physical
+ * address space, holes included): the per-PFN byte array would be at
+ * most 4 GiB under that cap, and several other internal helpers
+ * assume the PFN fits in 32 bits. On a box that violates this Marie
+ * refuses to enable; Legacy / MGLRU continue to run unchanged.
+ *
+ * marie_pfn_unsupported is latched at subsys_initcall once max_pfn is
+ * stable (set during setup_arch / memblock init) and read-only after
+ * that, so the runtime cost is a single __read_mostly load.
+ */
+#define MARIE_MAX_SUPPORTED_PFN	(1UL << 32)
+static bool marie_pfn_unsupported __read_mostly;
+
+/*
+ * ---------------------------------------------------------------------
+ *  install path -- fully synchronous, no staging or pending queues
+ * ---------------------------------------------------------------------
+ *
+ * lru_marie_add_folio dispatches into marie_folio_install (under the
+ * per-type lock for THP, lock-free for small folios). That helper
+ * publishes the per-PFN state byte, sets the tracking bitmaps, bumps
+ * the per-mlv / global percpu_counters, and sets PG_lru -- all under
+ * the caller's lru_lock irqsave. No per-CPU staging, no session-end
+ * flush hook, no wrapper allocation, no async drain, no kworker
+ * dispatch.
+ *
+ * Walker tier promotion is similarly synchronous: when a young PTE
+ * references a folio whose tier is already saturated, the walker calls
+ * marie_state_move_to_gen() directly on the per-PFN byte (no queueing).
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ *  data structures (struct definitions live in mm/lru_marie/state.h)
+ * ---------------------------------------------------------------------
+ *
+ * struct marie_type / marie_lruvec, the MARIE_PFN_NR_GENS /
+ * MARIE_NR_TIERS / MARIE_TIER_MAX / MARIE_ISOLATE_BATCH constants, and
+ * the per-mlv alloc/free helpers (marie_alloc_lruvec,
+ * marie_free_lruvec) live in state.{h,c}. Runtime-tunable knobs
+ * (marie_gen_growth_threshold, etc.) live in the sysfs section at the
+ * bottom of this file. Lifecycle (lazy lv->marie_mlv publish, memcg
+ * teardown) stays here.
+ */
+
+/*
+ * Per-lruvec Marie state is reached directly through lv->marie_mlv (the
+ * single source of truth; no side xarray, no RCU). marie_nr_lruvecs
+ * counts live mlvs for stats only.
+ */
+static atomic_long_t marie_nr_lruvecs;
+
+/*
+ * Exported via mm/lru_marie/state.h for the install / evict / drain
+ * helpers to update during TRACKED 0<->1 transitions. percpu_counter
+ * so per-folio writes hit the local CPU's diff and only flush to the
+ * global on every percpu_counter_batch ops; reads use
+ * percpu_counter_sum (accurate, slower) in stats_show and
+ * percpu_counter_read_positive (approximate, fast) where a hot
+ * heuristic is good enough.
+ */
+struct percpu_counter marie_nr_folios;
+
+/**
+ * marie_get_lruvec - lookup or lazily allocate the Marie state for @lv.
+ *
+ * Returns NULL if Marie is disabled (so callers can skip cheaply), if
+ * @lv's memcg is dying, or if the allocation fails.  Otherwise returns
+ * the state, which lives until marie_drop_lruvec() / lru_marie_exit_memcg().
+ *
+ * Hot path is a single READ_ONCE of @lv->marie_mlv, published by the
+ * cmpxchg at the bottom of this function.  The slow path allocates with
+ * GFP_KERNEL only when in_task() && !irqs_disabled(), else GFP_ATOMIC.
+ * NOTE(review): that test cannot see a plain spin_lock() held with IRQs
+ * on -- confirm no such caller can reach the slow path.
+ */
+struct marie_lruvec *marie_get_lruvec(struct lruvec *lv)
+{
+	struct marie_lruvec *mlv, *old;
+	gfp_t gfp;
+
+	if (!lru_marie_enabled())
+		return NULL;
+
+	mlv = READ_ONCE(lv->marie_mlv);
+	if (likely(mlv))
+		return mlv;
+
+#ifdef CONFIG_MEMCG
+	{
+		struct mem_cgroup *memcg = lruvec_memcg(lv);
+
+		if (memcg && css_is_dying(&memcg->css))
+			return NULL;
+	}
+#endif
+
+	/* GFP_KERNEL only from task context with IRQs enabled, GFP_ATOMIC
+	 * otherwise.  The first hit for any given lruvec pays this cost;
+	 * subsequent reads hit the cached pointer above. */
+	gfp = (in_task() && !irqs_disabled()) ? GFP_KERNEL : GFP_ATOMIC;
+	mlv = marie_alloc_lruvec(lv, gfp);
+	if (!mlv)
+		return NULL;
+
+	/*
+	 * Publish authoritatively into lv->marie_mlv -- the single source
+	 * of truth for this lruvec's Marie state (no side xarray). cmpxchg
+	 * resolves the lazy-alloc race: the loser frees its allocation and
+	 * adopts the winner's. The pointer lives until the lruvec's memcg is
+	 * freed (marie_drop_lruvec from mem_cgroup_free), so any caller
+	 * holding a valid lruvec sees a live mlv without RCU.
+	 */
+	old = cmpxchg((struct marie_lruvec **)&lv->marie_mlv, NULL, mlv);
+	if (old) {
+		marie_free_lruvec(mlv);
+		return old;
+	}
+
+	atomic_long_inc(&marie_nr_lruvecs);
+	return mlv;
+}
+
+/* Forward declaration: drain implementation lives further down with
+ * marie_fill_one_lruvec near the change_state machinery. */
+static void marie_drain_one_lruvec(struct lruvec *lruvec,
+				   struct list_head *to_free);
+
+static void marie_drop_lruvec(struct lruvec *lv)
+{
+	struct marie_lruvec *mlv;
+	MARIE_DRAIN_DEFER(to_free);
+
+	/*
+	 * Atomic "drain + clear lv->marie_mlv" under @lv->lru_lock.
+	 *
+	 * The invariant: every TRACKED bit owned by @lv must be cleared
+	 * before the NULL store publishes "@lv has no mlv". Otherwise a
+	 * concurrent folio_put on a TRACKED folio could reach
+	 * lru_marie_del_folio, observe lv->marie_mlv == NULL, fall into
+	 * the cleanup branch and decrement counters on a torn-down mlv.
+	 *
+	 * Draining (wiping per-PFN state for every still-tracked folio
+	 * and handing folios back to legacy lruvec lists) under the same
+	 * lru_lock that brackets the NULL store closes the race: by the time
+	 * the NULL is visible, no folio under @lv carries MARIE_TRACKED.
+	 * lru_marie_add_folio holds lru_lock too, so it cannot install
+	 * new TRACKED folios during this critical section.
+	 *
+	 * Drain itself does not sleep -- it operates entirely on the
+	 * per-(type, gen, tier) bitmaps and counters -- so it is safe
+	 * inside spin_lock_irq. drop_lruvec is rare (toggle / memcg
+	 * teardown); the extra drain work paid here is not on any hot
+	 * path. spin_lock_irq already disables preemption and IRQs (the
+	 * former migrate_disable() only existed to give the now-removed
+	 * synchronize_rcu() a stable CPU context).
+	 */
+	scoped_guard(spinlock_irq, &lv->lru_lock) {
+		marie_drain_one_lruvec(lv, &to_free);
+		/*
+		 * Capture and clear the authoritative lv->marie_mlv under
+		 * lru_lock. A concurrent dispatcher (lru_marie_add_folio /
+		 * lru_marie_del_folio) holds the same lru_lock, and
+		 * marie_drain_one_lruvec above cleared MARIE_TRACKED on
+		 * every still-tracked folio, so once this lock is dropped
+		 * no path can route a del back into @mlv.
+		 */
+		mlv = READ_ONCE(lv->marie_mlv);
+		WRITE_ONCE(lv->marie_mlv, NULL);
+	}
+	/* to_free is auto-flushed at function return via __cleanup. */
+
+	if (!mlv)
+		return;
+	atomic_long_dec(&marie_nr_lruvecs);
+
+	/*
+	 * No synchronize_rcu() needed. With the side xarray gone there is no
+	 * ref-free RCU reader of @mlv: swappiness_changed enumerates via
+	 * mem_cgroup_iter (ref-pinned), the walker's pass-end housekeeping no
+	 * longer walks per-mlv, and its per-PTE lookups deref @mlv only for
+	 * folios charged to a live memcg. Every other accessor is serialised
+	 * by lv->lru_lock above. drop_lruvec runs at mem_cgroup_free
+	 * (refcount 0, no charges) or runtime disable, so @mlv is unreachable
+	 * by the time it is freed.
+	 */
+	marie_free_lruvec(mlv);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  generation lifecycle (helpers in mm/lru_marie/state.c)
+ * ---------------------------------------------------------------------
+ *
+ * Marie keeps a cycling per-type gen ring of MARIE_PFN_NR_GENS (= 4)
+ * slots, encoded directly in the per-PFN state byte. Aging is driven
+ * by three signals:
+ *
+ *   - lru_marie_add_folio always lands on the current head gen
+ *     (atomic_read(&marie_head_gen[type])).
+ *   - marie_state_isolate_scan_l2lock always pulls from the oldest
+ *     occupied gen (marie_find_oldest_occupied).
+ *   - shrink_folio_list classifies each isolated folio:
+ *       FOLIOREF_RECLAIM  → freed (this folio truly was cold)
+ *       FOLIOREF_KEEP     → returned in folio_list, putback re-routes
+ *       FOLIOREF_ACTIVATE → ditto, with PG_active set
+ *     putback re-installs the survivor at (oldest+1)&3 with
+ *     target_tier = max(prev_tier, w_tier) via
+ *     marie_install_at_gen.
+ *   - head_gen advances per type via marie_try_advance_head, fired by
+ *     install cadence (marie_install_advance_hook checks
+ *     marie_gen_installs against marie_gen_growth_threshold) and by
+ *     the reclaim-driven trigger at shrink_lruvec entry (occupied gen
+ *     count < 2). Advance is drain-wait gated: the next slot must be
+ *     fully empty (marie_gen_occupied[next][type] == 0).
+ *
+ * The reclaim cycle alone does not surface every hot folio; a per-pgdat
+ * SIMD walker (mm/lru_marie/walker.c) clears young PTEs and bumps
+ * marie_state_inc_tier on tracked folios. When tier saturates, the
+ * walker calls marie_state_move_to_gen() synchronously to move the
+ * folio into the head gen at tier 0 (no pending queue). The rmap path
+ * (lru_marie_look_around) feeds the walker via a per-pgdat bloom
+ * filter so PMD scans concentrate on regions the rmap recently
+ * flagged hot.
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ *  folio add / del
+ * ---------------------------------------------------------------------
+ */
+
+/*
+ * lru_marie_add_folio: per-folio synchronous install.
+ *
+ * All folios are installed by marie_folio_install under the caller's
+ * lru_lock irqsave: per-PFN state byte (TRACKED + initial tier + type +
+ * zone + head_gen), tracking bitmaps, per-mlv / global counters
+ * (percpu_counter), and PG_lru are all published in one synchronous
+ * call. No per-CPU staging and no session-end flush hook -- every
+ * install is self-contained, so no carry-over state can leak across
+ * calls or across lruvecs.
+ *
+ * Skipped folio classes:
+ *
+ *   - Unevictable folios: struct folio overlays folio->lru with
+ *     folio->mlock_count via union. mm/mlock.c writes mlock_count
+ *     directly while the folio is "owned" by an lruvec but NOT on a
+ *     list. Marie keeps unevictable folios on the legacy path so
+ *     mlock_count stays addressable.
+ *
+ * THP folios are routed through the per-type lock at the dispatcher
+ * level so the install is ordered against concurrent operations on
+ * the THP's lifetime. The per-type lock is purely a caller concern;
+ * marie_folio_install's body is identical for both branches.
+ */
+bool lru_marie_add_folio(struct lruvec *lv, struct folio *folio, bool reclaiming)
+{
+	struct marie_lruvec *mlv;
+
+	lockdep_assert_held(&lv->lru_lock);
+	lockdep_assert_irqs_disabled();
+	WARN_ON_ONCE(in_hardirq());
+
+	if (!lru_marie_enabled())
+		return false;
+	if (folio_test_unevictable(folio))
+		return false;
+
+	mlv = marie_get_lruvec(lv);
+	if (!mlv)
+		return false;
+	if (unlikely(READ_ONCE(mlv->offline)))
+		return false;
+
+	/*
+	 * Large folios (THP) take the per-type lock on the way in so the
+	 * install is ordered against drain / reparent on the same type;
+	 * small folios run lock-free with only lru_lock. Both branches
+	 * route to marie_folio_install; the per-type lock is purely a
+	 * caller concern.
+	 */
+	if (folio_test_large(folio)) {
+		bool ok;
+		int type = folio_is_file_lru(folio);
+
+		scoped_guard(marie_type_lock, &mlv->types[type])
+			ok = marie_folio_install(folio, mlv);
+		return ok;
+	}
+
+	return marie_folio_install(folio, mlv);
+}
+EXPORT_SYMBOL_GPL(lru_marie_add_folio);
+
+/*
+ * Non-adopting legacy LRU add for an untracked orphan inside a del+add
+ * move_fn (swap.c: lru_activate / lru_deactivate{,_file} / lru_lazyfree) or
+ * the legacy reclaim putback (vmscan.c: move_folios_to_lru).
+ *
+ * Those paths run lruvec_del_folio() (legacy del, mz -nr for an untracked
+ * folio) and then add the folio back. Routing that add through
+ * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT the folio into
+ * Marie: the install credits Marie's own accounting, but the original -nr was
+ * a legacy debit, so mz->lru_zone_size drifts and a later legacy/Marie del
+ * underflows ("marie underflow-del" / mem_cgroup_update_lru_size lru_size -1).
+ * Do a pure legacy add (the +nr leg) instead.
+ *
+ * Callers MUST first bail on lru_marie_test_tracked() folios -- a tracked
+ * folio is Marie-owned and must never touch a legacy list. Shared by swap.c's
+ * move_fns and vmscan.c's putback; see the header doc in lru_marie.h.
+ */
+void lru_marie_orphan_add(struct lruvec *lruvec, struct folio *folio, bool tail)
+{
+	enum lru_list lru = folio_lru_list(folio);
+
+	update_lru_size(lruvec, lru, folio_zonenum(folio),
+			folio_nr_pages(folio));
+	if (tail)
+		list_add_tail(&folio->lru, &lruvec->lists[lru]);
+	else
+		list_add(&folio->lru, &lruvec->lists[lru]);
+}
+EXPORT_SYMBOL_GPL(lru_marie_orphan_add);
+
+/**
+ * lru_marie_split_folio - install a freshly-split tail folio under Marie.
+ * @lv:        head folio's lruvec (caller holds lru_lock)
+ * @head:      THP head folio currently RESIDENT in Marie
+ * @new_folio: tail folio created by __split_huge_page
+ *
+ * Mirrors mm/huge_memory.c::lru_add_split_folio's
+ * "list_add_tail(&new_folio->lru, &folio->lru)" for the Marie case so
+ * that @new_folio:
+ *
+ *   - inherits @head's tier 0 install at the current head_gen
+ *     (a freshly-split tail page has no independent hotness signal
+ *     yet -- subsequent walker passes promote it on young hits)
+ *   - has its TRACKED bit set in marie_state[pfn] so dispatcher del
+ *     routes through Marie (without TRACKED, dispatcher del would
+ *     fall through to legacy update_lru_size and bypass
+ *     mlv->types[].nr_pages bookkeeping)
+ *
+ * Accounting note: lru_size and mlv->types[].nr_pages are NOT
+ * incremented for @new_folio. The original head install +N covered the
+ * full pre-split compound, and each sub-folio's eventual del decrements
+ * by its own folio_nr_pages; the sum balances. marie_nr_folios IS
+ * incremented because it is a folio count, not a page count, and the
+ * post-split state has 1 + ntails folios where there was 1 before.
+ *
+ * Caller MUST hold @lv->lru_lock with IRQs off. A @head that is not
+ * Marie-tracked (folio_marie_test_tracked) gets the legacy
+ * list_add_tail next to @head instead. With an mlv present the helper
+ * takes the per-type lock via scoped_guard(marie_type_lock, ...).
+ *
+ * No-op (and returns) if @head is unevictable -- legacy
+ * lru_add_split_folio handles that branch separately, before calling
+ * here.
+ */
+void lru_marie_split_folio(struct lruvec *lv, struct folio *head,
+			 struct folio *new_folio)
+{
+	struct marie_lruvec *mlv;
+
+	lockdep_assert_held(&lv->lru_lock);
+	lockdep_assert_irqs_disabled();
+	WARN_ON_ONCE(in_hardirq());
+
+	/*
+	 * Caller already checked lru_marie_enabled() via the static branch,
+	 * but @head may not be Marie-tracked (e.g. THP added to legacy LRU
+	 * because Marie alloc failed at the original add). Fall back to
+	 * plain list_add_tail in that case so @new_folio joins @head's
+	 * neighbour link on the legacy LRU list as it would have without
+	 * Marie.
+	 */
+	if (!folio_marie_test_tracked(head)) {
+		list_add_tail(&new_folio->lru, &head->lru);
+		return;
+	}
+
+	if (folio_test_unevictable(head))
+		return;
+
+	mlv = marie_get_lruvec(lv);
+	if (!mlv) {
+		/*
+		 * @head is Marie-tracked but marie_get_lruvec returned NULL --
+		 * its memcg is offlining (marie_drop_lruvec already NULLed
+		 * lv->marie_mlv) or a rare GFP_ATOMIC alloc failed.
+		 *
+		 * Do NOT fall back to a bare list_add_tail: that would leave
+		 * @new_folio !TRACKED while the caller stamps PG_lru on it, and
+		 * Marie's install NEVER credited mz->lru_zone_size (Marie folios
+		 * bypass it by design). The tail's eventual legacy lruvec_del
+		 * would then underflow mz->lru_zone_size, and the list_add_tail
+		 * onto @head's Marie self-loop would corrupt the list at del.
+		 * NOTE(review): lru_marie_del_folio's comment says install DOES credit mz.
+		 *
+		 * Instead publish the tail's TRACKED state (inheriting head's type/gen
+		 * at tier 0) so its del routes through lru_marie_del_folio's orphan
+		 * path, and keep folio->lru a self-loop (INIT_LIST_HEAD) like every
+		 * other Marie folio -- the orphan path asserts list_empty(&folio->lru).
+		 * Skip the per-type lock and the mlv counters (there is no mlv); the
+		 * per-PFN publish primitives are atomic/lock-free and the buddy-handoff
+		 * hook (marie_state_drop_pfn_at_free) clears the byte regardless.
+		 */
+		int type = folio_is_file_lru(head);
+
+		if (folio_test_active(new_folio))
+			folio_clear_active(new_folio);
+		INIT_LIST_HEAD(&new_folio->lru);
+		marie_pfn_publish_inherit(new_folio, type,
+					  (u8)atomic_read(&marie_head_gen[type]),
+					  0, folio_zonenum(new_folio));
+		marie_pc_add(&marie_nr_folios, 1);
+		return;
+	}
+
+	/* Marie's invariant: clear PG_active before publishing TRACKED. */
+	if (folio_test_active(new_folio))
+		folio_clear_active(new_folio);
+
+	/* Head and new_folio share the same Marie type (folio split does
+	 * not change LRU category), so head's per-type lock guards both. */
+	scoped_guard(marie_type_lock, &mlv->types[folio_is_file_lru(head)]) {
+		int type = folio_is_file_lru(head);
+		int zone = folio_zonenum(new_folio);
+		u8 head_gen = (u8)atomic_read(&marie_head_gen[type]);
+
+		/*
+		 * Inherit head's tier-0 install at the current head_gen
+		 * (a freshly-split tail page has no independent hotness
+		 * signal). marie_pfn_publish_inherit writes the state byte,
+		 * the (type, gen, tier) bitmap, the per-memcg L1 bitmap, and
+		 * gen_occupied++; it deliberately skips gen_installs because
+		 * the tail inherits the parent's install budget.
+		 *
+		 * folio->lru is initialised to a self-loop, exactly as
+		 * marie_folio_install() does for a fresh install: every Marie
+		 * folio is OFF the legacy lruvec lists and tracked purely by the
+		 * per-PFN state + bitmap. The old list_add_tail onto @head's link
+		 * instead built a multi-element ring from @head and all of its
+		 * split tails; the reclaim isolate path
+		 * (marie_evict_counters_only + list_add, state.c) and every other
+		 * self-loop-assuming site then corrupted that ring, abandoning
+		 * neighbours that still pointed at the moved folio -- folios
+		 * orphaned off mz accounting (the mz->lru_zone_size underflow) and
+		 * list_del corruption / use-after-free of a reused page (the
+		 * userspace SEGV). PG_lru is set by the caller (lru_add_split_folio)
+		 * after this returns, after the per-PFN state is published, so a
+		 * concurrent __page_cache_release observing PG_lru=1 also observes
+		 * marie_state[pfn] & MARIE_PFN_TRACKED.
+		 */
+		INIT_LIST_HEAD(&new_folio->lru);
+		marie_pfn_publish_inherit(new_folio, type, head_gen, 0, zone);
+		marie_pc_add(&marie_nr_folios, 1);
+	}
+}
+EXPORT_SYMBOL_GPL(lru_marie_split_folio);
+
+bool lru_marie_del_folio(struct lruvec *lv, struct folio *folio, bool reclaiming)
+{
+	struct marie_lruvec *mlv;
+
+	lockdep_assert_held(&lv->lru_lock);
+	lockdep_assert_irqs_disabled();
+	WARN_ON_ONCE(in_hardirq());
+
+	/*
+	 * TRACKED takes priority over the lru_marie_enabled() gate.  A
+	 * folio with TRACKED=1 may still be Marie-owned after the gate
+	 * flips false during a disable transition: marie_change_state
+	 * iterates lruvecs serially, and between the gate flip and the
+	 * per-lruvec drain_one_lruvec there is a window where
+	 * not-yet-drained lruvecs hold TRACKED folios.  If we bailed at
+	 * the gate here, the dispatcher (mm_inline.h::lruvec_del_folio)
+	 * would fall through to legacy lruvec_del_folio, which does
+	 * list_del(&folio->lru) on a folio that is on a Marie self-loop,
+	 * not on the lruvec list -- corrupting the folio. (mz->lru_zone_size
+	 * stays balanced either way now that marie_update_lru_size credits
+	 * mz at install, but the list must still be handled by Marie.)
+	 *
+	 * Trusting TRACKED is safe regardless of the gate: TRACKED is
+	 * only ever set under Marie's install helpers (marie_folio_install
+	 * on every install path, marie_pfn_publish_inherit on split's tail,
+	 * marie_state_publish_at_gen on the reclaim survivor putback, and
+	 * marie_fill_one_lruvec for the enable-time legacy sweep) and
+	 * only ever cleared by Marie's evict paths or by
+	 * marie_drain_pfn_locked.
+	 */
+	if (!folio_marie_test_tracked(folio))
+		return false;
+
+	/*
+	 * Direct lv->marie_mlv read (rather than marie_get_lruvec) so we
+	 * work even when the gate is off mid-transition. Race against
+	 * marie_drop_lruvec is closed by lru_lock serialisation:
+	 * marie_drop_lruvec clears lv->marie_mlv under the same lv->lru_lock
+	 * that our caller (lruvec_del_folio) holds.
+	 */
+	mlv = READ_ONCE(lv->marie_mlv);
+	if (!mlv) {
+		/*
+		 * @lv has no mlv but @folio still carries MARIE_TRACKED.
+		 *
+		 * Per-PFN paradigm: Marie's per-folio state lives entirely
+		 * in marie_state[pfn]; folio->lru is always a self-loop
+		 * (Marie never re-attaches folios onto Marie-owned lists).
+		 * marie_drop_lruvec runs marie_drain_one_lruvec (which
+		 * clears every TRACKED bit via marie_state_drop_pfn) and
+		 * clears lv->marie_mlv under the SAME lru_lock the dispatcher
+		 * holds, so by the time the read above returns NULL no folio
+		 * under that mlv remains TRACKED. Reaching here therefore
+		 * implies stale state from another mlv that is also gone, an
+		 * enable transient where alloc on this lv has not yet completed,
+		 * or a tail lru_marie_split_folio published without an mlv.
+		 *
+		 * Note: alloc-failure at add time (marie_alloc_lruvec
+		 * returning NULL) is NOT a path that reaches here — that
+		 * path makes lru_marie_add_folio return false and the folio
+		 * goes onto the legacy LRU with TRACKED=0, so the earlier
+		 * folio_marie_test_tracked check short-circuits before this.
+		 *
+		 * Wiping the per-PFN byte and returning true is safe because
+		 * folio->lru is a self-loop; no live list neighbour points
+		 * at @folio.
+		 */
+		VM_WARN_ON_ONCE_FOLIO(!list_empty(&folio->lru), folio);
+		/* Wipe per-PFN state so the orphan does not reappear in the bitmap walk. */
+		marie_state_drop_pfn(folio);
+		if (folio_test_active(folio))
+			folio_clear_active(folio);
+		return true;
+	}
+
+	/*
+	 * External-removal entry runs without acquiring the per-type
+	 * lock. The caller (lruvec_del_folio reaching here from
+	 * compaction / folio_put -> __page_cache_release) holds
+	 * lruvec->lru_lock, which serialises every other path that could
+	 * clear MARIE_TRACKED. The eviction's list_del_init is
+	 * unconditional (folio->lru is either a self-loop or on legacy
+	 * lruvec->lists[lru], whose mutation is already covered by the
+	 * caller's lru_lock).
+	 *
+	 * marie_del_folio_locked -> marie_evict_locked -> marie_account_evict
+	 * owns the full counter wind-down, including the single
+	 * marie_nr_folios -1. Do NOT decrement it again here (the old
+	 * caller-side -1 predated the account.h funnel and double-counted
+	 * every generic del of a tracked folio).
+	 */
+	return marie_del_folio_locked(mlv, folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_del_folio);
+
+/*
+ * Outer-level release entry called from __page_cache_release when the
+ * caller has determined that TRACKED is set. See the contract in
+ * <linux/lru_marie.h>.
+ *
+ * Why a TRACKED outer gate (rather than the legacy folio_test_lru
+ * gate) matters: a Marie-installed folio is on a self-loop
+ * (folio->lru points at itself), not on a legacy lruvec list. If the
+ * legacy gate let such a folio reach mm_inline.h::lruvec_del_folio,
+ * its list_del(&folio->lru) would operate on the self-loop instead of
+ * a real list and corrupt Marie's bookkeeping. With TRACKED as the
+ * outer gate, Marie folios are routed here, which unlinks the
+ * self-loop and debits mz->lru_zone_size (marie_update_lru_size is
+ * unified with legacy update_lru_size, so the +nr at install and the
+ * -nr here balance structurally).
+ */
+void lru_marie_release_folio(struct folio *folio, struct lruvec **lruvecp,
+			     unsigned long *flagsp)
+{
+	folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
+
+	lockdep_assert_held(&(*lruvecp)->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * lru_marie_del_folio re-tests TRACKED under the lock and handles
+	 * both the normal Marie folio case and the orphan case (mlv freed
+	 * but TRACKED still set, see its body). Returns true on Marie
+	 * ownership; false means TRACKED was cleared between our caller's
+	 * outer test and our lock acquisition (race with drain).
+	 */
+	if (lru_marie_del_folio(*lruvecp, folio, false)) {
+		__folio_clear_lru_flags(folio);
+		return;
+	}
+
+	/*
+	 * Drain raced us. The folio is now on a legacy lruvec list with mz
+	 * credited by the drain. Run the legacy del to keep PG_lru, the
+	 * list membership, and mz consistent.
+	 */
+	if (folio_test_clear_lru(folio))
+		lruvec_del_folio(*lruvecp, folio);
+	__folio_clear_lru_flags(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_release_folio);
+
+/*
+ * ---------------------------------------------------------------------
+ *  memcg lifecycle hook
+ * ---------------------------------------------------------------------
+ */
+
+void lru_marie_exit_memcg(struct mem_cgroup *memcg)
+{
+	int node;
+
+	might_sleep();
+
+	/*
+	 * Drop (and free) the Marie state of this memcg's lruvec on each node.
+	 */
+	for_each_node(node)
+		marie_drop_lruvec(mem_cgroup_lruvec(memcg, NODE_DATA(node)));
+
+	/* Release per-memcg bitmap if allocated (no-op when gate is off). */
+	marie_memcg_bitmap_free(memcg);
+}
+EXPORT_SYMBOL_GPL(lru_marie_exit_memcg);
+
+/*
+ * css_offline-time drain of one lruvec, run while rstat_cpu is still
+ * valid. mlv->offline is raised under lru_lock ahead of the drain so a
+ * concurrent lru_marie_add_folio (Patch 2) sees it and bails out; the
+ * legacy lists are then truly empty by the time css_free runs.
+ */
+static void marie_offline_lruvec(struct lruvec *lv)
+{
+	struct marie_lruvec *marie;
+	MARIE_DRAIN_DEFER(to_free);
+
+	scoped_guard(spinlock_irq, &lv->lru_lock) {
+		marie = READ_ONCE(lv->marie_mlv);
+		if (marie != NULL)
+			WRITE_ONCE(marie->offline, true);
+		marie_drain_one_lruvec(lv, &to_free);
+	}
+	/* __cleanup on to_free flushes it when this function returns. */
+}
+
+void lru_marie_offline_memcg(struct mem_cgroup *memcg)
+{
+	int node;
+
+	might_sleep();
+
+	/* With the gate off this hook does nothing. */
+	if (!lru_marie_enabled())
+		return;
+
+	/* Mark offline and drain this memcg's lruvec on every node. */
+	for_each_node(node)
+		marie_offline_lruvec(mem_cgroup_lruvec(memcg,
+						       NODE_DATA(node)));
+}
+EXPORT_SYMBOL_GPL(lru_marie_offline_memcg);
+
+/*
+ * NOTE: currently unused -- no in-tree caller. Marie's memcg offline path
+ * (lru_marie_offline_memcg) drains rather than reparents. Kept as the
+ * reparent counterpart for a future offline path. A future caller MUST
+ * hold both lruvecs' lru_lock (IRQs off) AND acquire them in a
+ * deterministic global order to avoid A-B/B-A between concurrent reparents;
+ * this function only takes @child_mlv's per-type locks (marie_both_mlv).
+ */
+void lru_marie_reparent_lruvec(struct lruvec *child_lv, struct lruvec *parent_lv)
+{
+	struct marie_lruvec *child_mlv, *parent_mlv;
+
+	lockdep_assert_held(&child_lv->lru_lock);
+	lockdep_assert_held(&parent_lv->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!lru_marie_enabled())
+		return;
+
+	child_mlv = READ_ONCE(child_lv->marie_mlv);
+	if (!child_mlv)
+		return;
+
+	/*
+	 * Best-effort parent_mlv materialisation. A caller is required to
+	 * hold both lruvecs' lru_lock with IRQs disabled, so marie_get_lruvec
+	 * falls back to GFP_ATOMIC. On allocation failure parent_mlv stays NULL and
+	 * reparent zeroes child's per-memcg bitmap without merging --
+	 * folios then rely on global tracking only (no per-memcg filter).
+	 */
+	parent_mlv = marie_get_lruvec(parent_lv);
+
+	/*
+	 * Reparent touches child_mlv's per-type counters; take both
+	 * per-type locks in canonical order via the marie_both_mlv guard.
+	 * Caller has IRQs disabled (objcg_lock pinned), matching the
+	 * guard's spin_lock_irqsave contract.
+	 *
+	 * Reparent is per-folio iteration free: it merges the per-memcg
+	 * L1/L2 bitmap from child into parent (L2-pruned word OR) and
+	 * transfers per-type / per-(lru, zone) percpu_counters. The
+	 * per-PFN state bytes are memcg-agnostic and stay in place, so
+	 * marie_nr_folios does not change here: nothing is returned to
+	 * the caller and there is no global counter adjustment.
+	 */
+	scoped_guard(marie_both_mlv, child_mlv)
+		marie_reparent_locked(child_mlv, parent_mlv);
+}
+EXPORT_SYMBOL_GPL(lru_marie_reparent_lruvec);
+
+/*
+ * Invoked from the vm.swappiness sysctl handler and memcg's
+ * memory.swappiness writer when a swappiness value has changed.
+ * Visits every memcg's per-node lruvec once and resets each swap_bias
+ * to zero so the proportional controller restarts from neutral under
+ * the new weight ratio. See lru_marie.h for the rationale.
+ *
+ * Resets unconditionally rather than filtering by memcg / cgroup
+ * version: extra resets on lruvecs whose effective swappiness did
+ * not actually change are harmless under a controller whose only
+ * state is the bias counter, and the filtering would add code
+ * without changing observable behaviour. Sysctl writes are
+ * human-rate so the cost of the full walk is negligible.
+ */
+void lru_marie_swappiness_changed(void)
+{
+	struct mem_cgroup *memcg;
+
+	might_sleep();
+
+	if (!lru_marie_enabled())
+		return;
+
+	/*
+	 * Reset every lruvec's swap_bias to neutral. Enumerate via
+	 * mem_cgroup_iter (which ref-pins each memcg across the step,
+	 * keeping its mlv alive) rather than a ref-free xarray walk -- a
+	 * human-rate sysctl path, so the per-memcg ref is irrelevant.
+	 *
+	 * Also covers the !memcg / mem_cgroup_disabled node lruvec:
+	 * mem_cgroup_iter then yields NULL, the body runs once with
+	 * memcg==NULL, and mem_cgroup_lruvec(NULL, pgdat) resolves to
+	 * pgdat->__lruvec. Best-effort -- a missed lruvec simply keeps its
+	 * bias and reconverges under the controller.
+	 */
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node_state(nid, N_MEMORY) {
+			struct lruvec *lv =
+				mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+			struct marie_lruvec *mlv = READ_ONCE(lv->marie_mlv);
+
+			if (mlv)
+				atomic64_set(&mlv->swap_bias, 0);
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+EXPORT_SYMBOL_GPL(lru_marie_swappiness_changed);
+
+/*
+ * ---------------------------------------------------------------------
+ *  enable / disable
+ * ---------------------------------------------------------------------
+ */
+
+/*
+ * Per-lruvec migration helpers used by marie_change_state(). Caller
+ * holds @lruvec->lru_lock with IRQs disabled, which orders correctly
+ * against the inner per-type locks taken by the scoped_guard
+ * (marie_both_mlv) below.
+ *
+ * marie_drain_one_lruvec(): walk every (type, gen, tier) bitmap, wipe
+ * the per-PFN state for each tracked folio under this lv, and hand the
+ * folio back to legacy lruvec->lists[lru] (or MGLRU's lrugen if MGLRU
+ * is the fallback). After return, the lruvec is purely legacy LRU /
+ * MGLRU.
+ *
+ * marie_fill_one_lruvec(): force-allocate @mlv for @lruvec and pull
+ * every evictable folio off @lruvec->lists[lru] into Marie via the
+ * canonical lruvec_del_folio + marie_folio_install pair. After
+ * return, the lruvec's standard lru lists hold only unevictable folios;
+ * everything else is tracked via marie_state[].
+ */
+
+/*
+ * Gate-free, non-allocating lookup of the lv->marie_mlv cache that
+ * marie_get_lruvec also uses; state-change runs with the gate mid-flip.
+ */
+static struct marie_lruvec *marie_lookup_lruvec(struct lruvec *lv)
+{
+	return READ_ONCE(lv->marie_mlv); /* authoritative; NULL: no state */
+}
+
+/* As marie_get_lruvec() but bypasses the lru_marie_enabled() gate. */
+static struct marie_lruvec *marie_force_alloc_lruvec(struct lruvec *lv,
+						 gfp_t gfp)
+{
+	struct marie_lruvec *fresh, *cur = READ_ONCE(lv->marie_mlv);
+
+	if (cur)
+		return cur;
+
+	fresh = marie_alloc_lruvec(lv, gfp);
+	if (!fresh)
+		return NULL;
+
+	/* Publish ours; if another allocator won, free ours and use theirs. */
+	cur = cmpxchg((struct marie_lruvec **)&lv->marie_mlv, NULL, fresh);
+	if (!cur) {
+		atomic_long_inc(&marie_nr_lruvecs);
+		return fresh;
+	}
+	marie_free_lruvec(fresh);
+	return cur;
+}
+
+/*
+ * Drain one tracked PFN found by a bitmap walk: wipe the per-PFN state
+ * artifacts and hand the folio back to legacy lruvec->lists[lru]
+ * (LRU_UNEVICTABLE included), updating counters. A folio whose last
+ * reference is our transient pin is queued on @to_free instead.
+ *
+ * folio->lru is a self-loop at this point (install/flush both leave it
+ * that way), so list_move just inserts into the legacy list head
+ * without disturbing any prior list. Caller-held lruvec->lru_lock
+ * serialises the legacy-list mutation.
+ *
+ * Caller must hold mlv's marie_both_mlv scope (both type_locks).
+ */
+static void marie_drain_pfn_locked(struct marie_lruvec *mlv,
+				   struct lruvec *lruvec,
+				   unsigned long pfn, int type,
+				   struct list_head *to_free)
+{
+	struct folio *folio;
+	long nr;
+	enum lru_list lru;
+	int z;
+	u8 state_byte;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!pfn_valid(pfn))
+		return;
+	folio = pfn_folio(pfn);
+	if (!folio || folio_pfn(folio) != pfn)
+		return;
+	/*
+	 * Re-validate against the per-PFN byte: any racing del path
+	 * would have cleared TRACKED already. Walking the bitmap without
+	 * a per-PFN lock means we may observe a stale bit whose backing
+	 * folio is gone; the byte is the source of truth for the
+	 * transition.
+	 */
+	state_byte = READ_ONCE(marie_state[pfn]);
+	if (!(state_byte & MARIE_PFN_TRACKED))
+		return;
+
+	/*
+	 * Pin + claim before touching folio->lru. The reclaim isolate
+	 * path (marie_evict_counters_only) runs lock-free -- it claims a
+	 * folio via folio_test_clear_lru WITHOUT lru_lock and can free a
+	 * TRACKED folio concurrently with this drain even though we hold
+	 * lruvec->lru_lock. If we list_move() such a folio after it has
+	 * been freed (folio->lru poisoned to LIST_POISON) the list op
+	 * dereferences the poison and oopses (NULL write at
+	 * marie_drain_pfn_locked).
+	 *
+	 *   1. folio_try_get: fails if the folio is already at refcount 0
+	 *      (mid-free). The page-free hook clears the per-PFN state, so
+	 *      nothing to do here.
+	 *   2. folio_test_clear_lru: the same atomic claim the isolate
+	 *      path uses. If PG_lru is already clear, an in-flight isolate
+	 *      owns the folio (its folio->lru is on a private reclaim list,
+	 *      not Marie's self-loop); skip and let that path + the free
+	 *      hook clean up. If we win the claim, the folio is ours: its
+	 *      folio->lru is a Marie self-loop, safe to list_move onto the
+	 *      legacy list, and we re-publish PG_lru afterwards.
+	 */
+	if (!folio_try_get(folio))
+		return;
+	if (!folio_test_clear_lru(folio)) {
+		folio_put(folio);
+		return;
+	}
+	/* Re-check TRACKED under our exclusive claim. */
+	if (!(READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED)) {
+		/*
+		 * Same last-reference hazard as the tail below: under the
+		 * css_offline drain our pin can be the last one. Defer the free
+		 * to @to_free (released by the caller after lru_lock is dropped);
+		 * freeing here would re-enter __page_cache_release under the held
+		 * lru_lock. The folio is a Marie self-loop here (never
+		 * list_move'd), so no list/counter unwind is needed.
+		 */
+		if (folio_put_testzero(folio)) {
+			folio_clear_active(folio);
+			list_add(&folio->lru, to_free);
+		} else {
+			folio_set_lru(folio);
+		}
+		return;
+	}
+
+	nr = folio_nr_pages(folio);
+	z = folio_zonenum(folio);
+
+	/*
+	 * Wipe per-PFN state directly via marie_state_drop_pfn
+	 * (byte + bitmap + counters + l2_range_count + memcg L1).
+	 */
+	marie_state_drop_pfn(folio);
+
+	/*
+	 * Mirror Marie's install invariant: PG_active=0 before computing
+	 * the lru index, so the -nr on marie_lru_zone_size lands on the
+	 * same INACTIVE bucket Marie's install +nr'd.
+	 */
+	if (folio_test_active(folio))
+		folio_clear_active(folio);
+	lru = folio_lru_list(folio);
+
+	/*
+	 * Hand the folio to its legacy lruvec list. This is mz-NEUTRAL:
+	 * Marie already credited mz->lru_zone_size at install
+	 * (marie_update_lru_size is unified with legacy update_lru_size),
+	 * so the folio is counted in mz the whole time -- the drain only
+	 * moves it from its Marie self-loop onto the real lruvec list. Do
+	 * NOT re-credit mz here; that would double-count. PG_lru is
+	 * re-published below, and the PG_lru=1 invariant is "on a real
+	 * lruvec list AND counted in mz->lru_zone_size".
+	 *
+	 * This MUST cover LRU_UNEVICTABLE the same way -- generic
+	 * lruvec_add_folio places unevictable folios on
+	 * lists[LRU_UNEVICTABLE]. An earlier list_del_init()+skip
+	 * special-case orphaned a re-published-PG_lru folio off every list;
+	 * when exit_mmap later munlocked and freed it, the generic
+	 * lruvec_del_folio walked a folio that was no longer on any list ->
+	 * corrupted-rmap Oops in shrink_folio_list. Keep every drained folio
+	 * on a real list. Only reachable via marie_drop_lruvec (memcg
+	 * offline), so swap tests without cgroup churn never exercised it.
+	 */
+	list_move(&folio->lru, &lruvec->lists[lru]);
+
+	marie_pc_add(&mlv->types[type].nr_pages, -nr);
+	marie_pc_add(&marie_nr_folios, -1);
+	/* Marie's internal per-bucket tally drops; mz keeps the +nr. */
+	marie_pc_add(&mlv->marie_lru_zone_size[lru][z], -nr);
+
+	/*
+	 * Drop our transient pin and re-publish on the legacy LRU.
+	 *
+	 * folio_put_testzero, not a plain folio_put: the css_offline drain
+	 * (marie_offline_lruvec) runs while exit_mmap is concurrently freeing
+	 * the dying memcg's folios, so our pin can be the LAST reference. A
+	 * plain folio_put would then enter __page_cache_release, which -- with
+	 * the PG_lru we set below -- re-acquires lruvec->lru_lock, the very
+	 * lock the caller already holds: a recursive self-deadlock (IRQs off
+	 * -> hard lockup). The change-state drain never hit this because its
+	 * folios are still mapped (pin never reaches zero).
+	 *
+	 * When the put frees the folio: leave PG_lru clear, undo the
+	 * list_move, and debit mz->lru_zone_size. The folio is leaving the
+	 * LRU for the buddy allocator, so this -nr is the del-debit that
+	 * settles the +nr Marie credited at install (no legacy
+	 * lruvec_del_folio will run for it -- PG_lru stays clear and the
+	 * free goes straight to @to_free). DEFER the actual free to
+	 * @to_free: the caller calls marie_drain_release() after dropping
+	 * lru_lock, so __folio_put() (mem_cgroup_uncharge, deferred-split
+	 * unqueue, buddy free) never runs under lru_lock -- matching the
+	 * release_pages() / survivor-putback discipline.
+	 */
+	if (folio_put_testzero(folio)) {
+		list_del(&folio->lru);
+#ifdef CONFIG_MEMCG
+		mem_cgroup_update_lru_size(lruvec, lru, z, -nr);
+#endif
+		folio_clear_active(folio);
+		list_add(&folio->lru, to_free);
+	} else {
+		folio_set_lru(folio);
+	}
+}
+
+/*
+ * Walk marie_track_bm[type][gen][tier] AND (when memcg-targeted) the
+ * per-memcg L1, restricted to @lruvec's pgdat PFN range (clamped to
+ * marie_state_size). Each set bit goes to marie_drain_pfn_locked.
+ *
+ * Outer loop is L2-pruned via the per-(type, gen, tier) L2 bitmap;
+ * the L2 bit is refcount-maintained against marie_l2_cell_count so
+ * an L2 bit set implies at least one L1 bit set in the same cell.
+ * Inner L1 iteration uses a local word copy (__ffs/blsr extraction),
+ * so the marie_state_drop_pfn calls that clear the global bitmap
+ * behind our back do not perturb forward progress.
+ *
+ * Root / NULL memcg walks unmasked. NOTE(review): so does a non-root
+ * memcg whose bitmap lookup returns NULL -- confirm that is intended.
+ * Caller holds mlv's marie_both_mlv scope (gate off or memcg teardown;
+ * concurrent install/del on this mlv's memcg-bound PFNs is quiescent).
+ */
+static void marie_drain_bitmap_walk_one(struct marie_lruvec *mlv,
+					struct lruvec *lruvec,
+					int type, int gen, int tier,
+					struct list_head *to_free)
+{
+	struct marie_bitmap *bm = &marie_track_bm[type][gen][tier];
+	unsigned long *l1 = bm->l1;
+	unsigned long *l2 = bm->l2;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	unsigned long *memcg_l1 = NULL, *memcg_l2 = NULL;
+	unsigned long start_pfn, end_pfn;
+	unsigned int start_l2, end_l2, l2_word, l2_word_end;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!l1 || !marie_state)
+		return;
+	start_pfn = pgdat->node_start_pfn;
+	end_pfn = pgdat_end_pfn(pgdat);
+	if (end_pfn > marie_state_size)
+		end_pfn = marie_state_size;
+	if (start_pfn >= end_pfn)
+		return;
+
+	if (memcg && !mem_cgroup_is_root(memcg)) {
+		memcg_l1 = marie_memcg_bitmap_get(memcg);
+		memcg_l2 = marie_memcg_bitmap_get_l2(memcg);
+	}
+
+	start_l2 = marie_pfn_to_l2_bit(start_pfn);
+	end_l2 = marie_pfn_to_l2_bit(end_pfn - 1) + 1;
+	if (end_l2 > MARIE_L2_BITS)
+		end_l2 = MARIE_L2_BITS;
+	l2_word = start_l2 / BITS_PER_LONG;
+	l2_word_end = DIV_ROUND_UP(end_l2, BITS_PER_LONG);
+
+	for (; l2_word < l2_word_end; l2_word++) {
+		unsigned long l2w = l2[l2_word];
+
+		if (memcg_l2)
+			l2w &= memcg_l2[l2_word];
+		if (l2_word == start_l2 / BITS_PER_LONG &&
+		    (start_l2 % BITS_PER_LONG))
+			l2w &= ~((1UL << (start_l2 % BITS_PER_LONG)) - 1);
+		if (l2_word + 1 == l2_word_end &&
+		    (end_l2 % BITS_PER_LONG))
+			l2w &= (1UL << (end_l2 % BITS_PER_LONG)) - 1;
+
+		while (l2w) {
+			unsigned int bit = l2_word * BITS_PER_LONG + __ffs(l2w);
+			unsigned long lo, hi;
+			unsigned long word_i, end_word;
+
+			l2w &= l2w - 1;
+
+			lo = marie_l2_bit_pfn_start(bit);
+			hi = marie_l2_bit_pfn_end(bit);
+			if (lo < start_pfn)
+				lo = start_pfn;
+			if (hi > end_pfn)
+				hi = end_pfn;
+
+			word_i = lo / BITS_PER_LONG;
+			end_word = BITS_TO_LONGS(hi);
+
+			for (; word_i < end_word; word_i++) {
+				unsigned long w = l1[word_i];
+
+				if (memcg_l1)
+					w &= memcg_l1[word_i];
+
+				if (word_i == lo / BITS_PER_LONG &&
+				    (lo % BITS_PER_LONG))
+					w &= ~((1UL << (lo % BITS_PER_LONG)) - 1);
+				if (word_i + 1 == end_word &&
+				    (hi % BITS_PER_LONG))
+					w &= (1UL << (hi % BITS_PER_LONG)) - 1;
+
+				while (w) {
+					unsigned int b = __ffs(w);
+					unsigned long pfn = word_i * BITS_PER_LONG + b;
+
+					w &= w - 1;
+					marie_drain_pfn_locked(mlv, lruvec,
+							       pfn, type,
+							       to_free);
+				}
+			}
+		}
+	}
+}
+
+static void marie_drain_one_lruvec(struct lruvec *lruvec,
+				   struct list_head *to_free)
+{
+	struct marie_lruvec *mlv = marie_lookup_lruvec(lruvec);
+	int t, g, tier;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (!mlv)
+		return;
+
+	scoped_guard(marie_both_mlv, mlv) {
+		/*
+		 * No pending-queue drain needed: install / evict / tier
+		 * saturate are all synchronous in the per-PFN paradigm
+		 * (marie_state_inc_tier handles saturation in-place via
+		 * marie_state_move_to_gen), so no out-of-band state
+		 * remains after the gate flip.
+		 */
+
+		/*
+		 * Iterate the global L1 bitmap per (type, gen, tier) AND'd
+		 * with the memcg L1 (when targeted), restricted to this
+		 * pgdat's PFN range. Each set bit is drained via
+		 * marie_drain_pfn_locked: state byte / bitmap / counters
+		 * are wiped and the folio is handed back to legacy
+		 * lruvec->lists[lru] under the caller's lru_lock.
+		 */
+		for (t = 0; t < ANON_AND_FILE; t++)
+			for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+				for (tier = 0; tier < MARIE_PFN_NR_TIERS; tier++)
+					marie_drain_bitmap_walk_one(mlv, lruvec,
+								    t, g, tier,
+								    to_free);
+	}
+
+	/*
+	 * Hand off legacy residue to MGLRU if MGLRU is the fallback.
+	 *
+	 * Marie's gate is off here (the caller flipped it before
+	 * invoking drain). The folios just deposited on
+	 * lruvec->lists[lru] need to migrate onto lrugen so MGLRU's
+	 * state_is_valid invariant ("lrugen enabled => legacy lists
+	 * empty") holds, and so MGLRU's own reclaim path can see and
+	 * evict them while Marie is dormant.
+	 *
+	 * lru_gen_fill_lruvec calls fill_evictable internally, which
+	 * uses lruvec_del_folio (skips Marie because the gate is off,
+	 * falls through to the plain legacy list_del) + lru_gen_add_folio
+	 * to land each folio on lrugen with correct accounting.
+	 *
+	 * Invoked OUTSIDE the marie_both_mlv scope above: the helper
+	 * does not need (and must not be entangled with) Marie's
+	 * per-type locks once the drain proper is complete.
+	 *
+	 * An mlv already marked offline (set at css_offline time, which
+	 * prevents all new installs) skips this hand-off entirely via the
+	 * !mlv->offline test below, so a css_free-time drain never reaches
+	 * lru_gen_fill_lruvec (and thus rstat_cpu) through this path.
+	 *
+	 * Use lru_gen_core_enabled(), not lru_gen_enabled(): this is the
+	 * ownership handoff and must see the raw MGLRU key. lru_gen_enabled()
+	 * is masked to false whenever Marie is enabled, but here Marie's gate
+	 * is already off (the caller flipped it) so the two agree -- we use
+	 * the raw key for intent and symmetry with marie_fill_one_lruvec.
+	 */
+	if (!READ_ONCE(mlv->offline) && lru_gen_core_enabled())
+		lru_gen_fill_lruvec(lruvec);
+}
+
+
+static void marie_fill_one_lruvec(struct lruvec *lruvec)
+{
+	struct marie_lruvec *mlv;
+	enum lru_list lru;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	mlv = marie_force_alloc_lruvec(lruvec, GFP_ATOMIC);
+	if (!mlv)
+		return;
+
+	/*
+	 * If MGLRU was the fallback while Marie was off, its lrugen
+	 * lists hold every folio added during that window. Drain them
+	 * via MGLRU's canonical lru_gen_del_folio + lruvec_add_folio
+	 * sequence: with Marie's gate already on at this point in
+	 * marie_change_state, lruvec_add_folio routes each folio
+	 * straight into lru_marie_add_folio, which sets per-PFN state
+	 * via marie_folio_install. No MGLRU internals are
+	 * touched on the Marie side; accounting drift is impossible
+	 * because the drain uses MGLRU's own helpers throughout.
+	 *
+	 * Use lru_gen_core_enabled(), not lru_gen_enabled(): Marie's gate is
+	 * already ON at this point, so lru_gen_enabled() is masked to false
+	 * and would skip this drain, stranding MGLRU-tagged folios on lrugen
+	 * (gen bits intact) -- exactly the residue this whole pass exists to
+	 * clear. The handoff must see the raw MGLRU key.
+	 */
+	if (lru_gen_core_enabled())
+		lru_gen_drain_lruvec(lruvec);
+
+	scoped_guard(marie_both_mlv, mlv) {
+		/*
+		 * Walk legacy lruvec->lists[lru]. By the time we reach
+		 * here, MGLRU lrugen contents have already been drained
+		 * by lru_gen_drain_lruvec just above (with Marie's
+		 * gate on, those folios route through lru_marie_add_folio
+		 * directly into Marie's normal install path, so the
+		 * per-PFN state is set correctly without any work here).
+		 *
+		 * What remains on the legacy lists is whatever plain LRU
+		 * held while Marie was off with MGLRU not the fallback,
+		 * including folios a previous marie_drain_one_lruvec
+		 * handed back during a disable transition. Import each
+		 * via lruvec_del_folio + marie_folio_install.
+		 */
+		for (lru = 0; lru < NR_LRU_LISTS; lru++) {
+			struct folio *f, *next_f;
+
+			if (lru == LRU_UNEVICTABLE)
+				continue;
+
+			list_for_each_entry_safe(f, next_f,
+						 &lruvec->lists[lru], lru) {
+				/*
+				 * Transfer ownership from the legacy LRU to
+				 * Marie via the canonical del/add pair.
+				 *
+				 * lruvec_del_folio: PG_active is still set
+				 * at this point so folio_lru_list() returns
+				 * the correct old bucket (ACTIVE or INACTIVE)
+				 * for the mz and vmstat debits; the matching
+				 * credit lands in INACTIVE after install
+				 * because marie_folio_install clears
+				 * PG_active before computing inst_lru.
+				 *
+				 * marie_folio_install: clears PG_active (via
+				 * set_mask_bits), sets PG_lru, writes the
+				 * per-PFN state byte and bitmaps, and credits
+				 * all Marie counters -- exactly the same path
+				 * the fault-install uses. The per-type lock
+				 * is already held by marie_both_mlv at this
+				 * scope; routing through lru_marie_add_folio
+				 * would deadlock by trying to re-acquire it.
+				 */
+				lruvec_del_folio(lruvec, f);
+				marie_folio_install(f, mlv);
+			}
+		}
+
+	}
+}
+
+/*
+ * Apply one enable/disable transition to a single lruvec. The fill or
+ * drain runs under lru_lock with IRQs off (both require it); on
+ * disable, marie_drop_lruvec then runs after the lock is released (it
+ * frees the mlv after dropping the lock).
+ *
+ * Safe on lruvecs that never carried an mlv: marie_drop_lruvec is
+ * idempotent (lv->marie_mlv reads NULL on a second visit).
+ */
+static void marie_change_state_lruvec(struct lruvec *lruvec, bool enable)
+{
+	MARIE_DRAIN_DEFER(to_free);
+
+	/*
+	 * No migrate_disable() here: spin_lock_irq already disables
+	 * preemption and IRQs, and the synchronize_rcu() that once needed
+	 * a stable CPU context is gone (marie_drop_lruvec carries the same
+	 * note).
+	 */
+	scoped_guard(spinlock_irq, &lruvec->lru_lock) {
+		if (!enable)
+			marie_drain_one_lruvec(lruvec, &to_free);
+		else
+			marie_fill_one_lruvec(lruvec);
+	}
+
+	/* Disable only: release the now-drained mlv outside lru_lock. */
+	if (!enable)
+		marie_drop_lruvec(lruvec);
+
+	cond_resched();
+	/* to_free is flushed on return by its __cleanup (no-op on enable). */
+}
+
+/*
+ * Flip lru_marie_enabled_key and migrate every memcg's per-node lruvec
+ * to match. Returns 0, or -EOPNOTSUPP when enable is refused.
+ */
+static int marie_change_state(bool enable)
+{
+	static DEFINE_MUTEX(state_mutex);
+	struct mem_cgroup *memcg;
+	int ret = 0;
+
+	/*
+	 * Refuse enable on boxes whose PFN space does not fit in 32 bits;
+	 * Marie's gen lists store folios as packed 32-bit PFN indices and
+	 * would corrupt them silently. Disable is always allowed.
+	 */
+	if (enable && unlikely(marie_pfn_unsupported))
+		return -EOPNOTSUPP;
+
+	cgroup_lock();
+	cpus_read_lock();
+	get_online_mems();
+	mutex_lock(&state_mutex);
+
+	if (enable == static_branch_likely(&lru_marie_enabled_key))
+		goto unlock;
+
+	/*
+	 * Gate is flipped BEFORE the per-lruvec walk in both directions.
+	 *
+	 * Disable: gate flips off first so no new TRACKED bits get set
+	 *   while we drain.  lru_marie_add_folio sees gate=off and bails;
+	 *   drain_one_lruvec catches in-flight adds via lru_lock
+	 *   serialisation.  Concurrent lru_marie_del_folio relies on the
+	 *   TRACKED-first check (not the gate) to route TRACKED folios
+	 *   through Marie on not-yet-drained lruvecs.
+	 *
+	 * Enable: gate flips on first so concurrent lru_marie_add_folio
+	 *   routes new faults into Marie's synchronous install path
+	 *   while we walk existing legacy LRU lists to install pre-fault
+	 *   folios. If the gate flipped after fill (the historical order),
+	 *   dels of already-filled folios (TRACKED=1) would race against
+	 *   gate=off and the dispatcher would underflow mz->lru_zone_size
+	 *   via a legacy update_lru_size that Marie already accounted for.
+	 */
+	if (enable)
+		static_branch_enable_cpuslocked(&lru_marie_enabled_key);
+	else
+		static_branch_disable_cpuslocked(&lru_marie_enabled_key);
+
+	/*
+	 * Disable path iterates non-root memcgs first, then root. The
+	 * global tracking bitmaps (marie_track_bm[type][gen][tier]) are
+	 * the union of every memcg's residency. The per-memcg L1/L2 mask
+	 * applied inside marie_drain_bitmap_walk_one narrows the walk to
+	 * the current memcg's PFNs -- but only for non-root memcgs.
+	 * root_mem_cgroup has NO per-memcg bitmap (marie_memcg_bitmap_set
+	 * short-circuits on root), so a root-lruvec drain runs with memcg_l1
+	 * = memcg_l2 = NULL and walks every set bit in the global bitmap.
+	 *
+	 * If root were drained first (the natural mem_cgroup_iter order),
+	 * the no-mask walk would scoop every tracked PFN -- including
+	 * those owned by non-root memcgs -- onto root's lruvec lists. Each
+	 * such folio's +nr lives in its true owner's mz->lru_zone_size
+	 * (credited at install), but it now sits on root's lruvec. The
+	 * subsequent lru_gen_fill_lruvec on root's lruvec would then issue
+	 * lruvec_del_folio against root's lruvec, underflowing root's
+	 * mz->lru_zone_size and leaving the owner over-counted; MGLRU's
+	 * reclaim under memory pressure livelocks on the corrupted counters.
+	 *
+	 * Draining non-root memcgs first incrementally clears their bits
+	 * from the global bitmap, so by the time root's no-mask pass runs
+	 * only PFNs that genuinely belong to root remain set. No per-PFN
+	 * folio_memcg() check is needed: the bitmap makes the filter implicit.
+	 *
+	 * Enable path is symmetric in iteration but does NOT depend on
+	 * the order: marie_fill_one_lruvec walks @lruvec's legacy / lrugen
+	 * lists, not the global bitmap, so cross-memcg confusion is
+	 * impossible. Kept as a single normal-order pass for simplicity.
+	 */
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		if (!enable && mem_cgroup_is_root(memcg))
+			continue;
+
+		for_each_node(nid)
+			marie_change_state_lruvec(mem_cgroup_lruvec(memcg,
+								    NODE_DATA(nid)),
+						  enable);
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	if (!enable) {
+		int nid;
+
+		for_each_node(nid)
+			marie_change_state_lruvec(mem_cgroup_lruvec(NULL,
+								    NODE_DATA(nid)),
+						  false);
+	}
+
+	pr_info("%s\n", enable ? "enabled" : "disabled");
+
+unlock:
+	mutex_unlock(&state_mutex);
+	put_online_mems();
+	cpus_read_unlock();
+	cgroup_unlock();
+	return ret;
+}
+
+/*
+ * Boot param lru_marie=0|1. At boot the cgroup tree is unpopulated and no
+ * lruvec carries folios, so a plain static-key toggle is sufficient.
+ */
+static int __init marie_setup(char *str)
+{
+	int on;
+
+	if (str == NULL || kstrtoint(str, 0, &on) != 0)
+		return 0;
+	if (on == 0)
+		static_branch_disable(&lru_marie_enabled_key);
+	else
+		static_branch_enable(&lru_marie_enabled_key);
+	return 1;
+}
+__setup("lru_marie=", marie_setup);
+
+
+/*
+ * Reclaim entry point for one lruvec: the per-PFN bitmap scan is the
+ * sole reclaim driver in Marie. The returned MARIE_DRAIN_* mask tells
+ * shrink_lruvec which orphan type(s) its legacy drain may reclaim
+ * (exactly the type(s) Marie scanned). A NULL @sc only warns, once.
+ */
+unsigned int lru_marie_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	WARN_ON_ONCE(!sc);
+	return marie_state_shrink_lruvec(lruvec, sc);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  /sys/kernel/mm/lru_marie/
+ * ---------------------------------------------------------------------
+ */
+
+static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr,
+			    char *buf)
+{
+	/* Report the live state of the static key as "0" or "1". */
+	return sysfs_emit(buf, "%d\n",
+			  !!static_branch_likely(&lru_marie_enabled_key));
+}
+
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	bool want;
+	int rc;
+
+	rc = kstrtobool(buf, &want);
+	if (!rc)
+		rc = marie_change_state(want);
+	if (rc)
+		return rc;
+	return count;
+}
+
+static struct kobj_attribute marie_enabled_attr = __ATTR_RW(enabled);
+
+/*
+ * /sys/kernel/mm/lru_marie/version (read-only)
+ *
+ * Emits MARIE_VERSION followed by a newline so userspace tooling
+ * (benchmark scripts, sysadmins, support pastes) can identify which
+ * Marie build is running without parsing dmesg.
+ */
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+			    char *buf)
+{
+	return sysfs_emit(buf, "%s\n", MARIE_VERSION);
+}
+
+static struct kobj_attribute marie_version_attr = __ATTR_RO(version);
+
+static ssize_t stats_show(struct kobject *kobj, struct kobj_attribute *attr,
+			  char *buf)
+{
+	long lruvecs = atomic_long_read(&marie_nr_lruvecs);
+	s64 folios = percpu_counter_sum(&marie_nr_folios);
+
+	return sysfs_emit(buf, "nr_lruvecs %ld\nnr_folios %lld\n",
+			  lruvecs, folios);
+}
+
+static struct kobj_attribute marie_stats_attr = __ATTR_RO(stats);
+
+/* clean_min_ratio sysfs knob: 0..100, percentage of node_present_pages. */
+static ssize_t clean_min_ratio_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	unsigned int ratio = READ_ONCE(marie_clean_min_ratio);
+
+	/* Single lockless snapshot of the current setting. */
+	return sysfs_emit(buf, "%u\n", ratio);
+}
+
+static ssize_t clean_min_ratio_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned int pct;
+	int rc = kstrtouint(buf, 10, &pct);
+
+	if (rc)
+		return rc;
+	if (pct > 100)	/* percentage knob: 0..100 only */
+		return -EINVAL;
+	WRITE_ONCE(marie_clean_min_ratio, pct);
+	return count;
+}
+
+static struct kobj_attribute marie_clean_min_ratio_attr =
+	__ATTR_RW(clean_min_ratio);
+
+#ifdef CONFIG_SWAP
+/*
+ * kcompmari sysfs knob: signed -100..+100, default +24.
+ *
+ *   0           — disabled. kcompmari_store short-circuits to false
+ *                 and swap_writeout falls straight through to inline
+ *                 zswap_store / __swap_writepage.
+ *   +1..+100    — Marie-gated. Queue length = |v|. The kfifo backing
+ *                 storage is sized at KCOMPMARI_FIFO_SIZE (the max);
+ *                 |v| is the soft depth at which the producer treats
+ *                 the queue as full and falls back to sync writeout.
+ *                 Tracks lru_marie_enabled() so disabling Marie at
+ *                 runtime also quiesces kcompmari without a second
+ *                 sysfs write.
+ *   -1..-100    — force mode. Queue length = |v|. Runs even when
+ *                 Marie is off, for users who want the async-compress
+ *                 helper independently of the Marie reclaim path.
+ *
+ * Default +24 mirrors the queue length kcompressd-unofficial proved
+ * sound under sustained anon pressure. Use -24 to force kcompmari on
+ * even with Marie off; use 0 to disable entirely.
+ *
+ * Encoded as two static branches (kcompmari_enabled_key and
+ * kcompmari_force_key declared in <linux/lru_marie.h>) so the hot path
+ * costs a single predicted jump in the common (enabled, Marie-gated) case.
+ */
+DEFINE_STATIC_KEY_TRUE(kcompmari_enabled_key);
+EXPORT_SYMBOL_GPL(kcompmari_enabled_key);
+DEFINE_STATIC_KEY_FALSE(kcompmari_force_key);
+EXPORT_SYMBOL_GPL(kcompmari_force_key);
+
+int vm_kcompmari = 24;
+EXPORT_SYMBOL_GPL(vm_kcompmari);
+
+static ssize_t kcompmari_show(struct kobject *kobj,
+			      struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", READ_ONCE(vm_kcompmari));
+}
+
+static ssize_t kcompmari_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(kcompmari_store_lock);	/* one writer at a time */
+	int v;
+	int err = kstrtoint(buf, 10, &v);
+
+	if (err)
+		return err;
+	if (v < -100 || v > 100)
+		return -EINVAL;
+	guard(mutex)(&kcompmari_store_lock);	/* value + both keys move as one */
+	WRITE_ONCE(vm_kcompmari, v);
+	if (v != 0)
+		static_branch_enable(&kcompmari_enabled_key);
+	else
+		static_branch_disable(&kcompmari_enabled_key);
+	if (v < 0)
+		static_branch_enable(&kcompmari_force_key);
+	else
+		static_branch_disable(&kcompmari_force_key);
+	return count;
+}
+
+static struct kobj_attribute marie_kcompmari_attr = __ATTR_RW(kcompmari);
+#endif /* CONFIG_SWAP */
+
+#ifdef CONFIG_X86
+/*
+ * SIMD walker kill-switch: /sys/kernel/mm/lru_marie/simd
+ *
+ * Default 1: walker uses the boot-detected SIMD wrapper (AVX-512F /
+ * AVX2 / SSE2). Writing 0 flips marie_simd_enabled_key so the walker
+ * falls through to a scalar pte_young loop in mm/lru_marie/simd_x86.c.
+ */
+static ssize_t simd_show(struct kobject *kobj,
+			 struct kobj_attribute *attr, char *buf)
+{
+	bool on = static_branch_likely(&marie_simd_enabled_key);
+
+	return sysfs_emit(buf, "%d\n", on);
+}
+
+static ssize_t simd_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	bool enable;
+	int ret = kstrtobool(buf, &enable);
+
+	if (ret)
+		return ret;
+	if (!enable)
+		static_branch_disable(&marie_simd_enabled_key);
+	else
+		static_branch_enable(&marie_simd_enabled_key);
+	return count;
+}
+
+static struct kobj_attribute marie_simd_attr = __ATTR_RW(simd);
+#endif /* CONFIG_X86 */
+
+/*
+ * ---------------------------------------------------------------------
+ *  Reclaim / walker tunables (runtime-adjustable via sysfs)
+ * ---------------------------------------------------------------------
+ *
+ * Each variable is read with READ_ONCE on its hot path. Reclaim-loop
+ * snapshots take the value at the top of each pass, so concurrent
+ * sysfs writes take effect on the next pass without locking.
+ */
+
+/*
+ * marie_clean_min_ratio — file-pagecache floor as a percentage of
+ * node_present_pages. marie_state_shrink_lruvec diverts file reclaim
+ * to anon when the node's NR_*_FILE total drops below this fraction,
+ * preserving a working set of clean cache for codepaths that depend
+ * on it (executable text, mapped data files, etc.) instead of
+ * letting unbounded anon pressure flush it. 0 disables the floor
+ * (legacy behaviour); 100 caps every file fault as protected.
+ * Range 0..100; default 10.
+ */
+unsigned int marie_clean_min_ratio = 10;
+
+/*
+ * marie_gen_growth_threshold — pages installed onto the head gen
+ * before marie_install_advance_hook triggers marie_try_advance_head.
+ * Default 8192 pages (= MARIE_ISOLATE_BATCH << 8, i.e. 32 MiB).
+ * Lower values produce finer gen granularity (more aging churn but
+ * tighter hot/cold separation); higher values coarsen the ring (less
+ * churn, broader gens).
+ *
+ * marie_install_advance_hook combines this static floor with a
+ * dynamic total_occupied / 8 leg so heavy workloads scale the
+ * trigger automatically.
+ */
+unsigned long marie_gen_growth_threshold = (unsigned long)SWAP_CLUSTER_MAX << 8;
+
+/*
+ * marie_walker_interval_* — adaptive walker pass deadline per pgdat,
+ * stored in jiffies. marie_walker_interval() picks one based on the
+ * zone's free-page state relative to its watermarks:
+ *
+ *   free < min      -> critical
+ *   free < low      -> low
+ *   free < high     -> normal
+ *   free >= high    -> idle
+ *
+ * Defaults mirror the original literal cadence (HZ/30, HZ/10, HZ/4,
+ * HZ — ~33 ms, 100 ms, 250 ms, 1 s on HZ=1000).  Hot-path readers pick
+ * up the value via READ_ONCE inside marie_walker_interval(); the sysfs
+ * helpers convert to and from ms for user friendliness.
+ */
+unsigned long marie_walker_interval_critical = HZ / 30;
+unsigned long marie_walker_interval_low      = HZ / 10;
+unsigned long marie_walker_interval_normal   = HZ / 4;
+unsigned long marie_walker_interval_idle     = HZ;
+
+static ssize_t gen_growth_threshold_show(struct kobject *kobj,
+					 struct kobj_attribute *attr, char *buf)
+{
+	unsigned long pages = READ_ONCE(marie_gen_growth_threshold);
+
+	return sysfs_emit(buf, "%lu\n", pages);
+}
+
+static ssize_t gen_growth_threshold_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	unsigned long pages;
+	int ret = kstrtoul(buf, 10, &pages);
+
+	if (ret)
+		return ret;
+	/* Lower bound: one isolate batch; upper bound: 2^28 pages. */
+	if (pages > (1UL << 28) || pages < SWAP_CLUSTER_MAX)
+		return -EINVAL;
+	WRITE_ONCE(marie_gen_growth_threshold, pages);
+	return count;
+}
+
+static struct kobj_attribute marie_gen_growth_threshold_attr =
+	__ATTR_RW(gen_growth_threshold);
+
+/*
+ * Walker-interval knob factory: every stage uses the same show/store
+ * shape (ms in, jiffies stored, clamped to >= 1 jiffy).  Range is
+ * 1..60000 ms — anything shorter than a jiffy is meaningless on
+ * commodity HZ, anything longer than a minute defeats the adaptive
+ * gating.
+ */
+#define MARIE_WALKER_INTERVAL_KNOB(name, var)				\
+static ssize_t name##_show(struct kobject *kobj,			\
+			   struct kobj_attribute *attr, char *buf)	\
+{									\
+	/* Stored in jiffies, reported in milliseconds. */		\
+	return sysfs_emit(buf, "%u\n",					\
+			  jiffies_to_msecs(READ_ONCE(var)));		\
+}									\
+static ssize_t name##_store(struct kobject *kobj,			\
+			    struct kobj_attribute *attr,		\
+			    const char *buf, size_t count)		\
+{									\
+	unsigned int ms;						\
+	unsigned long j;						\
+	int ret = kstrtouint(buf, 10, &ms);				\
+									\
+	if (ret)							\
+		return ret;						\
+	if (ms < 1 || ms > 60000)					\
+		return -EINVAL;						\
+	/* Clamp so that at least one jiffy is stored. */		\
+	j = max(msecs_to_jiffies(ms), 1UL);				\
+	WRITE_ONCE(var, j);						\
+	return count;							\
+}									\
+static struct kobj_attribute marie_##name##_attr = __ATTR_RW(name)
+
+MARIE_WALKER_INTERVAL_KNOB(walker_interval_critical_ms,
+			   marie_walker_interval_critical);
+MARIE_WALKER_INTERVAL_KNOB(walker_interval_low_ms,
+			   marie_walker_interval_low);
+MARIE_WALKER_INTERVAL_KNOB(walker_interval_normal_ms,
+			   marie_walker_interval_normal);
+MARIE_WALKER_INTERVAL_KNOB(walker_interval_idle_ms,
+			   marie_walker_interval_idle);
+
+static struct attribute *marie_attrs[] = {
+	&marie_enabled_attr.attr,
+	&marie_version_attr.attr,
+	&marie_stats_attr.attr,
+	&marie_clean_min_ratio_attr.attr,
+#ifdef CONFIG_SWAP
+	&marie_kcompmari_attr.attr,
+#endif
+#ifdef CONFIG_X86
+	&marie_simd_attr.attr,
+#endif
+	&marie_gen_growth_threshold_attr.attr,
+	&marie_walker_interval_critical_ms_attr.attr,
+	&marie_walker_interval_low_ms_attr.attr,
+	&marie_walker_interval_normal_ms_attr.attr,
+	&marie_walker_interval_idle_ms_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group marie_attr_group = {
+	.attrs = marie_attrs,
+};
+
+/* Boot-time setup: gate checks, state/counter allocation, walker, sysfs. */
+static int __init marie_init(void)
+{
+	struct kobject *marie_kobj;
+	int err;
+
+	printk(KERN_INFO "%s %s by %s\n", MARIE_PROGNAME, MARIE_VERSION, MARIE_AUTHOR);
+
+	marie_prefetch_params_init();
+
+	/*
+	 * Latch the 32-bit PFN gate. max_pfn is established by setup_arch /
+	 * memblock init well before subsys_initcall, so this single read is
+	 * authoritative for the lifetime of the system. If the box overflows
+	 * the 32-bit PFN window we disable Marie up front, regardless of
+	 * lru_marie= boot param or the static-key default, and refuse later
+	 * sysfs enables in marie_change_state.
+	 */
+	if (max_pfn > MARIE_MAX_SUPPORTED_PFN) {
+		marie_pfn_unsupported = true;
+		static_branch_disable(&lru_marie_enabled_key);
+		pr_warn("disabled: max_pfn %lu exceeds 32-bit limit (%lu); Marie requires physical address space <= 16 TiB\n",
+			max_pfn, MARIE_MAX_SUPPORTED_PFN);
+	} else {
+		/*
+		 * Allocate the per-PFN state array now that the gate has
+		 * been verified. If this fails we cannot run, so disable
+		 * Marie and continue boot with the in-tree LRU paths.
+		 */
+		err = marie_state_init();
+		if (err) {
+			marie_pfn_unsupported = true;
+			static_branch_disable(&lru_marie_enabled_key);
+			pr_warn("disabled: marie_state_init failed (%d)\n", err);
+		}
+	}
+
+	/*
+	 * Initialise the global marie_nr_folios percpu_counter. If that
+	 * fails, switch Marie off like the failure paths above before
+	 * bailing out, rather than returning with the key still enabled.
+	 */
+	err = marie_counters_init();
+	if (err < 0) {
+		marie_pfn_unsupported = true;
+		static_branch_disable(&lru_marie_enabled_key);
+		return err;
+	}
+
+	marie_walker_init();
+
+	marie_kobj = kobject_create_and_add("lru_marie", mm_kobj);
+	if (!marie_kobj) {
+		pr_err("failed to create /sys/kernel/mm/lru_marie\n");
+		return -ENOMEM;
+	}
+
+	err = sysfs_create_group(marie_kobj, &marie_attr_group);
+	if (err) {
+		pr_err("failed to create /sys/kernel/mm/lru_marie attributes: %d\n", err);
+		kobject_put(marie_kobj);
+		return err;
+	}
+
+	pr_info("currently %s\n",
+		static_branch_likely(&lru_marie_enabled_key) ? "enabled" : "disabled");
+	return 0;
+}
+subsys_initcall(marie_init);
diff --git a/mm/lru_marie/drain_scope.h b/mm/lru_marie/drain_scope.h
new file mode 100644
index 0000000000..918d2ce914
--- /dev/null
+++ b/mm/lru_marie/drain_scope.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_DRAIN_SCOPE_H
+#define _MM_LRU_MARIE_DRAIN_SCOPE_H
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+
+/*
+ * Drain-path deferred-free scaffold.
+ *
+ * Background: marie_drain_pfn_locked drops the transient pin on every
+ * still-tracked folio while holding lv->lru_lock with IRQs off. If a
+ * pin is the last reference (the css_offline drain races exit_mmap),
+ * a plain folio_put inside the lock would recurse into
+ * __page_cache_release -> folio_lruvec_lock_irqsave on the same
+ * lru_lock and self-deadlock. e2eafb4c0 fixed this by collecting
+ * last-ref folios onto a deferred-free list and freeing them AFTER
+ * lru_lock is dropped.
+ *
+ * Every drain caller now needs the same three-step scaffold:
+ *
+ *   LIST_HEAD(to_free);
+ *   spin_lock_irq(&lv->lru_lock);
+ *   marie_drain_one_lruvec(lv, &to_free);
+ *   ...maybe other in-lock work...
+ *   spin_unlock_irq(&lv->lru_lock);
+ *   marie_drain_release(&to_free);
+ *
+ * Three sites repeat it (marie_drop_lruvec, marie_offline_lruvec,
+ * marie_change_state_lruvec). A fourth site that forgets the release
+ * leaks every last-ref folio it collected; a fifth that forgets the
+ * to_free list outright would deadlock again.
+ *
+ * Make the release impossible to forget: MARIE_DRAIN_DEFER declares a
+ * list_head with the GCC __cleanup attribute, which runs
+ * marie_drain_release at scope exit unconditionally. The lock half is
+ * the existing scoped_guard(spinlock_irq, ...). Caller code becomes:
+ *
+ *   MARIE_DRAIN_DEFER(to_free);
+ *   scoped_guard(spinlock_irq, &lv->lru_lock) {
+ *       marie_drain_one_lruvec(lv, &to_free);
+ *       ...
+ *   }
+ *   // lock released; to_free auto-flushed when its scope ends.
+ *
+ * Forgetting MARIE_DRAIN_DEFER is a compile error: marie_drain_one_lruvec
+ * still requires a struct list_head * argument, and there is nothing to
+ * pass.
+ */
+
+/*
+ * Release folios the drain found at refcount 0 -- the transient pin in
+ * marie_drain_pfn_locked was the last reference. Run after lru_lock has
+ * been dropped so __folio_put (mem_cgroup_uncharge,
+ * folio_unqueue_deferred_split's split_queue_lock, the buddy free)
+ * never runs under lru_lock. Mirrors release_pages()'s deferred free.
+ */
+static inline void marie_drain_release(struct list_head *to_free)
+{
+	while (!list_empty(to_free)) {
+		struct folio *victim = list_first_entry(to_free, struct folio, lru);
+
+		list_del(&victim->lru);		/* unlink before the free */
+		__folio_put(victim);
+	}
+}
+
+static inline void __marie_drain_release_cleanup(struct list_head *l)
+{
+	marie_drain_release(l);
+}
+
+/*
+ * Declare a deferred-free list in the current scope. The list is
+ * initialised empty; at scope exit, marie_drain_release runs on it
+ * unconditionally (empty -> no-op). Threading the list into
+ * marie_drain_one_lruvec / marie_drain_bitmap_walk_one is the caller's
+ * responsibility -- those helpers' lockdep_assert_held catches the
+ * "lock not actually held" mistake.
+ */
+#define MARIE_DRAIN_DEFER(name)						\
+	struct list_head name __cleanup(__marie_drain_release_cleanup) = \
+		LIST_HEAD_INIT(name)
+
+#endif /* _MM_LRU_MARIE_DRAIN_SCOPE_H */
diff --git a/mm/lru_marie/pfn_install.h b/mm/lru_marie/pfn_install.h
new file mode 100644
index 0000000000..77a4b09f68
--- /dev/null
+++ b/mm/lru_marie/pfn_install.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_PFN_INSTALL_H
+#define _MM_LRU_MARIE_PFN_INSTALL_H
+
+#include <linux/atomic.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+
+#include "bitmap.h"
+#include "state.h"
+
+/*
+ * Marie's "publish a PFN as TRACKED" primitive, factored out of the
+ * install/split paths.
+ *
+ * What it writes (the single source of truth for "Marie owns this PFN"):
+ *   - marie_state[pfn]: TRACKED | (gen) | (tier) | (type) | (zone)
+ *   - marie_track_bm[type][gen][tier]: scan bit for this PFN
+ *   - per-memcg L1 bitmap (folio_memcg(f))
+ *   - atomic_long_inc(&marie_gen_occupied[gen][type])
+ *
+ * What it deliberately does NOT touch:
+ *   - folio->flags (PG_active / PG_lru) -- the install path flips these
+ *     in one atomic mask write after publish; the split path's caller
+ *     sets PG_lru later.
+ *   - folio->lru list pointers -- INIT_LIST_HEAD vs list_add_tail differs
+ *     between install and split.
+ *   - mlv counters and vmstat lru_size -- accounted by the caller (or by
+ *     marie_folio_install for the fresh-install path).
+ *   - marie_gen_installs -- this is the install "throttle" counter that
+ *     drives gen advance; split intentionally does NOT bump it because
+ *     the split tail inherits its parent's install budget (the parent
+ *     was already counted at fault-install).
+ *
+ * Caller context: lru_lock held with IRQs off. The publish is a plain
+ * non-atomic byte write because lru_lock serialises every install on the
+ * same PFN, and the "already TRACKED" early-out in marie_folio_install
+ * catches concurrent re-install attempts.
+ */
+static inline void marie_pfn_publish_inherit(struct folio *f, int type,
+					     u8 gen, u8 tier, int zone)
+{
+	unsigned long pfn = folio_pfn(f);
+
+	marie_state[pfn] = MARIE_PFN_TRACKED |		/* per-PFN state byte */
+		(gen << MARIE_PFN_GEN_SHIFT) |
+		(tier << MARIE_PFN_TIER_SHIFT) |
+		(type ? MARIE_PFN_TYPE_FILE : 0) |	/* nonzero type => FILE bit */
+		marie_pfn_zone_bits(zone);
+	marie_bm_set(&marie_track_bm[type][gen][tier], pfn);	/* scan bit */
+	marie_memcg_bitmap_set(folio_memcg(f), pfn);	/* per-memcg bitmap */
+	atomic_long_inc(&marie_gen_occupied[gen][type]);	/* gen occupancy */
+}
+
+/*
+ * marie_folio_install - the unified fresh-install path.
+ *
+ * Single entry point that replaces the former marie_install_local /
+ * marie_install_locked pair. Both call sites (lru_marie_add_folio for THP
+ * via per-type lock + small folio direct, and marie_change_state_lruvec
+ * during gate-on fill) now route here. The per-type lock context that
+ * used to distinguish "locked" from "local" is the caller's concern, not
+ * this function's: the body only requires lru_lock + IRQs off and uses
+ * the same publish + flag flip + account sequence in both cases.
+ *
+ * Sequence:
+ *   1. TRACKED early-out (returns false). Defends against gate-flip race
+ *      and reclaim-survivor re-install (TRACKED is preserved across
+ *      isolate by design; marie_state_publish_at_gen handles the
+ *      survivor putback separately, never this function).
+ *   2. Capture (PG_active, PG_workingset) -> 2-bit tier signal.
+ *   3. Clear PG_active early; the final flag write is still a single
+ *      atomic set_mask_bits, but capturing was_active before the clear
+ *      keeps the tier value coherent with the byte we publish below.
+ *   4. INIT_LIST_HEAD(&f->lru) -- a recycled folio arrives with
+ *      LIST_POISON{1,2} that would later fault list_del_init.
+ *   5. Publish per-PFN state via marie_pfn_publish_inherit.
+ *   6. Bump marie_gen_installs and trigger the advance hook. Split path
+ *      skips this bump (publish_inherit only).
+ *   7. set_mask_bits(PG_active->0, PG_lru->1) -- one atomic flag write.
+ *      Ordered AFTER step 5 so a concurrent __page_cache_release
+ *      observing PG_lru=1 also observes TRACKED=1.
+ *   8. Account (mlv->types[type].nr_pages, marie_nr_folios, vmstat
+ *      lru_size, mlv->marie_lru_zone_size). Step 3 of the abstraction
+ *      plan will factor these into marie_account_install.
+ *
+ * Returns true on success, false on TRACKED early-out.
+ */
+bool marie_folio_install(struct folio *f, struct marie_lruvec *mlv);
+
+#endif /* _MM_LRU_MARIE_PFN_INSTALL_H */
diff --git a/mm/lru_marie/prefetch.h b/mm/lru_marie/prefetch.h
new file mode 100644
index 0000000000..7f9a2eeb3d
--- /dev/null
+++ b/mm/lru_marie/prefetch.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_PREFETCH_H
+#define _MM_LRU_MARIE_PREFETCH_H
+
+/*
+ * Two-stage software prefetch primitives used by Marie's per-PFN
+ * array scan. The bitmap-driven isolate loop issues
+ *
+ *   marie_prefetch_l3(target_N_ahead);   // pull from DRAM into L3
+ *   marie_prefetch_l1(target_K_ahead);   // pull from L3 into L1
+ *
+ * where N (~marie_l3_ahead) is sized to cover DRAM round-trip
+ * (~200 cycles) and K (~marie_l1_ahead) is sized to cover the L3->
+ * L1 round-trip (~30 cycles). Splitting lets the AGU fire the long-
+ * haul prefetch as early as the bitmap walk can predict the next
+ * candidate PFN, without keeping the L1 occupied with all the
+ * pending lines at once.
+ *
+ * The kernel's generic prefetch() expands to PREFETCHNTA on x86
+ * (L1 with bypass-LRU semantics), which is wrong for the L3-ahead
+ * leg — NTA evicts quickly from L1 and never settles in L3, so by
+ * the time the target should be in L3 it is gone. We therefore
+ * drop to the bare instructions:
+ *
+ *   prefetcht0 -- T0 hint, fetched into all cache levels (L1+L2+L3)
+ *   prefetcht2 -- T2 hint, fetched into L2/L3 but not L1
+ *
+ * Non-x86 builds get no-op stubs; the scan still works, just
+ * without the prefetch acceleration (HW prefetcher alone).
+ */
+
+#ifdef CONFIG_X86
+static __always_inline void marie_prefetch_l1(const void *addr)
+{
+	asm volatile("prefetcht0 %0" :: "m" (*(const char *)addr));	/* T0 hint */
+}
+
+static __always_inline void marie_prefetch_l3(const void *addr)
+{
+	asm volatile("prefetcht2 %0" :: "m" (*(const char *)addr));	/* T2 hint */
+}
+#else
+static __always_inline void marie_prefetch_l1(const void *addr) { (void)addr; }	/* no-op stub */
+static __always_inline void marie_prefetch_l3(const void *addr) { (void)addr; }	/* no-op stub */
+#endif
+
+/*
+ * Ahead distances for the two-stage prefetch ring. Values are set at
+ * boot by marie_prefetch_params_init() based on CPUID and stored in
+ * the file-static variables in state.c. MARIE_L3_AHEAD_MAX is the
+ * compile-time upper bound used to size the on-stack ring[] array;
+ * the runtime value (marie_l3_ahead) may be smaller on MSHR-limited
+ * microarchitectures.
+ *
+ * prefetcht2 requests are tracked by L2/L3 MSHRs (independent of L1
+ * LFBs); prefetcht0 requests are tracked by L1 LFBs. Tiers chosen by
+ * marie_prefetch_params_init():
+ *
+ *   AVX-512F (Zen 4/5, Sapphire Rapids): L2 MSHR ~32 → l3=32, l1=8
+ *   AMD Zen 3 (fam 0x19):               L2 MSHR ~24 → l3=24, l1=8
+ *   AMD Zen 1/2 (fam 0x17):             L2 MSHR ~20 → l3=20, l1=8
+ *   AMD Excavator (fam 0x15):           L2 MSHR ~12 → l3=16, l1=6
+ *   Intel Skylake+ (CLFLUSHOPT):        L2 MSHR ~24 → l3=24, l1=8
+ *   Intel Haswell/Broadwell:            L2 MSHR ~16 → l3=16, l1=6
+ *   x86_64-v2 or below / non-x86:      L2 MSHR  ~8 → l3= 8, l1=2
+ *
+ * marie_l3_mask = marie_l3_ahead - 1, used as a bitwise-AND modulo in the
+ * hot path. NOTE(review): that needs powers of 2; 24 and 20 above are not.
+ *
+ * These can be promoted to sysfs tunables in a later commit if
+ * profiling shows different sweet spots per workload.
+ */
+#define MARIE_L3_AHEAD_MAX	32	/* on-stack ring[] sizing upper bound */
+
+void marie_prefetch_params_init(void);
+
+/*
+ * Cache-line cursor look-ahead for marie_state[] (1 byte per PFN, 64 PFN
+ * per cache line). Unlike the per-PFN struct page prefetch (where 1 PFN
+ * = 1 cache line already gives ring-depth look-ahead), state[] is dense
+ * — without an explicit cursor, the producer issues up to 64 prefetches
+ * for the same cache line and gains zero look-ahead in cache-line space.
+ *
+ * Sized for the sparse-bitmap fast-skip case. On OOO x86 (~5 cycles/PFN,
+ * DRAM ~200 cycles) we need ≥ 40 PFN on top of the runtime ring lag
+ * (up to 32); on MSHR-limited in-order x86 (~20 cy/PFN, ring lag 8)
+ * ~18 PFN suffices. 512 PFN (8 cache lines) covers all tiers and also
+ * absorbs bitmap-density jumps within an L2 range.
+ *
+ * L1 distance is the L3→L1 analogue: shorter latency target, smaller
+ * margin since L1d evicts aggressively.
+ */
+#define MARIE_STATE_L3_AHEAD_PFN	512
+#define MARIE_STATE_L1_AHEAD_PFN	64
+
+/*
+ * Cache-line cursor look-ahead for the bitmap arrays (l1[], mbm[]) used
+ * by the isolate producer. The arrays are u64 (8 words per cache line,
+ * each word covering 64 PFN). The producer reads one word per "word_rem
+ * exhausted" event; in the sparse-bitmap worst case (1 bit per word) the
+ * word transition rate hits ~5-30 cycles per consumer iter, so the next
+ * cache line must be on the way well before the cursor crosses it.
+ *
+ * 16 words = 2 cache lines ahead gives margin for the sparse case while
+ * keeping the prefetch budget modest. Only L3 hint is needed — once a
+ * bitmap cache line lands in L3, the L3->L1 promote (~30-40 cycles) is
+ * easily hidden by the per-word consumer drain (64 PFN × 5+ cycles).
+ */
+#define MARIE_BM_L3_AHEAD_WORDS		16
+
+#endif /* _MM_LRU_MARIE_PREFETCH_H */
diff --git a/mm/lru_marie/simd.h b/mm/lru_marie/simd.h
new file mode 100644
index 0000000000..146fe5fd3d
--- /dev/null
+++ b/mm/lru_marie/simd.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_SIMD_H
+#define _MM_LRU_MARIE_SIMD_H
+
+/*
+ * Marie SIMD-accelerated PTE scan.
+ *
+ * Two call modes:
+ *
+ *   Batched (walker):
+ *      lru_marie_simd_batch_begin();
+ *      for_each(pmd in batch)
+ *          lru_marie_simd_young_pte_mask_raw(pte_table, bitmap);
+ *      lru_marie_simd_batch_end();
+ *
+ *   Single-shot (any non-batched caller):
+ *      lru_marie_simd_young_pte_mask(pte_table, bitmap);
+ *
+ * On x86 batch_begin/end map to kernel_fpu_begin/end so the per-call
+ * FPU save-restore is amortised across the batch. On every other arch
+ * (including arm64) the generic scalar fallback is used and
+ * batch_begin/end are no-ops.
+ */
+
+#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/jump_label.h>
+
+#ifdef CONFIG_X86
+/*
+ * Runtime kill-switch for the boot-detected SIMD walker, exposed via
+ * /sys/kernel/mm/lru_marie/simd. Default true: walker uses the widest
+ * SIMD kernel that arch_initcall could pick (AVX-512F > AVX2 > SSE2).
+ * Writing 0 to the sysfs file flips the static branch so the walker
+ * falls back to a pure scalar pte_young loop in the same translation
+ * unit.
+ *
+ * Other arches use the generic scalar fallback already, so the toggle
+ * does not need to exist there and the sysfs attribute is hidden.
+ */
+DECLARE_STATIC_KEY_TRUE(marie_simd_enabled_key);
+
+static inline bool marie_simd_enabled(void)
+{
+	return static_branch_likely(&marie_simd_enabled_key);
+}
+#else
+static inline bool marie_simd_enabled(void) { return false; }
+#endif
+
+/*
+ * Number of unsigned longs needed to hold the young-bit bitmap for one
+ * PMD's worth of PTEs. The count is hard-coded to 512 (PTRS_PER_PTE on
+ * x86_64) rather than taken from the arch's pgtable headers.
+ */
+#define MARIE_SIMD_PTE_BITMAP_LONGS	((512 + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+/**
+ * lru_marie_simd_batch_begin - open a SIMD batch (FPU bracket on x86).
+ *
+ * Holds preempt-disabled until the matching batch_end. The caller is
+ * responsible for keeping the bracketed region short -- batch a small
+ * fixed number of _raw scans then close the batch and let
+ * cond_resched() run before opening the next one.
+ *
+ * Calling _raw without an enclosing batch_begin is undefined on x86
+ * (FPU registers will be corrupted relative to userspace state); on
+ * scalar arches it is harmless because batch_begin/end are no-ops.
+ */
+void lru_marie_simd_batch_begin(void);
+
+/**
+ * lru_marie_simd_batch_end - close a SIMD batch (kernel_fpu_end on x86).
+ */
+void lru_marie_simd_batch_end(void);
+
+/**
+ * lru_marie_simd_young_pte_mask_raw - scan one PMD without opening a bracket.
+ * @table:  pointer to the first pte_t in the PMD's PTE array (512 entries)
+ * @bitmap: output, MARIE_SIMD_PTE_BITMAP_LONGS unsigned longs.
+ *
+ * Caller MUST hold an enclosing lru_marie_simd_batch_begin/end pair.
+ * Used in the per-PMD walker hot path to amortise the FPU save/restore
+ * cost across a batch of consecutive PMD scans.
+ */
+void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap);
+
+/**
+ * lru_marie_simd_young_pte_mask - single-shot scan (begin + raw + end).
+ * @table:  pointer to the first pte_t in the PMD's PTE array (512 entries)
+ * @bitmap: output, MARIE_SIMD_PTE_BITMAP_LONGS unsigned longs.
+ *
+ * Self-contained convenience wrapper for any non-batched caller. The
+ * walker uses the batched path directly; this wrapper exists for
+ * completeness and any future single-shot use.
+ */
+void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap);
+
+#endif /* _MM_LRU_MARIE_SIMD_H */
diff --git a/mm/lru_marie/simd_generic.c b/mm/lru_marie/simd_generic.c
new file mode 100644
index 0000000000..0045098bfa
--- /dev/null
+++ b/mm/lru_marie/simd_generic.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/lru_marie/simd_generic.c -- fallback PTE young-bit scan for arches
+ *                              without a SIMD path.
+ *
+ * Uses the arch-provided pte_young() helper so we don't need to know
+ * the accessed-bit name on every architecture.
+ *
+ * batch_begin / batch_end are no-ops here because the scan is scalar
+ * (no FPU state to preserve). Every arch other than x86 currently
+ * lands on this file (including arm64, where a future NEON variant
+ * could be slotted in once its FPSIMD save/restore cost has been
+ * profiled against the per-pmd gain).
+ */
+
+#include <linux/bitmap.h>
+#include <linux/mm.h>		/* pte_young */
+#include <asm/pgtable.h>
+
+#include "simd.h"
+
+#define PTES_PER_PMD	512
+
+void lru_marie_simd_batch_begin(void) { }	/* scalar scan: nothing to open */
+EXPORT_SYMBOL_GPL(lru_marie_simd_batch_begin);
+
+void lru_marie_simd_batch_end(void) { }	/* scalar scan: nothing to close */
+EXPORT_SYMBOL_GPL(lru_marie_simd_batch_end);
+
+void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap)
+{
+	const pte_t *pte = table;
+	int i;
+
+	/* @bitmap is output-only: assign all PTES_PER_PMD bits so stale bits
+	 * left in a reused buffer cannot survive as false "young" hits. */
+	for (i = 0; i < PTES_PER_PMD; i++)
+		__assign_bit(i, bitmap, pte_young(pte[i]));
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask_raw);
+
+void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap)
+{
+	lru_marie_simd_young_pte_mask_raw(table, bitmap);	/* batch_begin/end are empty here */
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask);
diff --git a/mm/lru_marie/simd_x86.c b/mm/lru_marie/simd_x86.c
new file mode 100644
index 0000000000..b119b6c265
--- /dev/null
+++ b/mm/lru_marie/simd_x86.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/lru_marie/simd_x86.c -- x86-64 PTE young-bit scan dispatch.
+ *
+ * Three SIMD .S kernels are linked in: simd_x86_{sse2,avx2,avx512}.S.
+ * arch_initcall picks the widest available at boot:
+ *   AVX-512F: 8 PTEs/iter via VPTESTMQ kmask
+ *   AVX2:     4 PTEs/iter via VPCMPEQQ
+ *   SSE2:     4 PTEs/iter via PSHUFD pack
+ *
+ * SSE2 is the floor -- x86-64 ABI-mandatory since 2003, always works,
+ * no cpu_has() check needed. It's the default initial value of the
+ * static call, so even if arch_initcall runs late the walker never
+ * falls back to the slower scalar path.
+ *
+ * FPU bracket lifecycle is caller-driven:
+ *   lru_marie_simd_batch_begin()   -> kernel_fpu_begin()
+ *   lru_marie_simd_young_pte_mask_raw() x N
+ *   lru_marie_simd_batch_end()     -> kernel_fpu_end()
+ *
+ * FPU is batched across N consecutive PMD scans (the walker picks N --
+ * see MARIE_FPU_BATCH in mm/lru_marie/walker.c) so the ~100 ns
+ * kernel_fpu_begin/end overhead is amortised across the batch. Widening
+ * the bracket to whole-pass scope is avoided: the bitmap iteration
+ * (folio_marie_inc_tier per young bit) runs inside the bracket, so a
+ * pass-wide bracket would extend the preempt-disabled window by the
+ * full iteration (~100 ms). Batching keeps the bitmap iteration inside
+ * the same PTL window; the preempt window grows by N SIMD scans
+ * (~100-300 ns each), not N full per-PMD bodies.
+ *
+ * The single-shot lru_marie_simd_young_pte_mask() is retained for any
+ * non-batched caller (currently none in-tree) and just wraps
+ * batch_begin / _raw / batch_end.
+ */
+
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <linux/printk.h>
+#include <linux/static_call.h>
+#include <linux/bitmap.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/pgtable.h>
+#include <asm/pgtable_types.h>
+
+#include "simd.h"
+
+#define PTES_PER_PMD	512
+
+/*
+ * Default true: scans use the boot-selected SIMD kernel. Flipped by
+ * writes to /sys/kernel/mm/lru_marie/simd; 0 selects the scalar loop
+ * below (benchmark A/B). NOTE(review): batch_begin, _raw and batch_end
+ * each test the key separately -- confirm no flip can land mid-bracket.
+ */
+DEFINE_STATIC_KEY_TRUE(marie_simd_enabled_key);
+EXPORT_SYMBOL_GPL(marie_simd_enabled_key);
+
+/* Defined in mm/lru_marie/simd_x86_{sse2,avx2,avx512}.S. Each one stores
+ * all 8 bitmap words. Caller must hold kernel_fpu_begin/end. */
+asmlinkage void lru_marie_simd_scan_sse2(const pte_t *pte_table,
+				   unsigned long *bitmap);
+asmlinkage void lru_marie_simd_scan_avx2(const pte_t *pte_table,
+				   unsigned long *bitmap);
+asmlinkage void lru_marie_simd_scan_avx512(const pte_t *pte_table,
+				     unsigned long *bitmap);
+
+/* ------------------------------------------------------------------ */
+/* Scalar fallback                                                    */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Reference scalar implementation. Used as the SIMD off-path when
+ * marie_simd_enabled_key is flipped via /sys/kernel/mm/lru_marie/simd
+ * for A/B-comparing the SIMD walker against a scalar pte_young loop
+ * without rebuilding the kernel. Also doubles as a correctness oracle
+ * for future SIMD bug fixes. No FPU state -- safe to call regardless
+ * of bracket state.
+ */
+static void marie_simd_scan_scalar(const pte_t *pte, unsigned long *bitmap)
+{
+	unsigned int bit = 0;
+
+	/* OR-only: bits already set in @bitmap are left as they are. */
+	for (; bit < PTES_PER_PMD; bit++, pte++)
+		if (pte_val(*pte) & _PAGE_ACCESSED)
+			__set_bit(bit, bitmap);
+}
+
+/* ------------------------------------------------------------------ */
+/* Boot-time dispatch                                                 */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Boot-patched direct call to the .S kernel. arch_initcall upgrades
+ * from the SSE2 default to AVX2 / AVX-512F if those feature bits are
+ * set. Each call site compiles to a single direct CALL instruction
+ * (text-patched at static_call_update time), avoiding the indirect-
+ * call retpoline tax in the per-PMD walker hot path.
+ *
+ * The static call points DIRECTLY at the .S kernel -- no FPU-bracket
+ * wrapper. Callers must hold an FPU bracket via
+ * lru_marie_simd_batch_begin/end.
+ */
+DEFINE_STATIC_CALL(marie_simd_scan, lru_marie_simd_scan_sse2);
+
+static int __init marie_simd_x86_init(void)
+{
+	const u64 ymm = XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+
+	/*
+	 * Widest first; SSE2 is already the static-call default. CPUID alone
+	 * is not enough: without the XSAVE state enabled, VEX/EVEX insns #UD.
+	 */
+	if (boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    cpu_has_xfeatures(ymm | XFEATURE_MASK_AVX512, NULL)) {
+		static_call_update(marie_simd_scan, lru_marie_simd_scan_avx512);
+		pr_info("SIMD PTE scan: AVX-512F (8 PTEs/iter)\n");
+	} else if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		   cpu_has_xfeatures(ymm, NULL)) {
+		static_call_update(marie_simd_scan, lru_marie_simd_scan_avx2);
+		pr_info("SIMD PTE scan: AVX2 (4 PTEs/iter)\n");
+	} else {
+		pr_info("SIMD PTE scan: SSE2 (4 PTEs/iter, x86-64 baseline)\n");
+	}
+	return 0;
+}
+/*
+ * arch_initcall fires before subsys_initcall (marie_init), so the
+ * static call is patched well before the walker first runs.
+ */
+arch_initcall(marie_simd_x86_init);
+
+/* ------------------------------------------------------------------ */
+/* Public API                                                         */
+/* ------------------------------------------------------------------ */
+
+void lru_marie_simd_batch_begin(void)
+{
+	if (static_branch_likely(&marie_simd_enabled_key))
+		kernel_fpu_begin();	/* scalar mode opens no FPU bracket */
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_batch_begin);
+
+void lru_marie_simd_batch_end(void)
+{
+	if (static_branch_likely(&marie_simd_enabled_key))
+		kernel_fpu_end();	/* key is re-tested, not remembered */
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_batch_end);
+
+void lru_marie_simd_young_pte_mask_raw(const void *table, unsigned long *bitmap)
+{
+	if (!static_branch_likely(&marie_simd_enabled_key))
+		marie_simd_scan_scalar(table, bitmap);	/* A/B off-path */
+	else
+		static_call(marie_simd_scan)(table, bitmap);
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask_raw);
+
+void lru_marie_simd_young_pte_mask(const void *table, unsigned long *bitmap)
+{
+	lru_marie_simd_batch_begin();	/* kernel_fpu_begin() if SIMD is on */
+	lru_marie_simd_young_pte_mask_raw(table, bitmap);
+	lru_marie_simd_batch_end();	/* kernel_fpu_end() if SIMD is on */
+}
+EXPORT_SYMBOL_GPL(lru_marie_simd_young_pte_mask);
diff --git a/mm/lru_marie/simd_x86_avx2.S b/mm/lru_marie/simd_x86_avx2.S
new file mode 100644
index 0000000000..95242d53b0
--- /dev/null
+++ b/mm/lru_marie/simd_x86_avx2.S
@@ -0,0 +1,214 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/lru_marie/simd_x86_avx2.S -- AVX2 PTE young-bit scan for Marie walker.
+ *
+ * Processes 512 PTEs (one full PMD page table) per call, producing an
+ * 8-ulong (512-bit) young-bit bitmap.  4 PTEs are handled per SIMD
+ * iteration via a single 256-bit YMM load; 16 iterations fill one
+ * output ulong; 8 ulongs total.
+ *
+ * Variant chosen: counter-based rolled loop (outer=8, inner=16).
+ * Reason: mirrors the SSE2 kernel structure for auditability; 128
+ * fully-unrolled SIMD blocks would exceed 3000 instructions with no
+ * measurable throughput gain on modern out-of-order cores (loop
+ * overhead is < 1% of SIMD work).
+ *
+ * Why AVX2 is cleaner than SSE2 here:
+ *   AVX2 has VPCMPEQQ (256-bit, 64-bit lane equality).  SSE2 only has
+ *   PCMPEQD (32-bit lanes), which required the PSHUFD low-dword packing
+ *   trick to avoid false positives from PFN bits landing in the high 32
+ *   bits of a PTE.  With VPCMPEQQ we compare the VPAND-masked 64-bit PTE
+ *   against the broadcast mask (_PAGE_ACCESSED = 1ULL << 5) -- no packing.
+ *
+ *   After VPMOVMSKB, eax holds 32 bits: one 8-bit octet per 64-bit PTE
+ *   lane (uniformly 0xFF or 0x00 because VPCMPEQQ produces all-1s or
+ *   all-0s per 64-bit lane).  Extracting bits:
+ *     testl $0x000000FF, eax  -> PTE 0 young
+ *     testl $0x0000FF00, eax  -> PTE 1 young
+ *     testl $0x00FF0000, eax  -> PTE 2 young
+ *     testl $0xFF000000, eax  -> PTE 3 young
+ *
+ * Reached via static_call(marie_simd_scan) in mm/lru_marie/simd_x86.c; the
+ * caller holds the lru_marie_simd_batch_begin/end (kernel_fpu_begin/end)
+ * bracket.  Do NOT call directly from C without an FPU context bracket.
+ *
+ * ABI (System V AMD64):
+ *   rdi = const pte_t *pte_table  (512 PTEs, 4096 bytes, page-table base;
+ *                                  page-aligned by the kernel page-table
+ *                                  allocator; vmovdqu is used so no stricter
+ *                                  SIMD alignment is required for the input)
+ *   rsi = unsigned long *bitmap   (8 ulongs = 64 bytes, caller pre-cleared)
+ *
+ * Register usage:
+ *   ymm0  = young-bit mask broadcast (0x20 x4, 64-bit lanes), constant
+ *   ymm1  = 4-PTE scratch register
+ *   rdi   = current PTE pointer (walks forward by 32 bytes per inner iter)
+ *   rsi   = bitmap base (constant)
+ *   rdx   = outer loop counter (8 -> 0)
+ *   rcx   = inner loop counter (16 -> 0)
+ *   rax   = vpmovmskb result / bit-extraction scratch
+ *   r8    = per-ulong accumulator
+ *   r9    = bit-position counter within current ulong (0..63)
+ *   r10   = single-bit scratch for OR-into-accumulator
+ *   r11   = bitmap write pointer (rsi + outer*8, updated each outer iter)
+ *
+ * AVX2 only (no AVX-512 zmm/kmask/opmask):
+ *   vmovdqa, vmovdqu, vzeroupper (AVX); 256-bit vpand, vpcmpeqq,
+ *   vpmovmskb (AVX2).  All VEX-encoded to avoid SSE/AVX transition
+ *   penalties.  VZEROUPPER before RET to clear upper YMM halves for
+ *   subsequent legacy SSE code.
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+	/* -------------------------------------------------------------- */
+	/* Read-only data: young-bit mask broadcast                        */
+	/* -------------------------------------------------------------- */
+	.section .rodata
+	.align 32
+marie_avx2_young_mask:
+	/* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to 4x 64-bit lanes */
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+
+	/* -------------------------------------------------------------- */
+	/* Text                                                            */
+	/* -------------------------------------------------------------- */
+	.text
+
+/*
+ * void lru_marie_simd_scan_avx2(const pte_t *pte_table, unsigned long *bitmap)
+ *
+ * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base (8-byte aligned)
+ * bitmap:    rsi -- 8 ulongs (64 bytes), caller pre-cleared
+ */
+SYM_TYPED_FUNC_START(lru_marie_simd_scan_avx2)
+
+	/* Load young-bit mask once; ymm0 is constant for the whole call. */
+	vmovdqa	marie_avx2_young_mask(%rip), %ymm0
+
+	movq	%rsi, %r11		/* r11 = current bitmap word ptr    */
+	movl	$8, %edx		/* outer counter: 8 ulongs          */
+
+	/*
+	 * Outer loop: one iteration per output ulong (64 PTEs each, 8 ulongs).
+	 */
+.Louter_avx2:
+	xorq	%r8, %r8		/* accumulator = 0                  */
+	xorq	%r9, %r9		/* bit-position = 0                 */
+	movl	$16, %ecx		/* inner counter: 16 SIMD iters     */
+
+	/*
+	 * Inner loop: one iteration processes 4 PTEs => 4 result bits.
+	 *
+	 * Each PTE is 8 bytes; 4 PTEs = 32 bytes = one 256-bit YMM load.
+	 * rdi advances by 32 bytes per inner iteration.
+	 */
+.Linner_avx2:
+	/* Load 4 PTEs into ymm1 (unaligned; caller guarantees 8-byte align). */
+	vmovdqu	(%rdi), %ymm1		/* ymm1 = { pte3 | pte2 | pte1 | pte0 } */
+	addq	$32, %rdi
+
+	/*
+	 * Isolate the young bit (bit 5 = 0x20) in each 64-bit PTE lane.
+	 * VPAND operates on the full 256-bit register.
+	 */
+	vpand	%ymm0, %ymm1, %ymm1	/* ymm1 &= 0x20 per 64-bit lane    */
+
+	/*
+	 * VPCMPEQQ: compare each 64-bit lane against ymm0 (the mask 0x20).
+	 * If the masked PTE equals 0x20 the young bit was set; that lane
+	 * becomes all-ones (0xFFFFFFFFFFFFFFFF), otherwise all-zeros.
+	 * No PSHUFD trick needed: 64-bit lanes sidestep the PFN false-
+	 * positive that afflicts 32-bit PCMPEQD-based comparisons.
+	 */
+	vpcmpeqq %ymm0, %ymm1, %ymm1	/* ymm1[i] = young_i ? ~0 : 0      */
+
+	/*
+	 * VPMOVMSKB: extracts the MSB of each byte from the 256-bit ymm1
+	 * into a 32-bit integer.  Because VPCMPEQQ produces uniform
+	 * all-ones or all-zeros per 64-bit lane, each 8-byte group in ymm1
+	 * is either 0xFFFFFFFFFFFFFFFF or 0x0000000000000000.  The
+	 * resulting eax octet layout:
+	 *   eax[7:0]   = 0xFF (young) or 0x00  -- PTE 0
+	 *   eax[15:8]  = 0xFF (young) or 0x00  -- PTE 1
+	 *   eax[23:16] = 0xFF (young) or 0x00  -- PTE 2
+	 *   eax[31:24] = 0xFF (young) or 0x00  -- PTE 3
+	 */
+	vpmovmskb %ymm1, %eax
+
+	/*
+	 * Extract one result bit per PTE and shift into the accumulator.
+	 *
+	 * We use the CL-shift idiom (baseline x86-64): save rcx (inner
+	 * loop counter) on the stack, use cl for the variable shift, then
+	 * restore.  For each of the 4 PTEs:
+	 *   1. test eax against the octet mask
+	 *   2. setnz -> 1-byte 0 or 1
+	 *   3. zero-extend to 64 bits
+	 *   4. shift left by current bit-position (r9) using CL
+	 *   5. OR into accumulator
+	 *   6. increment bit-position
+	 */
+
+	/* PTE 0: octet [7:0] */
+	pushq	%rcx			/* park inner counter: shlq needs CL */
+	movq	%r9, %rcx		/* cl = bit-position (0..63)         */
+	testl	$0x000000FF, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx			/* inner counter back                */
+	incq	%r9
+
+	/* PTE 1: octet [15:8] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0x0000FF00, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	/* PTE 2: octet [23:16] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0x00FF0000, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	/* PTE 3: octet [31:24] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0xFF000000, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	decl	%ecx
+	jnz	.Linner_avx2
+
+	/* Store the completed ulong; this overwrites the word (no OR). */
+	movq	%r8, (%r11)
+	addq	$8, %r11		/* advance bitmap write pointer     */
+
+	decl	%edx
+	jnz	.Louter_avx2
+
+	/* Clear upper YMM halves to avoid SSE/AVX transition penalties. */
+	vzeroupper
+	RET
+SYM_FUNC_END(lru_marie_simd_scan_avx2)
diff --git a/mm/lru_marie/simd_x86_avx512.S b/mm/lru_marie/simd_x86_avx512.S
new file mode 100644
index 0000000000..18d73dbd8b
--- /dev/null
+++ b/mm/lru_marie/simd_x86_avx512.S
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/lru_marie/simd_x86_avx512.S -- AVX-512F PTE young-bit scan for Marie walker.
+ *
+ * Processes 512 PTEs (one full PMD page table) per call, producing an
+ * 8-ulong (512-bit) young-bit bitmap.  8 PTEs are handled per SIMD
+ * iteration via a single 512-bit ZMM load and VPTESTMQ; 8 iterations
+ * fill one output ulong; 8 ulongs total.
+ *
+ * Variant chosen: counter-based rolled loop (outer=8, inner=8).
+ * Reason: mirrors the SSE2 and AVX2 kernel structure for auditability;
+ * a fully-unrolled variant of 64 SIMD blocks would be ~320 instructions
+ * but harder to review and provides no measurable throughput gain on
+ * modern out-of-order cores (loop overhead is < 1% of SIMD work).
+ *
+ * Why AVX-512F is the most efficient of the three .S kernels:
+ *   VPTESTMQ ANDs two ZMM registers and writes an 8-bit opmask register
+ *   (one bit per 64-bit lane = one bit per PTE), directly encoding eight
+ *   young-bit results in a single instruction.  KMOVW transfers the opmask
+ *   into a general-purpose register.  This eliminates the PMOVMSKB +
+ *   test/setnz dance that SSE2 and AVX2 require.
+ *
+ *   ~5 instructions per 8 PTEs vs ~24 instructions per 4 PTEs (SSE2/AVX2).
+ *   Per-PMD scan is approximately 10 ns of compute on a modern core.
+ *
+ * KMOVW vs KMOVB:
+ *   KMOVW (move 16-bit opmask to/from GPR) is part of AVX-512F.
+ *   KMOVB requires AVX-512DQ.  VPTESTMQ on an 8-lane ZMM only sets
+ *   k1[7:0] and zeroes k1[15:8], so the 16 bits KMOVW transfers carry
+ *   the correct 8-bit result.  All EVEX instructions in this file are
+ *   strictly AVX-512F (Foundation); no BW/DQ/VL/VBMI sub-extensions.
+ *
+ * Reached via static_call(marie_simd_scan) in mm/lru_marie/simd_x86.c; the
+ * caller holds the lru_marie_simd_batch_begin/end (kernel_fpu_begin/end)
+ * bracket.  Do NOT call directly from C without an FPU context bracket.
+ *
+ * ABI (System V AMD64):
+ *   rdi = const pte_t *pte_table  (512 PTEs, 4096 bytes, page-table base)
+ *   rsi = unsigned long *bitmap   (8 ulongs = 64 bytes, caller pre-cleared)
+ *
+ * Register usage:
+ *   zmm0  = young-bit mask broadcast (0x20 x8, 64-bit lanes), constant
+ *   zmm1  = 8-PTE scratch register
+ *   k1    = VPTESTMQ result (8-bit opmask, one bit per PTE)
+ *   rdi   = current PTE pointer (walks forward by 64 bytes per inner iter)
+ *   rsi   = bitmap base (constant)
+ *   rdx   = outer loop counter (8 -> 0)
+ *   rcx   = inner loop counter (8 -> 0)
+ *   rax   = kmovw result / shift scratch
+ *   r8    = per-ulong accumulator
+ *   r9    = bit-position counter within current ulong (0, 8, 16, ..., 56)
+ *   r11   = bitmap write pointer (rsi + outer*8, updated each outer iter)
+ *
+ * Opmask register constraints:
+ *   k0 cannot serve as a write-mask (its encoding means "no masking").
+ *   k1 is the only opmask register this file writes.
+ *
+ * AVX-512F only (strict Foundation subset):
+ *   vmovdqa64 (mask), vmovdqu64 (PTE data), vptestmq, kmovw -- all AVX-512F;
+ *   vzeroupper is plain AVX.  zmm0/zmm1 only; k1 only.
+ *   VZEROUPPER before RET to avoid SSE/AVX transition penalties.
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+	/* -------------------------------------------------------------- */
+	/* Read-only data: young-bit mask broadcast                        */
+	/* -------------------------------------------------------------- */
+	.section .rodata
+	.align 64
+marie_avx512_young_mask:
+	/* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to 8x 64-bit lanes */
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+	.quad 0x0000000000000020
+
+	/* -------------------------------------------------------------- */
+	/* Text                                                            */
+	/* -------------------------------------------------------------- */
+	.text
+
+/*
+ * void lru_marie_simd_scan_avx512(const pte_t *pte_table, unsigned long *bitmap)
+ *
+ * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base; page-aligned by
+ *                   the kernel page-table allocator; vmovdqu64 is used so no
+ *                   stricter SIMD alignment is required for the input
+ * bitmap:    rsi -- 8 ulongs (64 bytes), caller pre-cleared
+ */
+SYM_TYPED_FUNC_START(lru_marie_simd_scan_avx512)
+
+	/* Load young-bit mask once; zmm0 is constant for the whole call. */
+	vmovdqa64	marie_avx512_young_mask(%rip), %zmm0
+
+	movq	%rsi, %r11		/* r11 = current bitmap word ptr    */
+	movl	$8, %edx		/* outer counter: 8 ulongs          */
+
+	/*
+	 * Outer loop: one iteration per output ulong (64 PTEs each, 8 ulongs).
+	 *
+	 * Each outer iteration processes 64 PTEs in 8 inner SIMD steps
+	 * (8 PTEs per step) and writes one 64-bit ulong to the bitmap.
+	 */
+.Louter_avx512:
+	xorq	%r8, %r8		/* accumulator = 0                  */
+	xorq	%r9, %r9		/* bit-position = 0 (steps of 8)    */
+	movl	$8, %ecx		/* inner counter: 8 SIMD iters      */
+
+	/*
+	 * Inner loop: one iteration processes 8 PTEs => 8 result bits.
+	 *
+	 * Each PTE is 8 bytes; 8 PTEs = 64 bytes = one 512-bit ZMM load.
+	 * rdi advances by 64 bytes per inner iteration.
+	 *
+	 * Algorithm per iteration:
+	 *   1. vmovdqu64 loads 8 PTEs into zmm1.  The unaligned form is used
+	 *      so the kernel does not depend on rdi being 64-byte aligned;
+	 *      only the natural 8-byte pte_t alignment is assumed.
+	 *   2. vptestmq ANDs zmm1 with zmm0 (mask) and sets k1[i] = 1 iff
+	 *      the masked value in lane i is nonzero (i.e. young bit set).
+	 *   3. kmovw extracts the 8-bit opmask from k1 into eax[7:0].
+	 *   4. No separate zero-extend: kmovw already left rax[63:8] clear.
+	 *   5. Shift rax left by r9 (0, 8, 16, ..., 56) to place the 8 bits
+	 *      at the correct position in the 64-bit accumulator.
+	 *   6. OR into r8 accumulator; advance bit-position by 8.
+	 *
+	 * The variable-shift idiom (shlq %cl, rax) requires the shift count
+	 * in cl.  We temporarily borrow rcx (inner loop counter) by pushing
+	 * and popping around each shift.
+	 */
+.Linner_avx512:
+	/* Load 8 PTEs into zmm1 (64 bytes, EVEX-encoded unaligned load). */
+	vmovdqu64	(%rdi), %zmm1		/* zmm1 = pte[7]..pte[0]        */
+	addq	$64, %rdi
+
+	/*
+	 * VPTESTMQ: for each 64-bit lane i, compute:
+	 *   k1[i] = (zmm1[i] & zmm0[i]) != 0 ? 1 : 0
+	 *
+	 * zmm0 holds 0x20 in all lanes.  A PTE with _PAGE_ACCESSED set has
+	 * bit 5 = 1, so the AND is nonzero => k1[i] = 1.
+	 * The 8-bit opmask k1 encodes young status for all 8 PTEs directly.
+	 *
+	 * No PSHUFD trick or PCMPEQD needed: the opmask result is exact.
+	 * No false positives from PFN high bits: VPTESTMQ tests for ANY
+	 * nonzero bit after AND with the narrow mask, but since the mask is
+	 * exactly 0x20 (single bit), the result is correct for any PFN.
+	 */
+	vptestmq	%zmm1, %zmm0, %k1	/* k1[i] = pte[i] has young bit */
+
+	/*
+	 * KMOVW: move the 16-bit opmask register k1 to eax.
+	 * VPTESTMQ on an 8-lane ZMM only sets k1[7:0]; k1[15:8] is zero.
+	 * So eax[7:0] = 8-bit young bitmap; eax[31:8] = 0 (zero-extended).
+	 * rax[63:32] is also zero because movl/kmovw zero-extends to 64 bits.
+	 *
+	 * KMOVW is in AVX-512F.  KMOVB requires AVX-512DQ -- not used here.
+	 */
+	kmovw	%k1, %eax			/* eax[7:0] = 8-bit young mask  */
+
+	/*
+	 * Shift the 8-bit result to the correct position in the accumulator.
+	 * r9 = 0 for the first inner iter, 8 for the second, ..., 56 for
+	 * the eighth.  After shifting, rax holds the 8 result bits at their
+	 * correct positions within the 64-bit ulong.
+	 *
+	 * Use CL-based variable shift; save/restore rcx (inner loop counter).
+	 */
+	pushq	%rcx
+	movq	%r9, %rcx
+	shlq	%cl, %rax			/* rax <<= bit-position (0..56)  */
+	orq	%rax, %r8			/* accumulate into output ulong  */
+	popq	%rcx
+	addq	$8, %r9				/* advance bit-position by 8     */
+
+	decl	%ecx
+	jnz	.Linner_avx512
+
+	/* Store the completed ulong; this overwrites the word (no OR). */
+	movq	%r8, (%r11)
+	addq	$8, %r11		/* advance bitmap write pointer     */
+
+	decl	%edx
+	jnz	.Louter_avx512
+
+	/*
+	 * VZEROUPPER: clear the upper halves of all YMM/ZMM registers to
+	 * avoid SSE/AVX transition penalties in subsequent legacy SSE code.
+	 * Required whenever AVX or AVX-512 code may be followed by SSE code.
+	 */
+	vzeroupper
+	RET
+SYM_FUNC_END(lru_marie_simd_scan_avx512)
diff --git a/mm/lru_marie/simd_x86_sse2.S b/mm/lru_marie/simd_x86_sse2.S
new file mode 100644
index 0000000000..4646ca1b9c
--- /dev/null
+++ b/mm/lru_marie/simd_x86_sse2.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/lru_marie/simd_x86_sse2.S -- SSE2 PTE young-bit scan for the Marie walker.
+ *
+ * Processes 512 PTEs (one full PMD page table) per call, producing an
+ * 8-ulong (512-bit) young-bit bitmap.  4 PTEs are handled per SIMD
+ * iteration; 16 iterations fill one output ulong; 8 ulongs total.
+ *
+ * Variant chosen: counter-based rolled loop (outer=8, inner=16).
+ * Reason: keeps the file short and fully auditable for SSE2
+ * compliance; a fully-unrolled variant would be ~3000 instructions and
+ * harder to review without meaningfully faster throughput on modern OOO
+ * cores (loop overhead is < 1% of SIMD work).
+ *
+ * Reached via static_call(marie_simd_scan) in mm/lru_marie/simd_x86.c; the
+ * caller holds the lru_marie_simd_batch_begin/end (kernel_fpu_begin/end)
+ * bracket.  Do NOT call directly from C without an FPU context bracket.
+ *
+ * ABI (System V AMD64):
+ *   rdi = const pte_t *pte_table  (512 PTEs, 4096 bytes, page-table base;
+ *                                  page-aligned by the kernel page-table
+ *                                  allocator; movdqu is used so no stricter
+ *                                  SIMD alignment is required for the input)
+ *   rsi = unsigned long *bitmap   (8 ulongs = 64 bytes, caller pre-cleared)
+ *
+ * Register usage:
+ *   xmm0  = young-bit mask broadcast (0x00000020 x4), constant
+ *   xmm1  = PTE pair 0-1 scratch
+ *   xmm2  = PTE pair 2-3 scratch
+ *   rdi   = current PTE pointer (walks forward)
+ *   rsi   = bitmap base (constant)
+ *   rdx   = outer loop counter (8 -> 0)
+ *   rcx   = inner loop counter (16 -> 0)
+ *   rax   = pmovmskb result / bit-extraction scratch
+ *   r8    = per-ulong accumulator
+ *   r9    = bit-position counter within current ulong (0..63)
+ *   r10   = single-bit scratch for OR-into-accumulator
+ *   r11   = bitmap write pointer (rsi + outer*8, updated each outer iter)
+ *
+ * SSE2 only (x86-64 ABI baseline since AMD Opteron 2003):
+ *   movdqa, pshufd, punpcklqdq, pand, pcmpeqd, pmovmskb -- all SSE2.
+ *   No SSE3, SSSE3, SSE4.x, AVX, BMI1/BMI2 instructions used.
+ *
+ * PSHUFD low-dword packing: the high 32 bits of an x86-64 PTE carry
+ * PFN bits 32-51, pkey, and NX.  If a PFN has bit 25 set (PA >= 128 GiB)
+ * with all other high PFN bits clear, the high dword would equal 0x20
+ * and produce a false positive against the young-bit mask.  PSHUFD with
+ * imm8=0x88 extracts only the low dword of each 64-bit PTE lane before
+ * the comparison, sidestepping this entirely.
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+	/* -------------------------------------------------------------- */
+	/* Read-only data: young-bit mask broadcast                        */
+	/* -------------------------------------------------------------- */
+	.section .rodata
+	.align 16
+marie_sse2_young_mask:
+	/* _PAGE_ACCESSED = bit 5 = 0x20, broadcast to all four dwords */
+	.long 0x00000020
+	.long 0x00000020
+	.long 0x00000020
+	.long 0x00000020
+
+	/* -------------------------------------------------------------- */
+	/* Text                                                            */
+	/* -------------------------------------------------------------- */
+	.text
+
+/*
+ * void lru_marie_simd_scan_sse2(const pte_t *pte_table, unsigned long *bitmap)
+ *
+ * pte_table: rdi -- 512 PTEs (4096 bytes), page-table base (8-byte aligned)
+ * bitmap:    rsi -- 8 ulongs (64 bytes), caller pre-cleared
+ */
+SYM_TYPED_FUNC_START(lru_marie_simd_scan_sse2)
+
+	/* Load young-bit mask once; xmm0 is constant for the whole call. */
+	movdqa	marie_sse2_young_mask(%rip), %xmm0
+
+	movq	%rsi, %r11		/* r11 = current bitmap word ptr    */
+	movl	$8, %edx		/* outer counter: 8 ulongs          */
+
+	/*
+	 * Outer loop: one iteration per output ulong (64 PTEs each, 8 ulongs).
+	 */
+.Louter:
+	xorq	%r8, %r8		/* accumulator = 0                  */
+	xorq	%r9, %r9		/* bit-position = 0                 */
+	movl	$16, %ecx		/* inner counter: 16 SIMD iters     */
+
+	/*
+	 * Inner loop: one iteration processes 4 PTEs => 4 result bits.
+	 *
+	 * Each PTE is 8 bytes; 4 PTEs = 32 bytes = 2 x 16-byte XMM loads.
+	 * rdi advances by 32 bytes per inner iteration.
+	 */
+.Linner:
+	/* Load PTE[0..1] and PTE[2..3] (unaligned; 8-byte align guaranteed). */
+	movdqu	(%rdi), %xmm1		/* xmm1 = { pte1_hi:pte1_lo | pte0_hi:pte0_lo } */
+	movdqu	16(%rdi), %xmm2		/* xmm2 = { pte3_hi:pte3_lo | pte2_hi:pte2_lo } */
+	addq	$32, %rdi
+
+	/*
+	 * Pack the low 32-bit dword of each PTE into one XMM register.
+	 *
+	 * pshufd imm8=0x88 = 0b_10_00_10_00:
+	 *   dst[0] = src[0]  (dword 0 = PTE low bits)
+	 *   dst[1] = src[2]  (dword 2 = next PTE low bits)
+	 *   dst[2] = src[0]  (repeated -- don't care)
+	 *   dst[3] = src[2]  (repeated -- don't care)
+	 * => xmm1 = { lo1_dup | lo0_dup | lo1 | lo0 }
+	 *    xmm2 = { lo3_dup | lo2_dup | lo3 | lo2 }
+	 *
+	 * punpcklqdq merges the low 64 bits of each:
+	 *   dst = { xmm2[63:0] | xmm1[63:0] }
+	 *       = { lo3 | lo2 | lo1 | lo0 }
+	 */
+	pshufd	$0x88, %xmm1, %xmm1
+	pshufd	$0x88, %xmm2, %xmm2
+	punpcklqdq %xmm2, %xmm1	/* xmm1 = { lo3, lo2, lo1, lo0 }   */
+
+	/* Isolate young bit (bit 5 = 0x20) in each dword. */
+	pand	%xmm0, %xmm1		/* xmm1 &= 0x20                     */
+
+	/* Compare: dword == 0x20 => all-ones (-1), else 0. */
+	pcmpeqd	%xmm0, %xmm1		/* xmm1[i] = (lo_i & 0x20 == 0x20) ? 0xFFFFFFFF : 0 */
+
+	/*
+	 * pmovmskb: takes the MSB of each byte => 16-bit result in eax.
+	 * Since each dword is either 0x00000000 or 0xFFFFFFFF, the four
+	 * nibbles of eax are uniformly 0x0 or 0xF:
+	 *   eax[3:0]   = PTE0 young (0xF) or not (0x0)
+	 *   eax[7:4]   = PTE1
+	 *   eax[11:8]  = PTE2
+	 *   eax[15:12] = PTE3
+	 */
+	pmovmskb %xmm1, %eax
+
+	/*
+	 * Extract one result bit per PTE and shift into the accumulator.
+	 *
+	 * We use the CL-shift idiom (baseline x86-64): rcx (inner loop
+	 * counter) is saved on the stack; r10 is the single-bit scratch.
+	 *
+	 * For each of the 4 PTEs:
+	 *   1. test eax against the nibble mask (0x000F, 0x00F0, 0x0F00, 0xF000)
+	 *   2. setnz -> 1-byte 0 or 1
+	 *   3. zero-extend to 64 bits
+	 *   4. shift left by current bit-position (r9) using CL
+	 *   5. OR into accumulator
+	 *   6. increment bit-position
+	 *
+	 * We push/pop rcx around each variable-CL shift to preserve the
+	 * inner loop counter.
+	 */
+
+	/* PTE 0: nibble [3:0] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0x000F, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	/* PTE 1: nibble [7:4] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0x00F0, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	/* PTE 2: nibble [11:8] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0x0F00, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	/* PTE 3: nibble [15:12] */
+	pushq	%rcx
+	movq	%r9, %rcx
+	testl	$0xF000, %eax
+	setnz	%r10b
+	movzbq	%r10b, %r10
+	shlq	%cl, %r10
+	orq	%r10, %r8
+	popq	%rcx
+	incq	%r9
+
+	decl	%ecx
+	jnz	.Linner
+
+	/* Store the completed ulong; this overwrites the word (no OR). */
+	movq	%r8, (%r11)
+	addq	$8, %r11		/* advance bitmap write pointer     */
+
+	decl	%edx
+	jnz	.Louter
+
+	RET
+SYM_FUNC_END(lru_marie_simd_scan_sse2)
diff --git a/mm/lru_marie/state.c b/mm/lru_marie/state.c
new file mode 100644
index 0000000000..8f07247597
--- /dev/null
+++ b/mm/lru_marie/state.c
@@ -0,0 +1,2745 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Marie per-PFN state array — allocation, init, and global counters.
+ *
+ * Implements the public storage declared in state.h: the flat
+ * marie_state[] array indexed by PFN, the cycling head-gen counter,
+ * and the per-(gen, type) install counters that drive aging. All of
+ * these are allocated once at subsys_initcall time and never freed
+ * for the lifetime of the kernel.
+ *
+ * Sizing rule: the array covers PFNs [0, max_pfn). max_pfn is bounded
+ * by MARIE_MAX_SUPPORTED_PFN (the 32-bit PFN gate latched in
+ * marie_init), so worst-case footprint is 4 GiB. Realistic configs
+ * are 4-64 MiB. NUMA holes and reserved regions read as zero
+ * (untracked) and incur only sequential-read cost during scans.
+ */
+
+#define pr_fmt(fmt) "marie_state: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/log2.h>
+#include <linux/jump_label.h>
+#include <linux/list.h>
+#include <linux/lru_marie.h>
+#include <linux/memblock.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmzone.h>
+#include <linux/oom.h>
+#include <linux/printk.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/vm_event_item.h>
+#include <linux/vmstat.h>
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#endif
+
+#include "../internal.h"	/* struct scan_control, shrink_folio_list */
+#include "account.h"
+#include "pfn_install.h"
+#include "prefetch.h"
+#include "state.h"
+
+/*
+ * Runtime prefetch-ring parameters, set once at boot by
+ * marie_prefetch_params_init() based on CPUID. All values are
+ * powers of 2 so the hot path can use & marie_l3_mask instead of
+ * % marie_l3_ahead. Defaults are conservative (Silvermont / non-x86).
+ */
+static unsigned int marie_l3_ahead __read_mostly = 8;
+static unsigned int marie_l3_mask  __read_mostly = 7;
+static unsigned int marie_l1_ahead __read_mostly = 2;
+
+/* Choose prefetch-ring depths from boot-CPU features (boot-time, once). */
+void __init marie_prefetch_params_init(void)
+{
+	unsigned int l3 = 8, l1 = 2;
+
+#ifdef CONFIG_X86
+	if (!boot_cpu_has(X86_FEATURE_AVX2))
+		goto done;
+
+	if (boot_cpu_has(X86_FEATURE_AVX512F)) {
+		/* Zen 4/5, Sapphire Rapids: L2 MSHR ~32 */
+		l3 = 32; l1 = 8;
+		goto done;
+	}
+
+	/* AVX2 present but no AVX-512 */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 >= 0x1A) {
+			/* Zen 5+ mobile without AVX-512 */
+			l3 = 32; l1 = 8;
+		} else if (boot_cpu_data.x86 == 0x19) {
+			/* Zen 3 (family 0x19): L2 MSHR ~24 */
+			l3 = 24; l1 = 8;
+		} else if (boot_cpu_data.x86 == 0x17) {
+			/* Zen 1/2 (family 0x17): L2 MSHR ~20 */
+			l3 = 20; l1 = 8;
+		} else {
+			/* Excavator era (family 0x15): L2 MSHR ~12 */
+			l3 = 16; l1 = 6;
+		}
+		break;
+	case X86_VENDOR_INTEL:
+		/* CLFLUSHOPT as a Skylake proxy: Haswell/Broadwell predate it. */
+		if (boot_cpu_has(X86_FEATURE_CLFLUSHOPT)) {
+			/* Skylake and newer: L2 MSHR ~20-32 */
+			l3 = 24; l1 = 8;
+		} else {
+			/* Haswell / Broadwell: L2 MSHR ~16 */
+			l3 = 16; l1 = 6;
+		}
+		break;
+	default:
+		/* Unknown vendor with AVX2: conservative v3 baseline */
+		l3 = 16; l1 = 6;
+	}
+done:
+#endif
+	/* "& marie_l3_mask" needs a power-of-two depth: 24 / 20 become 16. */
+	l3 = rounddown_pow_of_two(l3);
+	marie_l3_ahead = l3;
+	marie_l3_mask  = l3 - 1;
+	marie_l1_ahead = l1;
+	pr_info("prefetch ring: l3_ahead=%u l1_ahead=%u\n", l3, l1);
+}
+
+u8 *marie_state;
+unsigned long marie_state_size;
+
+/*
+ * Latches true once marie_state[] is allocated (first enable) and never
+ * flips back -- the array lives for the kernel's lifetime. Gates the
+ * page-free hook so stale TRACKED bits are wiped at the buddy handoff
+ * even across a Marie disable transition (when lru_marie_enabled() is
+ * already false but the drain walk is still in flight). See
+ * marie_state_ready() in <linux/lru_marie.h>.
+ */
+DEFINE_STATIC_KEY_FALSE(marie_state_ready_key);
+EXPORT_SYMBOL_GPL(marie_state_ready_key);
+
+atomic_t marie_head_gen[2];
+
+/*
+ * Per-(gen, type) install gauge, now PER-CPU instead of one global
+ * atomic_long. The install hot path bumps it with this_cpu_inc (no
+ * shared cacheline), so concurrent installs from different lruvecs
+ * (different lru_locks, not mutually serialised) no longer ping-pong a
+ * single global line -- the cross-lruvec contention point on the alloc
+ * path. The count is advisory (drives only the aging-cadence hint), so
+ * the throttled advance check reading an approximate cross-CPU sum is
+ * sufficient. marie_aging_tick is the per-CPU throttle for that check.
+ */
+DEFINE_PER_CPU(long[MARIE_PFN_NR_GENS][ANON_AND_FILE], marie_gen_installs_pc);
+DEFINE_PER_CPU(unsigned int[ANON_AND_FILE], marie_aging_tick);
+atomic_long_t marie_gen_occupied[MARIE_PFN_NR_GENS][2];
+atomic_t marie_gen_walker_visits[MARIE_PFN_NR_GENS][2];
+
+
+struct marie_bitmap marie_track_bm[2][MARIE_PFN_NR_GENS][MARIE_PFN_NR_TIERS];
+unsigned int marie_l2_shift;
+
+/*
+ * Per-CPU shrink scratch buffer, pre-allocated at boot. Reclaim path
+ * cannot kmalloc / kvmalloc on the hot path (allocation under memory
+ * pressure is what we are trying to relieve), so the isolate batch
+ * lives in a fixed per-CPU buffer claimed via an atomic in_use flag.
+ * On contention (preempted reclaimer on the same CPU holds the buf
+ * across a shrink_folio_list sleep) marie_state_shrink_lruvec falls
+ * back to a 160-entry stack array.
+ *
+ * Sizing: 8192 entries = SWAP_CLUSTER_MAX << 8. Doubled from the
+ * MGLRU MAX_LRU_BATCH (4096) reference after boot testing showed
+ * 4096-cap reclaim falling behind tail /dev/zero alloc rate. 32 MiB
+ * per shrink_folio_list flush at peak amortises lock + IPI overhead
+ * twice as well. Per-CPU memory cost:
+ *   batch:       8192 * 8 B = 64 KiB
+ *   atomic:                = ~4 B
+ *   ~= 64 KiB / CPU. 16 CPUs = ~1 MiB system-wide static.
+ *
+ * Neither PFN nor prev_tier needs its own array at putback: PFN is
+ * recovered via folio_pfn(batch[i]), and prev_tier is read back from the
+ * per-PFN state byte (counters_only preserves it across isolate).
+ */
+#define MARIE_PFN_SHRINK_BATCH	(SWAP_CLUSTER_MAX << 8)	/* 8192 */
+#define MARIE_PFN_BATCH_FLOOR	(SWAP_CLUSTER_MAX * 8)	/* 256, matches
+							 * legacy
+							 * MARIE_BATCH_FLOOR */
+/*
+ * Fallback batch size when the per-CPU buf is contended. 5 *
+ * SWAP_CLUSTER_MAX = 160 entries occupy 160 * 8 = 1280 B on the
+ * stack; combined with the surrounding ~464 B of non-array locals
+ * in shrink_lruvec the frame lands at ~1744 B, staying under the
+ * gcc -Wframe-larger-than=2048 threshold without restructuring.
+ * (The 8 B per entry figure assumes 64-bit pointers.)
+ */
+#define MARIE_PFN_FALLBACK_BATCH (SWAP_CLUSTER_MAX * 5)	/* 160 */
+
+struct marie_shrink_buf {
+	atomic_t in_use;	/* claim flag guarding batch[] */
+	struct folio *batch[MARIE_PFN_SHRINK_BATCH];	/* isolated folios */
+};
+static DEFINE_PER_CPU(struct marie_shrink_buf, marie_shrink_buf);
+
+/*
+ * Per-PFN adaptive batch threshold.
+ *
+ *   priority = DEF_PRIORITY -> floor (MARIE_PFN_BATCH_FLOOR = 256)
+ *   priority = 0            -> cap   (MARIE_PFN_SHRINK_BATCH = 8192)
+ *
+ * Cap is the per-CPU buffer size; floor is large enough to amortise
+ * the per-call scan setup. Linear interpolation between the two over
+ * sc->priority.
+ */
+static unsigned long marie_pfn_batch_threshold(struct scan_control *sc)
+{
+	const unsigned long lo = MARIE_PFN_BATCH_FLOOR;
+	const unsigned long span = MARIE_PFN_SHRINK_BATCH - lo;
+	unsigned long steps;
+
+	/* steps runs from 0 at DEF_PRIORITY up to DEF_PRIORITY at priority 0. */
+	steps = DEF_PRIORITY - clamp(sc_priority(sc), 0, DEF_PRIORITY);
+	return lo + span * steps / DEF_PRIORITY;
+}
+
+/*
+ * Per-memcg L1/L2 bitmap pair.
+ *
+ *   L1: 1 bit per PFN, separately allocated (~512 KiB / 16 GiB
+ *       max_pfn). Set on every install for this memcg, cleared
+ *       on every del.
+ *   L2: 1 bit per 32 MiB PFN range, inline (64 B). Maintained
+ *       via the per-bit l2_count[] refcounter so the L2 bit is
+ *       set on the 0->1 transition and cleared on the 1->0
+ *       transition -- precise (no stale bits).
+ *
+ * Scan AND's L1 word-by-word into the (type, gen, tier) inner
+ * producer and L2 word-by-word into the outer 8-word L2 loop, so
+ * memcg-targeted reclaim iterates exactly (type, gen, tier) ∩ memcg
+ * at source.
+ *
+ * Allocated for every non-root memcg at memcg create
+ * (lru_marie_memcg_alloc); freed at exit (marie_memcg_bitmap_free).
+ * Root memcg has no bitmap and the helpers no-op for it; root
+ * reclaim runs without per-memcg filtering anyway.
+ */
+/*
+ * Per-memcg bitmap is just the unified struct marie_bitmap on the
+ * heap, xa-keyed by memcg pointer. No wrapper needed.
+ */
+static DEFINE_XARRAY(marie_memcg_bitmap_xa);
+
+int lru_marie_memcg_alloc(struct mem_cgroup *memcg)
+{
+	struct marie_bitmap *bm;
+	void *old;
+	int ret = -ENOMEM;
+
+	might_sleep();
+
+	/* Nothing to allocate for a NULL/root memcg or an empty PFN space. */
+	if (!memcg || mem_cgroup_is_root(memcg) || !max_pfn)
+		return 0;
+
+	bm = kzalloc(sizeof(*bm), GFP_KERNEL);
+	if (!bm)
+		return ret;
+	if (marie_bm_init(bm))
+		goto free_bm;
+
+	old = xa_store(&marie_memcg_bitmap_xa, (unsigned long)memcg, bm,
+		       GFP_KERNEL);
+	ret = xa_err(old);
+	if (!ret)
+		return 0;
+
+	marie_bm_free(bm);
+free_bm:
+	kfree(bm);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lru_marie_memcg_alloc);
+
+void marie_memcg_bitmap_free(struct mem_cgroup *memcg)
+{
+	struct marie_bitmap *bm;
+
+	/* Unpublish first so no later xa_load() can return the entry. */
+	bm = memcg ? xa_erase(&marie_memcg_bitmap_xa, (unsigned long)memcg) :
+		     NULL;
+	if (!bm)
+		return;
+	marie_bm_free(bm);
+	kfree(bm);
+}
+
+void marie_memcg_bitmap_set(struct mem_cgroup *memcg, unsigned long pfn)
+{
+	struct marie_bitmap *bm = NULL;
+
+	/* NULL and root memcg are never looked up; they have no bitmap. */
+	if (memcg && !mem_cgroup_is_root(memcg))
+		bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg);
+	if (bm)
+		marie_bm_set(bm, pfn);
+}
+
+void marie_memcg_bitmap_clear(struct mem_cgroup *memcg, unsigned long pfn)
+{
+	struct marie_bitmap *bm = NULL;
+
+	/* NULL and root memcg are never looked up; they have no bitmap. */
+	if (memcg && !mem_cgroup_is_root(memcg))
+		bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg);
+	if (bm)
+		marie_bm_clear(bm, pfn);
+}
+
+unsigned long *marie_memcg_bitmap_get(struct mem_cgroup *memcg)
+{
+	struct marie_bitmap *bm = NULL;
+
+	/* Returns the L1 words, or NULL when no bitmap is registered. */
+	if (memcg && !mem_cgroup_is_root(memcg))
+		bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg);
+	return bm ? bm->l1 : NULL;
+}
+
+unsigned long *marie_memcg_bitmap_get_l2(struct mem_cgroup *memcg)
+{
+	struct marie_bitmap *bm = NULL;
+
+	/* Returns the L2 words, or NULL when no bitmap is registered. */
+	if (memcg && !mem_cgroup_is_root(memcg))
+		bm = xa_load(&marie_memcg_bitmap_xa, (unsigned long)memcg);
+	return bm ? bm->l2 : NULL;
+}
+
+/*
+ * marie_memcg_bitmap_merge - hand off every PFN tracked in @child's
+ * per-memcg bitmap to @parent's. Thin wrapper over marie_bm_merge:
+ * the per-memcg storage is just a struct marie_bitmap so the merge
+ * logic (L2-pruned word-OR + l2_count transfer + L2 bit sync) lives
+ * once in bitmap.c and is shared between memcg reparent and any
+ * future caller.
+ *
+ * The per-PFN state byte (gen / tier / type / zone) and the global
+ * (type, gen, tier) tracking bitmaps are not touched here -- they
+ * are memcg-agnostic. After the merge, the walker / scanner running
+ * against @parent finds the reparented PFNs at their existing
+ * gen / tier positions through the union'd memcg L1.
+ *
+ * @parent == NULL or root: @child's bitmap is simply zeroed (folios
+ * fall back to global tracking equivalent to root_memcg).
+ *
+ * Caller must serialise against concurrent set/clear on either
+ * bitmap. memcontrol's reparent path holds objcg_lock + both lruvecs'
+ * lru_lock with IRQs off, which is sufficient.
+ */
+void marie_memcg_bitmap_merge(struct mem_cgroup *parent,
+			      struct mem_cgroup *child)
+{
+	struct marie_bitmap *src, *dst = NULL;
+
+	if (!child || mem_cgroup_is_root(child))
+		return;
+	src = xa_load(&marie_memcg_bitmap_xa, (unsigned long)child);
+	if (!src)
+		return;
+	/* A NULL or root parent is not looked up: dst stays NULL. */
+	if (parent && !mem_cgroup_is_root(parent))
+		dst = xa_load(&marie_memcg_bitmap_xa, (unsigned long)parent);
+	marie_bm_merge(dst, src);
+}
+EXPORT_SYMBOL_GPL(marie_memcg_bitmap_merge);
+
+/*
+ * Allocate the per-PFN state array. Called from marie_init() after
+ * the 32-bit PFN gate is latched, so max_pfn is guaranteed to fit
+ * in the supported range.
+ *
+ * kvmalloc lets the array fall back to vmalloc on systems where a
+ * physically contiguous allocation is unavailable; the array is
+ * accessed strictly by PFN index and does not require contiguity.
+ * GFP_KERNEL is safe here — initcall context can sleep.
+ */
+int __init marie_state_init(void)
+{
+	unsigned long bytes;
+	int g, t, ty;
+
+	bytes = max_pfn * sizeof(u8);
+	if (!bytes) {
+		pr_err("max_pfn is zero; refusing to initialise\n");
+		return -EINVAL;
+	}
+
+	marie_state = kvmalloc(bytes, GFP_KERNEL | __GFP_ZERO);
+	if (!marie_state) {
+		pr_err("failed to allocate %lu-byte per-PFN state array\n",
+		       bytes);
+		return -ENOMEM;
+	}
+	marie_state_size = max_pfn;
+
+	/*
+	 * L2 bitmap shift: (1 << shift) PFNs map to one L2 bit so 512
+	 * L2 bits cover the full max_pfn range. Round up to the next
+	 * power of two so the index is a simple right shift in the hot
+	 * path. Floor at shift 0 for tiny VMs where max_pfn < 512.
+	 * Must be set before any marie_bm_* call so marie_pfn_to_l2_bit
+	 * works correctly.
+	 */
+	{
+		unsigned long ppb = max_pfn / MARIE_L2_BITS;
+
+		if (ppb < 1)
+			ppb = 1;
+		marie_l2_shift = order_base_2(ppb);
+	}
+
+	marie_bm_range_locks_init();
+
+	/* Per-(type, gen, tier) L1 bitmaps: 16 total. */
+	for (ty = 0; ty < 2; ty++)
+		for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+			for (t = 0; t < MARIE_PFN_NR_TIERS; t++)
+				if (marie_bm_init(&marie_track_bm[ty][g][t]))
+					goto bm_oom;
+
+	/*
+	 * Latch the page-free hook on now that marie_state[] exists. Never
+	 * disabled -- the array is never freed, and TRACKED bits can persist
+	 * into a disable transition, so the hook must keep wiping them.
+	 */
+	static_branch_enable(&marie_state_ready_key);
+
+	pr_info("allocated state %lu B + 16 tracking bitmaps (max_pfn=%lu, l2_shift=%u)\n",
+		bytes, max_pfn, marie_l2_shift);
+	return 0;
+
+bm_oom:
+	/* NOTE(review): this also frees never-initialised (zeroed) slots. */
+	for (ty = 0; ty < 2; ty++)
+		for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+			for (t = 0; t < MARIE_PFN_NR_TIERS; t++)
+				marie_bm_free(&marie_track_bm[ty][g][t]);
+	kvfree(marie_state);
+	marie_state = NULL;
+	/* Size-only bounds checks must not index the now-NULL array. */
+	marie_state_size = 0;
+	return -ENOMEM;
+}
+
+/*
+ * marie_state_isolate_scan_l2lock - L2-bitmap pre-filtered scan with
+ * 512-way parallel exclusion via try_lock on per-L2-bit locks.
+ *
+ * Walks the L2 bitmap (1 cacheline) for the oldest (gen, type). For
+ * each set L2 bit it try_locks the matching L2 lock; on success it
+ * holds exclusive ownership of that PFN range and walks the L1
+ * bitmap within it, applying the same (mask, target) byte filter as
+ * the cursor scan. On try_lock failure another scanner already owns
+ * the range -- skip and try the next L2 bit. No wasted candidate
+ * scan work, no per-CPU cursor, no overlap-arbitration via
+ * folio_test_clear_lru collisions.
+ *
+ * Loop exits when batch_size is reached, nr_to_scan is exhausted,
+ * or every L2 bit in the pgdat's PFN range has been visited (locked
+ * or skipped).
+ */
+unsigned long marie_state_isolate_scan_l2lock(struct pglist_data *pgdat,
+					      int type, int max_zone,
+					      unsigned int tier,
+					      struct mem_cgroup *target_memcg,
+					      struct folio **batch,
+					      unsigned long batch_size,
+					      unsigned long nr_to_scan)
+{
+	unsigned long *mbm = target_memcg ?
+			     marie_memcg_bitmap_get(target_memcg) : NULL;
+	unsigned long *memcg_l2 = target_memcg ?
+				  marie_memcg_bitmap_get_l2(target_memcg) :
+				  NULL;
+	unsigned long *l1, *l2;
+	u8 oldest_gen, mask, target;
+	int oldest;
+	unsigned long start_pfn, end_pfn;
+	unsigned int start_l2, end_l2;
+	unsigned int l2_word, l2_word_end;
+	unsigned long n_batch = 0;
+
+	if (!marie_state)		/* state array was never allocated */
+		return 0;
+
+	oldest = marie_find_oldest_occupied(type);
+	if (oldest < 0)
+		return 0;
+	oldest_gen = (u8)oldest;
+	{
+		struct marie_bitmap *bm =
+			&marie_track_bm[type][oldest_gen][tier & 0x3];
+
+		l1 = bm->l1;
+		l2 = bm->l2;
+	}
+	if (!l1)			/* this tracking bitmap has no L1 words */
+		return 0;
+
+	mask = MARIE_PFN_TRACKED | MARIE_PFN_GEN_MASK |
+	       MARIE_PFN_TIER_MASK | MARIE_PFN_TYPE_MASK;
+	target = MARIE_PFN_TRACKED |
+		 (oldest_gen << MARIE_PFN_GEN_SHIFT) |
+		 ((tier & 0x3) << MARIE_PFN_TIER_SHIFT) |
+		 (type ? MARIE_PFN_TYPE_FILE : 0);
+
+	start_pfn = pgdat->node_start_pfn;
+	end_pfn   = pgdat_end_pfn(pgdat);
+	if (end_pfn > marie_state_size)
+		end_pfn = marie_state_size;
+	if (start_pfn >= end_pfn)
+		return 0;
+
+	start_l2 = marie_pfn_to_l2_bit(start_pfn);
+	end_l2 = marie_pfn_to_l2_bit(end_pfn - 1) + 1;
+	if (end_l2 > MARIE_L2_BITS)
+		end_l2 = MARIE_L2_BITS;
+	l2_word = start_l2 / BITS_PER_LONG;
+	l2_word_end = DIV_ROUND_UP(end_l2, BITS_PER_LONG);
+
+	/*
+	 * Outer L2 loop is word-level: AND the global (type, gen, tier)
+	 * L2 with the per-memcg L2 (when memcg-targeted) so the inner
+	 * __ffs/blsr extraction visits only L2 bits where
+	 * (type, gen, tier) ∩ memcg is non-empty. 512 L2 bits collapse
+	 * to 8 u64 word iterations; empty AND results skip the entire
+	 * word at one cycle each.
+	 */
+	for (; l2_word < l2_word_end; l2_word++) {
+		unsigned long l2w = l2[l2_word];
+
+		if (memcg_l2)
+			l2w &= memcg_l2[l2_word];
+		/* Mask off pre-start_l2 / post-end_l2 bits in edge words. */
+		if (l2_word == start_l2 / BITS_PER_LONG &&
+		    (start_l2 % BITS_PER_LONG))
+			l2w &= ~((1UL << (start_l2 % BITS_PER_LONG)) - 1);
+		if (l2_word + 1 == l2_word_end &&
+		    (end_l2 % BITS_PER_LONG))
+			l2w &= (1UL << (end_l2 % BITS_PER_LONG)) - 1;
+
+	while (l2w && n_batch < batch_size && nr_to_scan > 0) {
+		unsigned int bit = l2_word * BITS_PER_LONG + __ffs(l2w);
+		unsigned long lo, hi;
+		unsigned long ring[MARIE_L3_AHEAD_MAX];
+		int rh = 0, rt = 0, rc = 0;
+		unsigned long word_rem;
+		unsigned long word_base;
+		unsigned long word_i, end_word;
+		bool producer_done = false;
+		int i, n;
+		/*
+		 * Local copies of the runtime ring parameters, so the compiler
+		 * can keep them in registers across this L2 lock window rather
+		 * than re-reading the file-static globals. r_l3_mask wraps
+		 * ring[] indices via "&", which is correct only while
+		 * r_l3_ahead is a power of two <= MARIE_L3_AHEAD_MAX.
+		 */
+		const unsigned int r_l3_ahead = marie_l3_ahead;
+		const unsigned int r_l3_mask  = marie_l3_mask;
+		const unsigned int r_l1_ahead = marie_l1_ahead;
+		/*
+		 * Per-L2-range cache-line cursors for marie_state[] prefetch.
+		 * PFNs within one L2 range are monotonically increasing, so
+		 * the cursor only advances; resetting per L2 range avoids
+		 * stale comparisons when the next range starts at a lower
+		 * cache line than the previous one ended at.
+		 */
+		unsigned long state_cl_cursor_l3 = 0;
+		unsigned long state_cl_cursor_l1 = 0;
+		/*
+		 * Per-L2-range cache-line cursors for bitmap arrays. word_i
+		 * is monotonically increasing within the range so cursors
+		 * only advance. mbm cursor is unused when memcg-targeting is
+		 * off (mbm == NULL) — the macro guards on the array.
+		 */
+		unsigned long l1_cl_cursor = 0;
+		unsigned long mbm_cl_cursor = 0;
+
+		l2w &= l2w - 1;
+
+		if (!marie_bm_range_trylock(bit))
+			continue;
+
+		lo = marie_l2_bit_pfn_start(bit);
+		hi = marie_l2_bit_pfn_end(bit);
+		if (lo < start_pfn)
+			lo = start_pfn;
+		if (hi > end_pfn)
+			hi = end_pfn;
+
+		/*
+		 * Inline bit producer state with optional word-level mbm
+		 * AND: word_rem is the live remainder of l1[word_i] with
+		 * mbm[word_i] AND-ed in (when memcg-targeted). Persists
+		 * across Phase 1 fill and Phase 3 refill so we never
+		 * re-scan a cleared word and never pay find_next_bit's
+		 * call overhead. The AND narrows iteration to
+		 * (type, gen, tier) ∩ memcg at source -- per-candidate
+		 * mbm post-filter falls away.
+		 */
+		word_i = lo / BITS_PER_LONG;
+		end_word = BITS_TO_LONGS(hi);
+		word_base = word_i * BITS_PER_LONG;
+		word_rem = (word_i < end_word) ? l1[word_i] : 0;
+		if (mbm && word_i < end_word)
+			word_rem &= mbm[word_i];
+		/* Mask off pre-lo bits in the first word. */
+		if (lo > word_base)
+			word_rem &= ~((1UL << (lo - word_base)) - 1);
+		word_i++;
+
+		/*
+		 * Two-stage prefetch ring within this L2 lock window:
+		 *
+		 *   Phase 1: fill the ring (up to marie_l3_ahead candidate
+		 *     PFNs via inline __ffs/blsr), firing prefetcht2 on
+		 *     each struct page + state byte -- DRAM fetch in
+		 *     flight by the time the iterator pulls the entry.
+		 *
+		 *   Phase 2: L1-escalate the first marie_l1_ahead entries
+		 *     with prefetcht0 so they land in L1 before processing.
+		 *
+		 *   Phase 3: drain. Per pulled entry, refill the head (one
+		 *     more L3 prefetch) and L1-escalate the entry now
+		 *     marie_l1_ahead ahead of the new tail. State byte
+		 *     confirm and pfn_folio() both hit cache.
+		 *
+		 * Ring is local to this L2 lock acquisition; struct page +
+		 * state byte are vmemmap/contiguous so prefetches incur no
+		 * locking cost.
+		 */
+	/*
+	 * Cache-line cursor prefetch for bitmap arrays. Issued at each word
+	 * refill; the cursor only advances so a dense word transition does
+	 * not re-prefetch the same cache line.
+	 */
+#define MARIE_PREFETCH_BMWORD_L3(arr, cursor) do {				\
+		unsigned long _bi = word_i + MARIE_BM_L3_AHEAD_WORDS;		\
+		if (_bi < end_word) {						\
+			unsigned long _cl = (unsigned long)&(arr)[_bi]		\
+					    & ~63UL;				\
+			if (_cl != (cursor)) {					\
+				marie_prefetch_l3((void *)_cl);			\
+				(cursor) = _cl;					\
+			}							\
+		}								\
+	} while (0)
+
+#define MARIE_RING_PRODUCE(out_pfn, done_label) do {			\
+		while (!word_rem) {					\
+			if (word_i >= end_word) {			\
+				producer_done = true;			\
+				goto done_label;			\
+			}						\
+			word_rem = l1[word_i];				\
+			MARIE_PREFETCH_BMWORD_L3(l1, l1_cl_cursor);	\
+			if (mbm) {					\
+				word_rem &= mbm[word_i];		\
+				MARIE_PREFETCH_BMWORD_L3(mbm,		\
+							 mbm_cl_cursor);\
+			}						\
+			word_base = word_i * BITS_PER_LONG;		\
+			word_i++;					\
+		}							\
+		(out_pfn) = word_base + __ffs(word_rem);		\
+		word_rem &= word_rem - 1;				\
+		if ((out_pfn) >= hi) {					\
+			producer_done = true;				\
+			goto done_label;				\
+		}							\
+	} while (0)
+
+	/*
+	 * Cache-line cursor prefetch for marie_state[]. AHEAD_PFN pushes the
+	 * prefetched cache line N PFN ahead of the current producer position
+	 * so DRAM (L3-tier) and L3->L1 latencies are hidden even when the
+	 * consumer's fast-skip iter (mask filter early-continue) burns only
+	 * a few cycles per PFN. struct page is per-PFN = per-cache-line
+	 * already, so its prefetches stay per-PFN unchanged.
+	 */
+#define MARIE_PREFETCH_STATE_L3(pfn) do {					\
+		unsigned long _ah = (pfn) + MARIE_STATE_L3_AHEAD_PFN;		\
+		if (_ah < marie_state_size) {					\
+			unsigned long _cl = (unsigned long)&marie_state[_ah]	\
+					    & ~63UL;				\
+			if (_cl != state_cl_cursor_l3) {			\
+				marie_prefetch_l3((void *)_cl);			\
+				state_cl_cursor_l3 = _cl;			\
+			}							\
+		}								\
+	} while (0)
+#define MARIE_PREFETCH_STATE_L1(pfn) do {					\
+		unsigned long _ah = (pfn) + MARIE_STATE_L1_AHEAD_PFN;		\
+		if (_ah < marie_state_size) {					\
+			unsigned long _cl = (unsigned long)&marie_state[_ah]	\
+					    & ~63UL;				\
+			if (_cl != state_cl_cursor_l1) {			\
+				marie_prefetch_l1((void *)_cl);			\
+				state_cl_cursor_l1 = _cl;			\
+			}							\
+		}								\
+	} while (0)
+
+		while (rc < r_l3_ahead) {
+			unsigned long p;
+
+			MARIE_RING_PRODUCE(p, phase1_done);
+			ring[rh] = p;
+			rh = (rh + 1) & r_l3_mask;
+			rc++;
+			MARIE_PREFETCH_STATE_L3(p);
+			marie_prefetch_l3(pfn_to_page(p));
+		}
+phase1_done:
+
+		n = rc < r_l1_ahead ? rc : r_l1_ahead;
+		for (i = 0; i < n; i++) {
+			unsigned long p = ring[(rt + i) & r_l3_mask];
+
+			MARIE_PREFETCH_STATE_L1(p);
+			marie_prefetch_l1(pfn_to_page(p));
+		}
+
+		while (rc > 0 && n_batch < batch_size && nr_to_scan > 0) {
+			unsigned long pfn = ring[rt];
+			u8 s;
+			unsigned int z;
+			struct folio *f;
+
+			rt = (rt + 1) & r_l3_mask;
+			rc--;
+			nr_to_scan--;	/* charged per candidate pulled */
+
+			if (!producer_done) {
+				unsigned long np;
+
+				MARIE_RING_PRODUCE(np, refill_done);
+				ring[rh] = np;
+				rh = (rh + 1) & r_l3_mask;
+				rc++;
+				MARIE_PREFETCH_STATE_L3(np);
+				marie_prefetch_l3(pfn_to_page(np));
+			}
+refill_done:
+
+			if (rc > r_l1_ahead) {
+				int idx = (rt + r_l1_ahead - 1) &
+					  r_l3_mask;
+				unsigned long lp = ring[idx];
+
+				MARIE_PREFETCH_STATE_L1(lp);
+				marie_prefetch_l1(pfn_to_page(lp));
+			}
+
+			s = READ_ONCE(marie_state[pfn]);	/* confirm vs byte */
+			if ((s & mask) != target)
+				continue;
+			z = (s & MARIE_PFN_ZONE_MASK)
+				>> MARIE_PFN_ZONE_SHIFT;
+			if ((int)z > max_zone)
+				continue;
+
+			f = pfn_folio(pfn);
+			/*
+			 * mbm word-AND in the producer already restricted
+			 * candidates to (type, gen, tier) ∩ memcg; the
+			 * per-candidate test_bit(pfn, mbm) is therefore
+			 * unnecessary. Only the cmdline-disabled fallback
+			 * (target_memcg but no per-memcg bitmap) needs a
+			 * folio_memcg compare.
+			 */
+			if (target_memcg && !mbm &&
+			    folio_memcg(f) != target_memcg)
+				continue;
+			batch[n_batch++] = f;
+		}
+
+		marie_bm_range_unlock(bit);
+	}	/* while (l2w) -- next set bit in this L2 word */
+	}	/* for (l2_word) -- next L2 word */
+#undef MARIE_RING_PRODUCE
+#undef MARIE_PREFETCH_BMWORD_L3
+#undef MARIE_PREFETCH_STATE_L3
+#undef MARIE_PREFETCH_STATE_L1
+
+	return n_batch;
+}
+
+/*
+ * marie_state_drop_pfn - zero out every per-PFN tracking artifact
+ * for one folio (state byte, (type, gen, tier) L1 bit, occupancy
+ * counter, per-memcg L1/L2/l2_count, and the global L2 range
+ * counter with bulk L2 bit clear on 0).
+ *
+ * Called from:
+ *   marie_evict_locked      -- normal evict path
+ *   marie_drain_pfn_locked  -- enable=0 sysfs flip; folio gets
+ *                              returned to legacy LRU, the per-PFN
+ *                              artifacts MUST be wiped or they
+ *                              survive across the disabled window
+ *                              as ghosts that wedge counters on
+ *                              re-enable.
+ *
+ * When the state byte is not TRACKED only the byte itself is zeroed
+ * (defensive against double-drop). The (gen, tier, type) tuple is read
+ * from the byte BEFORE zeroing it so the per-(type, gen, tier) bitmap
+ * and occupancy counter are decremented at the same coordinate the
+ * install incremented.
+ */
+void marie_state_drop_pfn(struct folio *folio)
+{
+	unsigned int gen, tier, file;
+	unsigned long pfn;
+	u8 old;
+
+	if (!folio || !marie_state)
+		return;
+	pfn = folio_pfn(folio);
+	if (pfn >= marie_state_size)
+		return;
+
+	/* Snapshot the byte, then wipe it even when it was not TRACKED. */
+	old = marie_state[pfn];
+	marie_state[pfn] = 0;
+	if (!(old & MARIE_PFN_TRACKED))
+		return;
+
+	gen = (old & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT;
+	tier = (old & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT;
+	file = !!(old & MARIE_PFN_TYPE_MASK);
+	marie_bm_clear(&marie_track_bm[file][gen][tier], pfn);
+	atomic_long_dec(&marie_gen_occupied[gen][file]);
+	marie_memcg_bitmap_clear(folio_memcg(folio), pfn);
+}
+EXPORT_SYMBOL_GPL(marie_state_drop_pfn);
+
+/*
+ * marie_state_drop_pfn_at_free - canonical buddy-handoff cleanup.
+ *
+ * Invoked from mm/page_alloc.c::free_pages_prepare for every page about
+ * to enter the buddy allocator. Eliminates the deferred-cleanup race
+ * between marie_evict_counters_only (counters -1, TRACKED preserved) and
+ * the next allocation at the same PFN: the moment the page is destined
+ * for buddy, we wipe Marie's per-PFN bookkeeping so a subsequent
+ * install_local starts from a clean state byte.
+ *
+ * Counters are normally NOT touched here -- they were either already
+ * balanced by marie_evict_locked (the normal Marie del path) or
+ * pre-decremented by marie_evict_counters_only (the reclaim isolate path).
+ * Only a PFN whose scan-bitmap bit is still set at free time has that
+ * bit cleared and marie_gen_occupied decremented by this hook.
+ *
+ * memcg_bitmap is intentionally untouched. folio_memcg is unsafe to
+ * dereference at free time (the page is mid-uncharge); the stale bit
+ * is harmless because the next install at this PFN under a different
+ * memcg will re-set the new memcg's bitmap bit, and a memcg teardown
+ * will free the bitmap wholesale.
+ *
+ * Lock-free: byte write, bitmap atomic-bit-clear, atomic_long_dec --
+ * safe from any context including IRQ.
+ */
+void marie_state_drop_pfn_at_free(unsigned long pfn)
+{
+	u8 s, g, tier, type_bit;
+
+	if (!marie_state || pfn >= marie_state_size)
+		return;
+
+	s = marie_state[pfn];
+	if (!(s & MARIE_PFN_TRACKED))
+		return;
+
+	/*
+	 * Page-flag scrub for a TRACKED PFN on its way into buddy. The
+	 * checks below clear PG_lru, PG_active and, under CONFIG_LRU_GEN,
+	 * the MGLRU gen/refs bits when any is still set, so a leftover flag
+	 * does not trip the "Bad page state" check in free_pages_prepare.
+	 * The per-flag comments explain how each can still be set here.
+	 * Only flag bits are modified; folio->lru itself is not touched.
+	 *
+	 * This is a mitigation for a residual reclaim accounting race (a
+	 * Marie folio reaching free with TRACKED still set); the per-folio
+	 * vmstat that install +nr'd is not undone here, a minor drift
+	 * accepted in exchange for not oopsing.
+	 */
+	{
+		struct folio *f = page_folio(pfn_to_page(pfn));
+
+		/*
+		 * Invariant: a TRACKED folio must never reach the buddy free
+		 * path still carrying PG_lru. Marie's evict clears both under
+		 * the folio_test_clear_lru claim, and folio_batch_move_lru no
+		 * longer re-stamps PG_lru onto a tracked folio (the mm/swap.c
+		 * fix). VM_WARN_ON_ONCE flags a regression of that invariant in
+		 * DEBUG_VM builds; it compiles to nothing in production, so the
+		 * folio_test_lru below costs only a predicted-not-taken branch
+		 * on an already-hot folio->flags. The trailing clear is the
+		 * production last resort -- it degrades any future regression
+		 * to a counter blip instead of a PAGE_FLAGS_CHECK_AT_FREE oops.
+		 * Marie folios keep folio->lru detached from real lruvec lists,
+		 * so clearing PG_lru here cannot corrupt a list.
+		 */
+		if (unlikely(folio_test_lru(f))) {
+			VM_WARN_ON_ONCE_FOLIO(1, f);
+			folio_clear_lru(f);
+		}
+		/*
+		 * shrink_folio_list can re-set PG_active on a folio whose
+		 * PG_lru is clear (Marie isolated it). PG_active is in
+		 * PAGE_FLAGS_CHECK_AT_FREE; if still set here it would
+		 * trigger bad_page in free_pages_prepare. Clear it
+		 * unconditionally as a last-resort safety net.
+		 */
+		if (unlikely(folio_test_active(f)))
+			folio_clear_active(f);
+
+#ifdef CONFIG_LRU_GEN
+		/*
+		 * Scrub MGLRU gen/refs residue. LRU_GEN_MASK is in
+		 * PAGE_FLAGS_CHECK_AT_FREE, so a leftover gen counter trips
+		 * "Bad page state" in free_pages_prepare. With Marie masking
+		 * lru_gen_enabled() off (see lru_gen_enabled()), no MGLRU
+		 * writer stamps these onto a tracked folio, so this is the
+		 * structural last resort that keeps any future regression a
+		 * counter blip rather than a buddy-path oops -- independent of
+		 * whether every lru_gen_enabled() reader stays correctly gated.
+		 *
+		 * PG_workingset is deliberately NOT cleared: Marie's eviction
+		 * relies on the legacy workingset_eviction shadow encoding,
+		 * which reads PG_workingset, and the bit is not in
+		 * PAGE_FLAGS_CHECK_AT_FREE.
+		 */
+		if (unlikely(f->flags.f & (LRU_GEN_MASK | LRU_REFS_MASK))) {
+			VM_WARN_ON_ONCE_FOLIO(1, f);
+			set_mask_bits(&f->flags.f, LRU_GEN_MASK | LRU_REFS_MASK, 0);
+		}
+#endif
+	}
+
+	marie_state[pfn] = 0;	/* s still holds the pre-wipe encoding */
+
+	g = (s & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT;
+	tier = (s & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT;
+	type_bit = (s & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+
+	/*
+	 * The reclaim isolate path (marie_evict_counters_only) already retired
+	 * this PFN's scan-bitmap slot + gen_occupied at isolate, leaving only
+	 * the TRACKED byte (wiped just above). Test before clearing so that
+	 * common path does NOT double-decrement l2_count / gen_occupied below
+	 * zero. The clear still fires for the residual-race case -- a TRACKED
+	 * folio reaching free without having gone through isolate -- whose
+	 * scan slot is genuinely still live. Safe to test-then-clear here:
+	 * the page is at refcount 0 with no concurrent Marie op on this PFN.
+	 */
+	if (marie_bm_test(&marie_track_bm[type_bit][g][tier], pfn)) {
+		marie_bm_clear(&marie_track_bm[type_bit][g][tier], pfn);
+		atomic_long_dec(&marie_gen_occupied[g][type_bit]);
+	}
+}
+
+/*
+ * marie_state_move_to_gen - relocate a tracked PFN's encoding to
+ * (@target_gen, @target_tier), moving its scan-bitmap bit and the
+ * (gen, type) occupied counters to match.
+ *
+ * @pfn:         PFN to move; ignored when >= marie_state_size.
+ * @target_gen:  destination gen, masked to MARIE_PFN_NR_GENS - 1.
+ * @target_tier: destination tier, masked with MARIE_PFN_TIER_MAX.
+ *
+ * Step 1: commit the state byte with try_cmpxchg. Bails if the PFN is
+ * no longer tracked (a racing del zeroed the byte) or already encodes
+ * the target; a racing move just makes the loop recompute from the byte
+ * it observed.
+ *
+ * Step 2: shuffle the bitmap bit "new first, then old" so the folio is
+ * visible on at least one (type, gen, tier) plane throughout. The bitmap
+ * is indexed by tier as well as gen, so a tier-only change still moves
+ * the bit; only the occupied counters are skipped when gen is unchanged.
+ *
+ * Not for folios whose slot isolate already retired: clearing the old
+ * slot again would double-decrement the occupied accounting. Putback
+ * uses marie_state_publish_at_gen for that. marie_state_inc_tier inlines
+ * this same sequence for its saturate path rather than calling here.
+ */
+void marie_state_move_to_gen(unsigned long pfn, u8 target_gen, u8 target_tier)
+{
+	u8 cur, type, old_gen, old_tier, new_byte;
+
+	if (pfn >= marie_state_size)
+		return;
+	target_gen &= MARIE_PFN_NR_GENS - 1;
+	target_tier &= MARIE_PFN_TIER_MAX;
+
+	cur = READ_ONCE(marie_state[pfn]);
+	do {
+		if (!(cur & MARIE_PFN_TRACKED))
+			return;
+
+		new_byte = (cur & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK)) |
+			   ((u8)target_gen << MARIE_PFN_GEN_SHIFT) |
+			   ((u8)target_tier << MARIE_PFN_TIER_SHIFT);
+		if (new_byte == cur)
+			return;
+		/* on failure try_cmpxchg reloads cur with the observed byte */
+	} while (!try_cmpxchg(&marie_state[pfn], &cur, new_byte));
+
+	/* cur still holds the pre-move byte: decode the slot being vacated */
+	type = (cur & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+	old_gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT;
+	old_tier = (cur & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT;
+
+	/* publish on new (type, gen, tier) first */
+	marie_bm_set(&marie_track_bm[type][target_gen][target_tier], pfn);
+	if (old_gen != target_gen) {
+		atomic_long_inc(&marie_gen_occupied[target_gen][type]);
+		atomic_long_dec(&marie_gen_occupied[old_gen][type]);
+	}
+	/* un-publish old (type, gen, tier) */
+	marie_bm_clear(&marie_track_bm[type][old_gen][old_tier], pfn);
+}
+EXPORT_SYMBOL_GPL(marie_state_move_to_gen);
+
+/*
+ * marie_state_publish_at_gen - give an already-TRACKED PFN a fresh scan
+ * slot at (@target_gen, @target_tier). Publish-only: nothing is cleared.
+ *
+ * Putback counterpart to marie_evict_counters_only. Isolate has already
+ * retired the folio's old (gen, tier) bitmap bit and gen_occupied slot, so
+ * a survivor has no old coordinate left to un-publish. That is the
+ * difference from marie_state_move_to_gen (set-new + clear-old): by never
+ * touching the old coordinate this cannot double-decrement the l2_count /
+ * occupied accounting that isolate already balanced.
+ *
+ * The byte keeps TRACKED throughout; only its (gen, tier) field is
+ * rewritten. The bitmap bit and occupied counter are bumped even when
+ * that field is unchanged, because isolate cleared the bit itself.
+ * Returns without publishing if the PFN is out of range or untracked.
+ *
+ * Caller context: putback, folio exclusively owned (PG_lru cleared at
+ * claim; the dropped scan bit keeps the walker away), so the CAS is not
+ * expected to lose; it just carries TRACKED/TYPE/ZONE over cleanly.
+ */
+static void marie_state_publish_at_gen(unsigned long pfn, u8 target_gen,
+				       u8 target_tier)
+{
+	u8 seen, want, plane;
+
+	if (pfn >= marie_state_size)
+		return;
+	target_gen &= MARIE_PFN_NR_GENS - 1;
+	target_tier &= MARIE_PFN_TIER_MAX;
+
+	for (;;) {
+		seen = READ_ONCE(marie_state[pfn]);
+		if (!(seen & MARIE_PFN_TRACKED))
+			return;
+		want = seen & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK);
+		want |= (u8)target_gen << MARIE_PFN_GEN_SHIFT;
+		want |= (u8)target_tier << MARIE_PFN_TIER_SHIFT;
+
+		/* byte already right, or our CAS landed: go publish */
+		if (want == seen ||
+		    cmpxchg(&marie_state[pfn], seen, want) == seen)
+			break;
+	}
+
+	plane = (seen & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+	marie_bm_set(&marie_track_bm[plane][target_gen][target_tier], pfn);
+	atomic_long_inc(&marie_gen_occupied[target_gen][plane]);
+}
+
+/*
+ * marie_state_inc_tier - saturating tier bump on the per-PFN byte.
+ *
+ * @pfn: PFN to bump; ignored when >= marie_state_size or not TRACKED.
+ *
+ * Runs from folio_mark_accessed() WITHOUT lru_lock, hence the CAS loop.
+ *
+ * Non-saturated (tier < MAX): bump the tier field in place and move the
+ * scan bit from [gen][tier] to [gen][tier + 1].
+ *
+ * Saturated (tier == MAX): in-place promote -- roll to head gen at
+ * tier 0 (an inlined marie_state_move_to_gen), moving the scan bit and
+ * the gen_occupied count with it. Already on head: nothing to do, return.
+ */
+void marie_state_inc_tier(unsigned long pfn)
+{
+	u8 cur, new, t, type, gen, head = 0, old_gen, new_tier = 0;
+	bool roll;
+
+	if (pfn >= marie_state_size)
+		return;
+
+	/*
+	 * Races the lock-free reclaim isolate (marie_state_drop_pfn) and the
+	 * lru_lock-held install publish, which RMW the same state byte. Plain
+	 * READ/WRITE_ONCE could lose an update (e.g. resurrect a TRACKED bit
+	 * drop_pfn just cleared); a failed try_cmpxchg reloads cur and rechecks.
+	 */
+	cur = READ_ONCE(marie_state[pfn]);
+	do {
+		if (!(cur & MARIE_PFN_TRACKED))
+			return;
+		t = (cur & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT;
+		type = (cur & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+		if (t < MARIE_PFN_TIER_MAX) {
+			new_tier = t + 1;
+			new = (cur & ~MARIE_PFN_TIER_MASK) |
+			      ((new_tier << MARIE_PFN_TIER_SHIFT) &
+			       MARIE_PFN_TIER_MASK);
+			roll = false;
+		} else {
+			head = (u8)atomic_read(&marie_head_gen[type]);
+			old_gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT;
+			if (head == old_gen)
+				return;
+			new = (cur & ~(MARIE_PFN_GEN_MASK | MARIE_PFN_TIER_MASK)) |
+			      (head << MARIE_PFN_GEN_SHIFT);
+			roll = true;
+		}
+	} while (!try_cmpxchg(&marie_state[pfn], &cur, new));
+
+	/*
+	 * Byte committed; cur still holds the pre-bump value, so gen/t name the
+	 * old slot. Bitmaps/occupancy are best-effort (scanner re-validates).
+	 */
+	gen = (cur & MARIE_PFN_GEN_MASK) >> MARIE_PFN_GEN_SHIFT;
+	if (!roll) {
+		marie_bm_set(&marie_track_bm[type][gen][new_tier], pfn);
+		marie_bm_clear(&marie_track_bm[type][gen][t], pfn);
+	} else {
+		marie_bm_set(&marie_track_bm[type][head][0], pfn);
+		marie_bm_clear(&marie_track_bm[type][gen][t], pfn);
+		atomic_long_dec(&marie_gen_occupied[gen][type]);
+		atomic_long_inc(&marie_gen_occupied[head][type]);
+	}
+}
+EXPORT_SYMBOL_GPL(marie_state_inc_tier);
+
+/*
+ * marie_occupied_gen_count - count the gens of @type whose occupied
+ * counter is positive. Feeds the reclaim-driven aging trigger at
+ * shrink_lruvec entry, so it is off the per-fault hot path. The reads are
+ * an unlocked snapshot; racing install/del costs at worst one missed or
+ * one extra try_advance_head call, both benign.
+ */
+static int marie_occupied_gen_count(int type)
+{
+	int nr_live = 0;
+	int gen;
+
+	for (gen = 0; gen < MARIE_PFN_NR_GENS; gen++) {
+		if (atomic_long_read(&marie_gen_occupied[gen][type]) > 0)
+			nr_live++;
+	}
+	return nr_live;
+}
+
+/*
+ * --------------------------------------------------------------------
+ *  Anon/file swap-bias controller (stubborn proportional)
+ * --------------------------------------------------------------------
+ *
+ * A single signed counter per marie_lruvec drives the anon-vs-file
+ * pick under proportional swappiness (2..199). Granularity rule:
+ * EXACTLY ONE type is scanned per shrink_lruvec call in the
+ * proportional regime -- the bias sign selects which. Scanning both
+ * sides in the same call would dissolve the s:(MAX-s) ratio because
+ * every call would contribute pages from both. The caller's priority
+ * loop re-enters shrink_lruvec for the next pick, and the bias
+ * (updated from this call's outcome) may flip the selection in
+ * between -- yielding "fine-grained" type switching at call
+ * granularity, which matches the user-visible reclaim cadence.
+ *
+ *   SUCCESS (nr_reclaimed > 0):
+ *     bias += sign * nr_reclaimed * weight
+ *     -- page-flow proportional. Long-run pages(anon):pages(file)
+ *        converges to s:(MAX_SWAPPINESS-s) even when per-pick batch
+ *        sizes differ systematically between types.
+ *
+ *   FAILURE (nr_reclaimed == 0):
+ *     bias unchanged (no-op).
+ *     -- The picked side stays the picked side. Failure carries no
+ *        back-pressure -- not even a unit nudge -- so the favored
+ *        side remains favored indefinitely under sustained failure.
+ *        This is the entire point of low-swappiness on modern ZRAM
+ *        systems: file should be the eviction target even when it
+ *        transiently (or persistently) produces nothing, and anon
+ *        must NOT be touched as a consequence of file being stuck on
+ *        dirty / locked / writeback / depleted state. If file truly
+ *        cannot be reclaimed, the caller escalates priority or OOM
+ *        kicks in -- the controller does not surrender protection.
+ *
+ *   sign = -1 for picked=ANON (push bias toward FILE)
+ *          +1 for picked=FILE (push bias toward ANON)
+ *   weight = MAX_SWAPPINESS - s   for picked=ANON
+ *          = s                    for picked=FILE
+ *
+ * Special-value swappiness short-circuits the controller:
+ *   s=0   FILE only, no fallback (caller proceeds to OOM if depleted)
+ *   s=1   FILE first; ANON engages on EITHER of two depletion
+ *         signals (see the FILE_THEN_ANON tail gate):
+ *           - file < clean_min_ratio floor (skip_file true), or
+ *           - file >= floor but the FILE pass FAILED TO MEET this
+ *             call's reclaim target = file reclaim is not keeping
+ *             pace right now.
+ *         Throughput is empirical -- a tracked file folio may be
+ *         hot/dirty/mapped, and how much frees is knowable only by
+ *         trying -- so the FILE pass's own outcome, not occupancy, is
+ *         the signal. Sufficiency (target met) rather than exact-zero
+ *         is what keeps reclaim file-first: a positive-but-insufficient
+ *         file trickle must not pin reclaim file-only while swappable
+ *         anon OOMs with swap free. The fallback fires on the first
+ *         call file cannot satisfy -- it does NOT wait for sc->priority
+ *         to decay -- and a transient file stall costs at most one
+ *         early anon batch; preferred over OOM with swap free.
+ *   s=MAX ANON only, no fallback (symmetric to s=0)
+ *
+ * clean_min_ratio override: when the floor diverts reclaim to
+ * anon-only (skip_file in marie_state_shrink_lruvec), the caller
+ * does NOT invoke marie_swap_bias_update for that call. The
+ * controller stays frozen at its pre-override value so that, when
+ * file recovers above the floor, the proportional regime resumes
+ * from where it left off -- no post-recovery overshoot from anon
+ * reclaim that was driven by external policy, not swappiness.
+ *
+ * Sysctl writes invoke lru_marie_swappiness_changed() which walks
+ * the xarray and resets every swap_bias to zero, so the controller
+ * restarts cleanly under the new weight ratio.
+ *
+ * No CAP: per-cycle delta is bounded by batch_max (~8192) *
+ * MAX_SWAPPINESS (200) ~ 1.6e6, far below S64_MAX in any realistic
+ * running time. The sysctl-write reset is the only reset mechanism.
+ */
+
+/* Special swappiness values pick directly; otherwise the bias sign decides. */
+enum marie_pick_kind marie_swap_pick_type(struct marie_lruvec *mlv,
+					  u8 swappiness)
+{
+	bool file_biased;
+
+	if (!swappiness)
+		return MARIE_PICK_FILE_STRICT;
+	if (swappiness == 1)
+		return MARIE_PICK_FILE_THEN_ANON;
+	if (swappiness >= MAX_SWAPPINESS)
+		return MARIE_PICK_ANON_STRICT;
+
+	/* no bias carrier (NULL mlv) counts as "not file-biased" */
+	file_biased = mlv && atomic64_read(&mlv->swap_bias) < 0;
+	return file_biased ? MARIE_PICK_FILE_FIRST : MARIE_PICK_ANON_FIRST;
+}
+
+/*
+ * marie_swap_bias_update - fold one pick's outcome into the swap bias.
+ * @mlv:          bias carrier; NULL is a no-op.
+ * @picked_type:  0 = ANON was picked, anything else = FILE.
+ * @nr_reclaimed: pages reclaimed; 0 (failure) leaves the bias untouched.
+ * @swappiness:   effective value; 0, 1 and >= MAX_SWAPPINESS are no-ops.
+ */
+void marie_swap_bias_update(struct marie_lruvec *mlv,
+			    int picked_type,
+			    unsigned long nr_reclaimed,
+			    u8 swappiness)
+{
+	s64 delta;
+
+	if (!mlv)
+		return;
+	/*
+	 * Special values bypass the controller: the pick path never reads
+	 * swap_bias under them, so skip the write and the cache-line bounce.
+	 */
+	if (swappiness <= 1 || swappiness >= MAX_SWAPPINESS)
+		return;
+	if (!nr_reclaimed)
+		return;
+
+	if (picked_type == 0)
+		delta = -(s64)nr_reclaimed * (s64)(MAX_SWAPPINESS - swappiness);
+	else
+		delta = (s64)nr_reclaimed * (s64)swappiness;
+
+	/*
+	 * Single atomic RMW: a read + set pair would lose updates when
+	 * reclaimers race on one lruvec, and could undo the sysctl reset.
+	 */
+	atomic64_add(delta, &mlv->swap_bias);
+}
+
+/*
+ * marie_file_floor_protect - is the clean_min_ratio file floor in force?
+ * @pgdat: node whose file pagecache is measured against the floor.
+ *
+ * Returns true when the node's clean file pagecache is below
+ * marie_clean_min_ratio percent of node_present_pages AND the anon
+ * gen_occupied counters sum to more than zero, i.e. Marie still has anon
+ * to absorb the pressure. Returns false when the ratio is 0 or the caller
+ * is an OOM victim (its file is fair game).
+ *
+ * Clean means ACTIVE_FILE + INACTIVE_FILE minus NR_FILE_DIRTY (clamped at
+ * 0): dirty pages need writeback before reclaim, so counting them would
+ * let unreclaimable pages satisfy the floor and strand the clean set.
+ *
+ * The pick driver turns a true result into skip_file (file -> anon) and
+ * into the MARIE_DRAIN_* mask, so no reclaim path evicts file below the
+ * floor -- le9uo's single-path floor invariant across Marie's paths.
+ */
+static bool marie_file_floor_protect(struct pglist_data *pgdat)
+{
+	unsigned int ratio = READ_ONCE(marie_clean_min_ratio);
+	unsigned long nr_file, nr_dirty, nr_clean, min_pages;
+	long nr_anon = 0;
+	int gen;
+
+	/* floor disabled, or the caller is an OOM victim: never protect */
+	if (ratio == 0)
+		return false;
+	if (unlikely(tsk_is_oom_victim(current)))
+		return false;
+
+	nr_file = node_page_state(pgdat, NR_ACTIVE_FILE);
+	nr_file += node_page_state(pgdat, NR_INACTIVE_FILE);
+	nr_dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+	nr_clean = nr_file > nr_dirty ? nr_file - nr_dirty : 0;
+
+	min_pages = pgdat->node_present_pages * ratio / 100;
+	if (nr_clean >= min_pages)
+		return false;
+
+	/* below the floor: protect only while Marie still tracks anon */
+	for (gen = 0; gen < MARIE_PFN_NR_GENS; gen++)
+		nr_anon += atomic_long_read(&marie_gen_occupied[gen][0]);
+	return nr_anon > 0;
+}
+
+/*
+ * marie_state_shrink_lruvec - per-PFN paradigm reclaim driver.
+ *
+ * At entry, for each type, fire the reclaim-driven aging trigger:
+ * if fewer than 2 gens of that type are occupied,
+ * marie_try_advance_head(type) so install and reclaim do not
+ * collide on a single gen. Without this trigger a freshly booted
+ * system with only the head gen occupied returns zero candidates
+ * because marie_find_oldest_occupied skips head (the install
+ * destination).
+ *
+ * Per (type, tier) the scan walks the per-(gen, type) bitmap, claims
+ * each candidate via folio_try_get + folio_test_clear_lru, then calls
+ * marie_evict_counters_only: counters decremented and the scan-bitmap
+ * slot + gen_occupied retired at isolate (so other CPUs stop re-finding
+ * the in-flight folio), but the per-PFN TRACKED byte is KEPT so
+ * install_local's early-out blocks a concurrent install from re-setting
+ * PG_lru while shrink_folio_list reclaims it.
+ *
+ * Teardown of the TRACKED byte is deferred: a reclaimed folio is wiped
+ * at its buddy handoff (marie_state_drop_pfn_at_free via the
+ * free_pages_prepare hook), which finds the scan bit already clear and
+ * so does not double-decrement l2_count / gen_occupied. Survivors of
+ * shrink_folio_list keep TRACKED and are re-published at the putback gen
+ * via marie_state_publish_at_gen (set-only: no clear-old, because isolate
+ * already retired the old slot), seeding tier from max(prev_tier,
+ * PG_active/PG_workingset).
+ */
+
+unsigned int marie_state_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	/*
+	 * Hybrid memcg filter (design.h section 9 E):
+	 *   - root reclaim: NULL -> per-candidate folio_memcg check
+	 *                   skipped, every Marie folio in scope.
+	 *   - child memcg:  pass the lruvec's memcg so scan can confirm
+	 *                   folio_memcg(f) == target after the byte
+	 *                   filter. Hot only on cgroup-targeted reclaim.
+	 */
+	struct mem_cgroup *target_memcg =
+		(!memcg || mem_cgroup_is_root(memcg)) ? NULL : memcg;
+	/*
+	 * Swap-bias controller state for this shrink cycle. mlv_bias is
+	 * the per-lruvec carrier of the bias counter; marie_get_lruvec
+	 * may return NULL on alloc failure, in which case marie_swap_*
+	 * helpers no-op and the pick falls back to the legacy
+	 * "anon then file" order.
+	 *
+	 * @swappiness is captured once per call; subsequent sysctl
+	 * writes that reset bias to zero are seen on the NEXT call.
+	 * mem_cgroup_swappiness returns the effective value (memcg own
+	 * value on cgroup v1 non-root, vm_swappiness otherwise) and is
+	 * a plain READ_ONCE under the hood.
+	 */
+	struct marie_lruvec *mlv_bias = marie_get_lruvec(lruvec);
+	u8 swappiness = (u8)mem_cgroup_swappiness(memcg);
+	enum marie_pick_kind pick_kind;
+	int type_order[2];
+	int type_count;
+	int idx;
+	bool skip_file = false;
+	unsigned int drain_mask;
+	/*
+	 * When anon cannot be reclaimed at all (no free swap slots,
+	 * cgroup swap limit hit, no demotion target), swappiness is by
+	 * definition meaningless -- it expresses the anon:file reclaim
+	 * ratio, and one side of that ratio no longer exists. Every ANON
+	 * pick would reclaim nothing, and because the bias controller
+	 * takes no back-pressure from a zero-reclaim pick
+	 * (marie_swap_bias_update bails on !nr_reclaimed), the bias never
+	 * flips to FILE: reclaimable file cache is stranded until OOM.
+	 * Drop the stubborn swappiness preference and force FILE only,
+	 * mirroring get_scan_count()'s "!can_reclaim_anon_pages ->
+	 * SCAN_FILE". The clean_min_ratio floor below still applies, so
+	 * file is reclaimed only down to the protected floor; once file is
+	 * at the floor and anon is unreclaimable this pass reclaims nothing,
+	 * and the stock no_progress_loops path in should_reclaim_retry()
+	 * reaches the OOM killer.
+	 */
+	bool anon_unreclaimable =
+		!vmscan_can_reclaim_anon_pages(memcg, pgdat->node_id, sc);
+	/*
+	 * An OOM victim's own direct reclaim runs FILE-only, with no holds
+	 * barred on the file side: scan FILE ignoring the swappiness/bias
+	 * pick, the clean_min_ratio floor, the FILE_THEN_ANON tail gate and
+	 * the bias controller. The task has been selected for death and the
+	 * OOM reaper frees its anon, so swapping anon here would only add
+	 * I/O thrash for no benefit -- reclaim just the cheap, no-I/O file
+	 * side (clean_min_ratio is bypassed below, so all file is fair
+	 * game). If file is exhausted the victim falls back on the reaper,
+	 * which is the normal OOM mechanism. kswapd is never an OOM victim,
+	 * so background reclaim is unaffected.
+	 */
+	bool oom_victim = tsk_is_oom_victim(current);
+	int type;
+
+	for (type = 0; type < ANON_AND_FILE; type++)
+		if (marie_occupied_gen_count(type) < 2)
+			marie_try_advance_head(type);
+
+	/*
+	 * clean_min_ratio hard floor. True when this node's clean file
+	 * pagecache is below the configured percentage of node_present_pages
+	 * (and anon remains, and we are not an OOM victim). The same predicate
+	 * masks the legacy drain's file scan in shrink_lruvec, so no path
+	 * evicts file below the floor (le9uo's single-path floor invariant).
+	 */
+	skip_file = marie_file_floor_protect(pgdat);
+
+	/*
+	 * Choose the type(s) to scan as a strict priority cascade:
+	 *
+	 *   oom_victim         -> FILE only. The victim's anon is reaped by the
+	 *                         OOM reaper, so swapping anon is pure I/O thrash;
+	 *                         reclaim the cheap no-I/O file side. The floor is
+	 *                         bypassed for victims (skip_file is false), so
+	 *                         file scans freely.
+	 *   anon_unreclaimable -> FILE only. No free swap slots / no demotion
+	 *                         target: swappiness is meaningless and every ANON
+	 *                         pick would free nothing. If file is also at the
+	 *                         floor the per-iteration gate no-ops the file
+	 *                         scan and the stock no_progress_loops path OOMs.
+	 *   swappiness == 0     -> FILE only. Hard "never swap" user policy: the
+	 *                         clean_min_ratio floor must NOT punch through it
+	 *                         (core.c). At the floor file is blocked too, so
+	 *                         this OOMs rather than swapping -- the contract.
+	 *   skip_file          -> ANON only. The floor is in force and file is
+	 *                         protected, so divert all reclaim to anon
+	 *                         regardless of the swappiness/bias pick. This
+	 *                         outranks the proportional controller: a
+	 *                         FILE_FIRST pick would otherwise scan the
+	 *                         floor-blocked file side, free nothing, and --
+	 *                         the bias being frozen during skip_file -- stay
+	 *                         pinned on FILE while anon is never picked,
+	 *                         stalling reclaim under pressure at high swappiness.
+	 *   otherwise          -> the swappiness / swap_bias proportional pick.
+	 */
+	if (oom_victim)
+		pick_kind = MARIE_PICK_FILE_STRICT;
+	else if (anon_unreclaimable)
+		pick_kind = MARIE_PICK_FILE_STRICT;
+	else if (swappiness == 0)
+		pick_kind = MARIE_PICK_FILE_STRICT;
+	else if (skip_file)
+		pick_kind = MARIE_PICK_ANON_STRICT;
+	else
+		pick_kind = marie_swap_pick_type(mlv_bias, swappiness);
+
+	switch (pick_kind) {
+	case MARIE_PICK_FILE_STRICT:
+		type_order[0] = 1;
+		type_count = 1;
+		break;
+	case MARIE_PICK_ANON_STRICT:
+		type_order[0] = 0;
+		type_count = 1;
+		break;
+	case MARIE_PICK_FILE_THEN_ANON:
+		/*
+		 * swappiness=1: FILE first, ANON as the depletion fallback
+		 * the moment FILE fails to satisfy this call's reclaim
+		 * target (not only when FILE returns exactly zero).
+		 * type_count=2 with the sufficiency gate at the tail.
+		 */
+		type_order[0] = 1;
+		type_order[1] = 0;
+		type_count = 2;
+		break;
+	case MARIE_PICK_FILE_FIRST:
+		/*
+		 * Proportional regime, bias picks FILE. SINGLE type per
+		 * call: scanning the other side in the same call would
+		 * dissolve the s:(MAX-s) ratio because both sides would
+		 * contribute pages on every invocation. The caller
+		 * (vmscan priority loop) re-enters shrink_lruvec for
+		 * the next pick; bias may flip in between via the
+		 * proportional update from this call's outcome.
+		 */
+		type_order[0] = 1;
+		type_count = 1;
+		break;
+	case MARIE_PICK_ANON_FIRST:
+	default:
+		/* Symmetric: proportional regime, bias picks ANON. */
+		type_order[0] = 0;
+		type_count = 1;
+		break;
+	}
+
+	/*
+	 * Tell shrink_lruvec which orphan type(s) its legacy drain may
+	 * reclaim: exactly the type this call scans. type_order[0] is the
+	 * primary (and, in the single-type regime, only) type. A file pick
+	 * blocked by skip_file (FILE_STRICT under the clean_min_ratio floor)
+	 * scans nothing, so it grants no drain -- preserving the
+	 * no-progress -> OOM path.
+	 */
+	if (type_order[0] == 1)
+		drain_mask = skip_file ? 0 : MARIE_DRAIN_FILE;
+	else
+		drain_mask = MARIE_DRAIN_ANON;
+
+	{
+		/*
+		 * Claim this CPU's pre-allocated shrink buffer. If the
+		 * cmpxchg fails (preempted reclaimer on the same CPU
+		 * holds it across a shrink_folio_list sleep), fall back
+		 * to a small stack batch.
+		 */
+		struct marie_shrink_buf *buf;
+		/*
+		 * Fallback uses MARIE_PFN_FALLBACK_BATCH-sized stack
+		 * arrays. Sized to stay under gcc -Wframe-larger-than=2048
+		 * given the ~464 B baseline frame; see MARIE_PFN_FALLBACK_
+		 * BATCH comment.
+		 */
+		struct folio *small_batch[MARIE_PFN_FALLBACK_BATCH];
+		struct folio **scratch_batch;
+		unsigned long batch_max;
+		bool using_percpu;
+
+		buf = per_cpu_ptr(&marie_shrink_buf, raw_smp_processor_id());
+		if (atomic_cmpxchg(&buf->in_use, 0, 1) == 0) {
+			scratch_batch = buf->batch;
+			batch_max = marie_pfn_batch_threshold(sc);
+			using_percpu = true;
+		} else {
+			scratch_batch = small_batch;
+			batch_max = MARIE_PFN_FALLBACK_BATCH;
+			using_percpu = false;
+		}
+
+		for (idx = 0; idx < type_count; idx++) {
+			unsigned int tier;
+			int oldest;
+			bool ignore_refs = false;
+			LIST_HEAD(folio_list);
+			struct reclaim_stat stat = {};
+			unsigned long n_taken = 0;
+			unsigned int n_reclaimed = 0;
+			int oldest_for_putback;
+			u8 putback_gen;
+			struct folio *f, *tmp;
+			/*
+			 * Tracks whether this iteration actually attempted
+			 * to pick the type. An external override
+			 * (skip_file from clean_min_ratio) clears this so
+			 * the bias controller is NOT updated for a pick
+			 * that never ran -- the bias must reflect actual
+			 * picking policy, not blocked intentions.
+			 */
+			bool attempted_pick = true;
+
+			type = type_order[idx];
+
+			/*
+			 * Per-type body wrapped in do { } while (0) so the
+			 * existing early-exit conditions become plain
+			 * `break` to a single tail that updates the bias
+			 * controller and applies the swappiness=1 fallback
+			 * gate. `goto done` (target reached) still bypasses
+			 * the tail entirely.
+			 */
+			do {
+
+			if (type == 1 && skip_file) {
+				attempted_pick = false;
+				break;
+			}
+
+			oldest = marie_find_oldest_occupied(type);
+			if (oldest < 0)
+				break;
+			ignore_refs = atomic_read(
+				&marie_gen_walker_visits[oldest][type]) >= 1;
+
+			/*
+			 * Accumulate across all tiers of this type into one
+			 * folio_list up to batch_max, then call
+			 * shrink_folio_list once.
+			 *
+			 * Scan writes candidate folios directly into
+			 * scratch_batch[n_taken..] in a SINGLE call per
+			 * tier. The previous SWAP_CLUSTER_MAX-bounded
+			 * tmp_batch did 128 scan invocations per type at
+			 * batch_max=4096, re-initialising the prefetch
+			 * ring each time -- now one invocation per tier
+			 * (4 per type) lets the ring amortise across the
+			 * full bitmap walk.
+			 *
+			 * Failed claims (try_get / test_clear_lru) leave
+			 * the corresponding scratch_batch slot to be
+			 * overwritten by the next successful claim --
+			 * in-place compaction via accept_idx.
+			 */
+			for (tier = 0; tier < MARIE_PFN_NR_TIERS; tier++) {
+				unsigned long nr_isolated, i;
+				unsigned long room;
+				unsigned long accept_idx = n_taken;
+
+				if (sc_reclaim_target_reached(sc))
+					goto done;
+				if (n_taken >= batch_max)
+					break;
+
+				room = batch_max - n_taken;
+				nr_isolated = marie_state_isolate_scan_l2lock(
+					pgdat, type, sc_reclaim_idx(sc),
+					tier, target_memcg,
+					&scratch_batch[n_taken], room,
+					ULONG_MAX);
+				if (!nr_isolated)
+					continue;
+
+				for (i = 0; i < nr_isolated; i++) {
+					f = scratch_batch[n_taken + i];
+					if (!folio_try_get(f))
+						continue;
+					if (!folio_test_clear_lru(f)) {
+						folio_put(f);
+						continue;
+					}
+
+					scratch_batch[accept_idx] = f;
+
+					/*
+					 * marie_evict_counters_only decrements
+					 * counters AND retires the scan-bitmap
+					 * slot (so other CPUs stop re-finding
+					 * this in-flight folio), but KEEPS the
+					 * TRACKED byte so install_local's early-
+					 * out blocks any concurrent install from
+					 * re-setting PG_lru while shrink_folio_-
+					 * list reclaims it. The TRACKED byte is
+					 * wiped at the buddy handoff via
+					 * marie_state_drop_pfn_at_free() (called
+					 * from free_pages_prepare). Survivors
+					 * keep TRACKED and re-publish a fresh
+					 * scan slot + PG_lru in the putback loop
+					 * below.
+					 */
+					marie_evict_counters_only(f);
+
+					list_add(&f->lru, &folio_list);
+					accept_idx++;
+				}
+				n_taken = accept_idx;
+			}
+
+			if (!n_taken)
+				break;
+
+			/*
+			 * PGSCAN accounting, mirroring upstream MGLRU's
+			 * post-isolation bump (mm/vmscan.c evict_folios).
+			 * n_taken is the count actually pulled off the LRU
+			 * (the equivalent of MGLRU's `isolated`); upstream
+			 * PGSCAN_* tracks isolated, not bitmap-scanned bits.
+			 *
+			 * NR_ISOLATED_ANON / _FILE must be bumped here so
+			 * reclaim throttling and writeback congestion
+			 * checks see Marie's in-flight isolation; the
+			 * counter is decremented after shrink_folio_list
+			 * finishes (whether the folio was reclaimed or put
+			 * back).
+			 *
+			 * Since 7.0, PGSCAN_* / PGSTEAL_* / PGSCAN_ANON /
+			 * PGSTEAL_ANON are node_stat_item (lruvec stats),
+			 * not vm_event_item. Use mod_lruvec_state which
+			 * propagates to both node vmstat and memcg.
+			 */
+			{
+				enum node_stat_item scan_item =
+					PGSCAN_KSWAPD +
+					vmscan_reclaimer_offset(sc);
+
+				mod_node_page_state(pgdat,
+						    NR_ISOLATED_ANON + type,
+						    n_taken);
+				mod_lruvec_state(lruvec, scan_item, n_taken);
+				mod_lruvec_state(lruvec, PGSCAN_ANON + type,
+						 n_taken);
+			}
+
+			n_reclaimed = shrink_folio_list(&folio_list, pgdat,
+							sc, &stat, ignore_refs,
+							memcg);
+			sc_add_reclaimed(sc, n_reclaimed);
+
+			/*
+			 * PGSTEAL accounting + matched NR_ISOLATED decrement.
+			 * shrink_folio_list has either freed each folio or
+			 * left it on @folio_list for putback; either way the
+			 * isolation window for these n_taken folios is over.
+			 */
+			{
+				enum node_stat_item steal_item =
+					PGSTEAL_KSWAPD +
+					vmscan_reclaimer_offset(sc);
+
+				mod_node_page_state(pgdat,
+						    NR_ISOLATED_ANON + type,
+						    -n_taken);
+				mod_lruvec_state(lruvec, steal_item,
+						 n_reclaimed);
+				mod_lruvec_state(lruvec, PGSTEAL_ANON + type,
+						 n_reclaimed);
+			}
+
+			oldest_for_putback = marie_find_oldest_occupied(type);
+			if (oldest_for_putback >= 0)
+				putback_gen = (u8)((oldest_for_putback + 1)
+					& (MARIE_PFN_NR_GENS - 1));
+			else
+				putback_gen = (u8)atomic_read(
+					&marie_head_gen[type]);
+
+			list_for_each_entry_safe(f, tmp, &folio_list, lru) {
+				u8 prev, w, target_tier;
+				struct lruvec *lv;
+				struct marie_lruvec *mlv;
+				unsigned long pfn;
+				int zone;
+				enum lru_list inst_lru;
+
+				pfn = folio_pfn(f);
+				/*
+				 * prev_tier comes straight from the per-PFN
+				 * byte: counters_only preserved it across
+				 * isolate and the publish below has not run
+				 * yet, so the byte still encodes the tier this
+				 * folio carried when it was isolated. (Replaces
+				 * the old scratch_prev_tier[] capture + O(n^2)
+				 * linear search back into scratch_batch.)
+				 */
+				if (pfn < marie_state_size)
+					prev = (READ_ONCE(marie_state[pfn]) &
+						MARIE_PFN_TIER_MASK) >>
+					       MARIE_PFN_TIER_SHIFT;
+				else
+					prev = 0;
+				w = (folio_test_active(f) ? 2 : 0) |
+				    (folio_test_workingset(f) ? 1 : 0);
+				target_tier = prev > w ? prev : w;
+
+				list_del_init(&f->lru);
+				lv = folio_lruvec(f);
+				mlv = marie_get_lruvec(lv);
+				zone = folio_zonenum(f);
+				/*
+				 * Normalize PG_active->0 BEFORE computing inst_lru, mirroring
+				 * marie_folio_install() and marie_evict_locked(). The active hotness
+				 * was already folded into target_tier (w) above, so nothing is lost.
+				 * shrink_folio_list's activate_locked path can leave PG_active set on a
+				 * Marie-isolated folio; crediting folio_lru_list() with it still set
+				 * lands the survivor's +nr in ACTIVE_*, but every debit path
+				 * (marie_evict_locked / marie_evict_counters_only) clears PG_active
+				 * first and debits INACTIVE_*. That producer/consumer bucket split is
+				 * what underflows mz->lru_zone_size at the eventual free
+				 * ("mem_cgroup_update_lru_size: lru_size -1").
+				 */
+				if (folio_test_active(f))
+					folio_clear_active(f);
+				inst_lru = folio_lru_list(f);
+
+				/*
+				 * Survivor putback -- UNIFIED, mlv-independent.
+				 * marie_state[pfn] still has TRACKED set from
+				 * before isolate (counters_only preserves it),
+				 * so re-publish a FRESH scan slot at
+				 * (putback_gen, target_tier) -- publish-only, no
+				 * clear-old (isolate already retired the old
+				 * slot). The folio stays a Marie folio; we do
+				 * NOT route it back through folio_putback_lru /
+				 * folio_add_lru. That generic path re-enters the
+				 * per-cpu folio_batch pipeline, which assumes
+				 * legacy-LRU invariants (folio on a real list,
+				 * counted in mz->lru_zone_size) that Marie folios
+				 * break -- under heavy pressure it freed
+				 * still-dirty swapbacked folios out of the batch
+				 * drain ("Bad page state").
+				 *
+				 * mlv is NULL only when marie_get_lruvec could
+				 * not allocate the per-lruvec carrier under the
+				 * GFP_ATOMIC reclaim context (or Marie was just
+				 * disabled). Either way re-publishing into the
+				 * per-PFN state is correct and needs no mlv; only
+				 * the per-mlv counters are skipped (they live in
+				 * the carrier that does not exist -- no leak,
+				 * mirrors marie_evict_counters_only's !mlv path).
+				 * The global vmstat / nr_folios counters ARE
+				 * restored unconditionally to mirror the -nr
+				 * counters_only applied at isolate.
+				 */
+				marie_state_publish_at_gen(pfn, putback_gen,
+							   target_tier);
+				marie_memcg_bitmap_set(folio_memcg(f), pfn);
+
+				/*
+				 * Account the survivor's re-installation. mlv
+				 * may be NULL under reclaim (GFP_ATOMIC carrier
+				 * alloc fail or teardown race); the helper drops
+				 * only the global counters in that case --
+				 * mirrors the !mlv path in
+				 * marie_evict_counters_only / install_isolate.
+				 */
+				marie_account_install_isolate(lv, mlv, f,
+							      inst_lru, zone);
+
+				if (!folio_put_testzero(f)) {
+					/*
+					 * Isolation ref dropped, folio still alive.
+					 * Set PG_lru so the next scan can re-isolate
+					 * it via folio_test_clear_lru.
+					 */
+					folio_set_lru(f);
+				} else {
+					/*
+					 * Isolation ref was the last one -- folio is
+					 * being freed now. PG_lru is clear (was cleared
+					 * at isolation), so __folio_put's
+					 * __page_cache_release will not call
+					 * del_page_from_lru_list and will not debit
+					 * mz->lru_zone_size a second time -- isolation
+					 * already debited it (the install +nr is settled
+					 * by the isolate path), so a free-time debit here
+					 * would underflow.
+					 *
+					 * shrink_folio_list's activate_locked path may
+					 * set PG_active on a folio whose PG_lru is
+					 * already clear (Marie isolated it). Normally
+					 * PAGE_FLAGS_CHECK_AT_FREE is satisfied because
+					 * folio_activate() checks PG_lru and is a no-op
+					 * when it is clear -- but some stock paths set
+					 * PG_active directly (e.g. folio_set_active in
+					 * the deactivate batch). Clear it here; the
+					 * folio has no live references and is not on any
+					 * LRU list, so clearing PG_active is safe.
+					 *
+					 * Undo the putback counter increments before
+					 * completing the free. Bitmaps and TRACKED are
+					 * cleaned at buddy handoff by
+					 * marie_state_drop_pfn_at_free.
+					 */
+					folio_clear_active(f);
+					marie_account_evict_isolate(lv, mlv, f,
+								    inst_lru,
+								    zone);
+					__folio_put(f);
+				}
+			}
+
+			/*
+			 * No deferred drop pass: the scan-bitmap slot was
+			 * retired at isolate (counters_only), and the TRACKED
+			 * byte of a reclaimed folio is wiped at its buddy
+			 * handoff (marie_state_drop_pfn_at_free via the
+			 * free_pages_prepare hook). Folios still alive in
+			 * folio_list went through the survivor putback above,
+			 * which re-published a fresh scan slot via
+			 * marie_state_publish_at_gen.
+			 */
+
+			} while (0);
+
+			/*
+			 * Per-iteration tail.
+			 *
+			 * Bias controller update is skipped when:
+			 *   - !attempted_pick: external override (skip_file
+			 *     from clean_min_ratio) blocked the scan. The
+			 *     bias must track actual picking policy, not
+			 *     policy preempted before it ran.
+			 *   - skip_file is in effect for THIS call: even
+			 *     the ANON pick that succeeds during a
+			 *     skip_file regime is happening only because
+			 *     file was forcibly removed from contention.
+			 *     Freezing the controller during the override
+			 *     keeps the bias at its pre-override value, so
+			 *     when file recovers above clean_min_ratio the
+			 *     proportional regime resumes without an
+			 *     overshoot driven by anon-only reclaim that
+			 *     was never about the swappiness ratio.
+			 *
+			 * swappiness=1 (FILE_THEN_ANON) depletion-fallback
+			 * gate (see the tail `if` below). Two independent
+			 * reasons divert reclaim to ANON, on separate layers:
+			 *
+			 *   1. file < clean_min_ratio floor: handled UPFRONT by
+			 *      marie_file_floor_protect -> skip_file -> pick
+			 *      ANON_STRICT. Protects a minimum clean-file
+			 *      reserve and never reaches here (skip_file
+			 *      short-circuits FILE_THEN_ANON).
+			 *
+			 *   2. file >= floor but file reclaim cannot keep pace:
+			 *      detected HERE by the FILE pass FAILING TO MEET
+			 *      this call's reclaim target. A target-meeting FILE
+			 *      pass exits via the tier loop's
+			 *      sc_reclaim_target_reached() -> `goto done`, PAST
+			 *      this tail; so merely arriving here means file fell
+			 *      short. Occupancy/tier cannot tell reclaimability
+			 *      or throughput apart -- a tracked file folio may be
+			 *      hot/dirty/mapped, and how much actually frees is
+			 *      known only by trying (shrink_folio_list). The
+			 *      earlier gate keyed on the FILE pass returning
+			 *      EXACTLY zero, which conflates "no reclaimable
+			 *      file" with "file frees a positive trickle that
+			 *      cannot match the allocation rate": while any
+			 *      recyclable clean pagecache keeps cycling (refault /
+			 *      IO refill) the FILE pass returns >0 forever, anon
+			 *      is never scanned, and GBs of swappable anon OOM
+			 *      with swap free. Sufficiency, not exact-zero, is the
+			 *      correct depletion signal -> fall through to ANON.
+			 *
+			 * This does NOT wait for sc->priority to decay round by
+			 * round: that would thrash file-first for several more
+			 * rounds before conceding, which is the very stall we are
+			 * eliminating. The fallback fires on the first call where
+			 * file cannot satisfy the target. A transient file stall
+			 * (oldest gen momentarily all dirty/writeback) costs at
+			 * most one early anon batch -- acceptable on swappiness=1
+			 * / ZRAM, and far better than OOM-ing with swap free.
+			 *
+			 * `goto done` (target reached inside the tier loop)
+			 * jumps PAST this tail intentionally: we are winning,
+			 * the controller does not need a back-pressure tick.
+			 */
+			/*
+			 * anon_unreclaimable forced FILE_STRICT above,
+			 * bypassing the proportional controller; do not let
+			 * those forced-file picks drive the bias (matches the
+			 * "special swappiness values bypass the controller"
+			 * rule -- the bias must resume cleanly once swap
+			 * capacity returns and can_reclaim_anon flips back).
+			 *
+			 * The FILE_THEN_ANON depletion fallback (idx==1 ANON,
+			 * reached only because the FILE pass missed the reclaim
+			 * target) is likewise a forced pick driven by file
+			 * depletion, not by the swappiness ratio, so it must not
+			 * drive the bias either.
+			 */
+			if (attempted_pick && !skip_file && !anon_unreclaimable &&
+			    !(pick_kind == MARIE_PICK_FILE_THEN_ANON && idx == 1) &&
+			    likely(!oom_victim))
+				marie_swap_bias_update(mlv_bias, type,
+						       n_reclaimed, swappiness);
+			if (likely(!oom_victim) &&
+			    pick_kind == MARIE_PICK_FILE_THEN_ANON &&
+			    idx == 0 && !skip_file) {
+				/*
+				 * swappiness=1 depletion fallback --
+				 * SUFFICIENCY-gated, not zero-gated and not
+				 * priority-gated.
+				 *
+				 * FILE is still strongly preferred: it is
+				 * type_order[0], scanned first and in full
+				 * every call, and a FILE pass that MEETS this
+				 * call's reclaim target short-circuits via the
+				 * tier loop's sc_reclaim_target_reached() ->
+				 * `goto done`, which jumps PAST this gate and
+				 * never engages anon. So simply reaching this
+				 * gate means FILE did NOT satisfy the target --
+				 * file reclaim is not keeping pace this call.
+				 *
+				 * The old gate broke out on ANY n_reclaimed > 0,
+				 * requiring a FILE pass of EXACTLY zero before it
+				 * would touch anon. While even a trickle of
+				 * recyclable clean pagecache keeps cycling
+				 * (refault / IO refill), FILE returns >0 every
+				 * call, the break pinned reclaim file-only, and
+				 * GBs of swappable anon were never scanned --
+				 * OOM with swap free (tail /dev/zero). Waiting
+				 * for sc->priority to decay round by round before
+				 * resorting to anon would just thrash file-first
+				 * for several more rounds first, which is exactly
+				 * the stall we want to avoid.
+				 *
+				 * Engage the ANON pass NOW. If the final FILE
+				 * batch happened to tip the target without a
+				 * re-check, the idx==1 ANON pass self-aborts at
+				 * its own sc_reclaim_target_reached() gate, so no
+				 * anon is over-reclaimed. Reaching FILE_THEN_ANON
+				 * proved anon is reclaimable (else FILE_STRICT),
+				 * so swap capacity is available by construction.
+				 */
+				drain_mask |= MARIE_DRAIN_ANON;
+			}
+		}
+done:
+		if (using_percpu)
+			atomic_set(&buf->in_use, 0);
+	}
+
+	return drain_mask;
+}
+
+
+/* --- per-mlv lifecycle and install / evict implementations --- */
+
+/* Per-CPU drain nesting depth, one counter per LRU type (ANON_AND_FILE). */
+static DEFINE_PER_CPU(int[ANON_AND_FILE], marie_drain_depth);
+
+void marie_drain_enter_type(int type)
+{
+	this_cpu_inc(marie_drain_depth[type]);
+}
+void marie_drain_exit_type(int type)
+{
+	this_cpu_dec(marie_drain_depth[type]);
+}
+bool marie_in_drain_type(int type)
+{
+	return this_cpu_read(marie_drain_depth[type]) > 0;
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  Install / evict — direct synchronous transitions under lru_lock
+ * ---------------------------------------------------------------------
+ *
+ * The per-PFN paradigm reduces every Marie folio's state to a single
+ * bit (TRACKED in marie_state[pfn]). There are exactly two state
+ * transitions:
+ *
+ *   marie_folio_install:   TRACKED 0 -> 1   (writes gen, tier, type,
+ *                          zone, sets PG_lru, bumps counters; defined
+ *                          below, declared in pfn_install.h)
+ *   marie_evict_locked:    TRACKED 1 -> 0   (counter decrements +
+ *                          per-PFN state wipe via marie_state_drop_pfn)
+ *
+ * Both are called with the caller's lru_lock irqsave held, so the
+ * per-PFN byte write, the bitmap mutations, and the per-mlv counter
+ * updates all run in the same atomic context. PG_active hygiene and
+ * other cross-cutting concerns are concentrated here.
+ */
+
+
+/*
+ * marie_folio_install - unified fresh install (TRACKED 0 -> 1).
+ *
+ * Replaces the former marie_install_local / marie_install_locked pair.
+ * The two used to differ only in the order of (publish, account, flag)
+ * and in the PG_lru set method; this canonical form picks set_mask_bits
+ * (atomic PG_active clear + PG_lru set in one mask write) and the
+ * publish -> flag -> account order from install_local.
+ *
+ * Call sites:
+ *   - lru_marie_add_folio (THP under per-type lock, small folio direct)
+ *   - marie_change_state_lruvec (gate-on fill, under per-type both lock)
+ *
+ * Per-type lock is a property of the caller, not of this function: the
+ * body only requires lru_lock + IRQs off and behaves identically whether
+ * or not the caller additionally holds the per-type lock.
+ *
+ * Always returns true: on a fresh install and on the "already TRACKED"
+ * early-out alike. See pfn_install.h for the contract documentation.
+ */
+bool marie_folio_install(struct folio *folio, struct marie_lruvec *mlv)
+{
+	bool was_active, was_workingset;
+	unsigned int tier;
+	int type, zone;
+	u8 head;
+	enum lru_list inst_lru;
+	unsigned long pfn;
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * "Already TRACKED" early-out. A folio reaching install while its
+	 * per-PFN byte is still TRACKED is a Marie-owned, reclaim-isolated
+	 * folio (the deferred-teardown design preserves TRACKED while PG_lru
+	 * is cleared) being re-added through a path that lacks a TRACKED gate
+	 * -- e.g. folio_add_lru()/folio_putback_lru() on an anon folio that
+	 * reclaim isolated into the swap cache and a fault then swaps back in.
+	 * Re-installing would re-set PG_lru and double-count per-mlv counters;
+	 * the resurrected PG_lru then survives onto the buddy free path and
+	 * trips "Bad page state |lru|" PAGE_FLAGS_CHECK_AT_FREE. Bail so the
+	 * in-flight reclaim retains ownership.
+	 *
+	 * Return TRUE, not false: returning false tells lruvec_add_folio() to
+	 * run its LEGACY fallback (update_lru_size(+nr) + list_add onto a real
+	 * lruvec->lists[lru]) on a folio that is STILL TRACKED and that Marie
+	 * never credited to mz->lru_zone_size. That stray, never-debited mz
+	 * credit + a folio cross-linked onto a legacy list is exactly the
+	 * mz->lru_zone_size underflow ("lru_size -1") we were chasing. TRUE
+	 * means "Marie owns it, do not add anywhere" -- which is what "retain
+	 * ownership" requires.
+	 */
+	pfn = folio_pfn(folio);
+	if (pfn < marie_state_size &&
+	    (READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED))
+		return true;
+
+	/*
+	 * Workingset signal capture: (PG_active, PG_workingset) -> tier.
+	 *   (0,0) tier 0  cold
+	 *   (0,1) tier 1  workingset, distance too large
+	 *   (1,0) tier 2  recent refault, never workingset before
+	 *   (1,1) tier 3 = MARIE_PFN_TIER_MAX  established hot
+	 * Read PG_active BEFORE clearing so the captured tier matches the
+	 * byte we publish below. PG_workingset stays set:
+	 * workingset_eviction's shadow encoding needs it at next eviction.
+	 */
+	was_active = folio_test_active(folio);
+	was_workingset = folio_test_workingset(folio);
+	tier = ((unsigned int)was_active << 1) | (unsigned int)was_workingset;
+	tier &= MARIE_PFN_TIER_MAX;
+
+	if (was_active)
+		folio_clear_active(folio);
+
+	/*
+	 * folio->lru MUST be re-initialised here. A recycled folio arrives
+	 * with LIST_POISON{1,2} from the prior owner's list_del, and the
+	 * eventual marie_evict_locked's list_del_init would walk the
+	 * poison pointers and fault.
+	 */
+	INIT_LIST_HEAD(&folio->lru);
+
+	type = folio_is_file_lru(folio);
+	zone = folio_zonenum(folio);
+	head = (u8)atomic_read(&marie_head_gen[type]);
+
+	/*
+	 * Publish per-PFN state byte + scan bitmap + memcg L1 +
+	 * gen_occupied++. See pfn_install.h::marie_pfn_publish_inherit.
+	 */
+	marie_pfn_publish_inherit(folio, type, head, (u8)tier, zone);
+
+	/*
+	 * Bump the install gauge and let the advance hook fire its periodic
+	 * gen-advance decision. Split path intentionally skips this bump
+	 * because the split tail inherits its parent's install budget
+	 * (already counted at the parent's fault-install).
+	 */
+	this_cpu_inc(marie_gen_installs_pc[head][type]);
+	marie_install_advance_hook(type);
+
+	/*
+	 * Atomic PG_active->0 + PG_lru->1 in one mask write. PG_active was
+	 * cleared above when set; the mask write keeps the invariant
+	 * against the defensive case where another path set PG_active
+	 * between then and now. Ordered AFTER the state-byte publish so a
+	 * concurrent __page_cache_release observing PG_lru=1 also observes
+	 * marie_state[pfn] & MARIE_PFN_TRACKED.
+	 */
+	set_mask_bits(&folio->flags.f, BIT(PG_active), BIT(PG_lru));
+	inst_lru = folio_lru_list(folio);
+
+	marie_account_install(mlv, folio, inst_lru, zone);
+
+	return true;
+}
+
+bool marie_evict_locked(struct marie_lruvec *mlv, struct folio *folio)
+{
+	int zone = folio_zonenum(folio);
+
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * folio->lru is either a self-loop (install/flush leave it that
+	 * way, and the per-PFN paradigm never re-attaches it onto a
+	 * Marie-owned list) or on legacy lruvec->lists[lru] after a
+	 * drain handed it off. list_del_init is a no-op in the first
+	 * case and a legacy-list removal in the second; the caller
+	 * holds lruvec->lru_lock for the latter, so no extra Marie-side
+	 * lock is required.
+	 */
+	list_del_init(&folio->lru);
+
+	/*
+	 * PG_active hygiene MUST happen before folio_lru_list() below.
+	 * The install helper clears PG_active and then computes the lru
+	 * index, so install always credits INACTIVE_*. If we read
+	 * folio_lru_list() here while PG_active is still set (e.g. via
+	 * folio_activate() on a tracked folio between install and del),
+	 * we would decrement ACTIVE_* -- an LRU index Marie's install
+	 * never +1'd -- and trip the mz->lru_zone_size underflow WARN.
+	 * Mirror install's order: clear PG_active, then compute lru.
+	 *
+	 * Also drops PG_active for shrink_folio_list, which trips
+	 * VM_BUG_ON_FOLIO(folio_test_active) otherwise.
+	 */
+	if (folio_test_active(folio))
+		folio_clear_active(folio);
+
+	marie_account_evict(mlv, folio, folio_lru_list(folio), zone);
+
+	/*
+	 * Clear PG_lru BEFORE marie_state_drop_pfn so a concurrent
+	 * del-side path gated on folio_test_clear_lru cannot observe
+	 * (state=TRACKED, PG_lru=1) and recurse into the Marie del path.
+	 * drop_pfn then wipes the per-PFN state (byte, bitmap,
+	 * l2_range_count, memcg L1) which is the only Marie tracking
+	 * for this folio.
+	 *
+	 * Idempotent for callers that already cleared PG_lru via
+	 * folio_test_clear_lru before reaching evict
+	 * (__page_cache_release, marie_state_shrink_lruvec claim loop).
+	 */
+	folio_clear_lru(folio);
+	marie_state_drop_pfn(folio);
+	/* Always true; marie_del_folio_locked forwards this verdict. */
+	return true;
+}
+
+/*
+ * marie_evict_counters_only - reclaim-isolate per-folio counter decrement
+ * that also retires the scan-bitmap slot, but PRESERVES marie_state[]'s
+ * TRACKED bit.
+ *
+ * The per-PFN state byte staying TRACKED throughout shrink_folio_list is
+ * the race defence: marie_folio_install's "already TRACKED" early-out
+ * makes a concurrent install on this PFN bail, so install cannot set
+ * PG_lru on the folio while shrink_folio_list is reclaiming it. (The
+ * earlier full marie_evict_isolated cleared TRACKED inline; a concurrent
+ * install would then succeed, set PG_lru, and trip
+ * PAGE_FLAGS_CHECK_AT_FREE at free_unref_folios in the success path.)
+ *
+ * The global (type, gen, tier) bitmap bit + gen_occupied slot ARE dropped
+ * here, at isolate. The bit is the scanner's candidate index, and an
+ * isolated folio is no longer a candidate: leaving it set lets every
+ * other CPU's scanner re-find the same in-flight PFN for the whole
+ * swap-out window (the claim fails on the already-cleared PG_lru, but the
+ * re-scan / re-batch work is pure waste, and a folio shrink_folio_list
+ * chose to KEEP can get re-isolated before its second chance is honoured
+ * -> avoidable refaults). Retiring the scan slot here while keeping the
+ * TRACKED byte separates "is a scan candidate" (bitmap) from "blocks a
+ * concurrent install" (byte). l2_count / gen_occupied stay balanced 1:1:
+ * the matching set is the install; the matching re-set, for a survivor,
+ * is marie_state_publish_at_gen at putback; a reclaimed folio's byte is
+ * wiped at the buddy free hook, which finds the bit already clear.
+ *
+ * Caller-side gates that hold throughout this path:
+ *   1. folio_try_get()        - reference held, folio cannot be freed.
+ *   2. folio_test_clear_lru() - PG_lru cleared atomically, gating
+ *                                external del paths.
+ *   3. marie_folio_install TRACKED early-out (above)
+ *
+ * memcg_bitmap is cleared here because the buddy free hook
+ * (marie_state_drop_pfn_at_free) runs without a folio reference and cannot
+ * derive memcg later.
+ *
+ * Counters are decremented immediately so the in-flight folio does not
+ * inflate lruvec_lru_size() and skew reclaim pressure heuristics during
+ * shrink_folio_list. The scan bitmap + gen_occupied are torn down HERE so
+ * the in-flight folio leaves the candidate index immediately; only the
+ * TRACKED byte teardown is deferred (to the buddy free hook for reclaimed
+ * folios). Survivors go through the putback path, which re-publishes a
+ * fresh scan slot via marie_state_publish_at_gen and re-sets PG_lru.
+ */
+void marie_evict_counters_only(struct folio *folio)
+{
+	struct lruvec *lv = folio_lruvec(folio);
+	struct marie_lruvec *mlv = marie_get_lruvec(lv);
+	int zone = folio_zonenum(folio);
+	enum lru_list del_lru;
+
+	if (unlikely(!list_empty(&folio->lru))) {
+		/*
+		 * Defensive: an mm/swap.c batch path lacking a Marie gate
+		 * may have placed this folio onto a legacy lruvec list via
+		 * lruvec_add_folio_tail. The caller's list_add(&f->lru, ...)
+		 * would then corrupt that list. Detach under lru_lock first;
+		 * DO NOT fall back to lru_marie_del_folio (it would clear
+		 * TRACKED via marie_state_drop_pfn, breaking the deferred-
+		 * teardown invariant the putback path relies on).
+		 */
+		VM_WARN_ON_ONCE_FOLIO(1, folio);
+		scoped_guard(spinlock_irq, &lv->lru_lock)
+			list_del_init(&folio->lru);
+	}
+
+	if (folio_test_active(folio))
+		folio_clear_active(folio);
+
+	del_lru = folio_lru_list(folio);
+
+	/*
+	 * marie_account_evict_isolate owns the local_irq_save/restore that
+	 * the lock-free reclaim path needs against same-CPU softirq
+	 * reentrancy on fbc->lock and the per-CPU vmstat diff (see the
+	 * helper's contract in account.h, and 9c6a93782's lockup history).
+	 * mlv may be NULL: the helper drops only the global counters in
+	 * that case (no leak -- the missing carrier took the per-mlv
+	 * counters with it).
+	 */
+	marie_account_evict_isolate(lv, mlv, folio, del_lru, zone);
+
+	/*
+	 * Retire the scan-bitmap slot + gen_occupied at isolate (see the
+	 * function comment). Read the still-TRACKED byte for its (gen, tier,
+	 * type) coordinate; the byte itself is left TRACKED for the install-
+	 * race early-out. These bit ops are atomic and need no IRQ-off
+	 * window; the helper's local_irq_save/restore is scoped to the
+	 * counters that actually need it.
+	 */
+	{
+		unsigned long pfn = folio_pfn(folio);
+
+		if (pfn < marie_state_size) {
+			u8 s = READ_ONCE(marie_state[pfn]);
+
+			if (s & MARIE_PFN_TRACKED) {
+				u8 g = (s & MARIE_PFN_GEN_MASK) >>
+				       MARIE_PFN_GEN_SHIFT;
+				u8 tr = (s & MARIE_PFN_TIER_MASK) >>
+					MARIE_PFN_TIER_SHIFT;
+				u8 tb = (s & MARIE_PFN_TYPE_MASK) ? 1 : 0;
+
+				marie_bm_clear(&marie_track_bm[tb][g][tr], pfn);
+				atomic_long_dec(&marie_gen_occupied[g][tb]);
+			}
+		}
+	}
+
+	/* Clear memcg bitmap now: the free hook cannot derive memcg later. */
+	marie_memcg_bitmap_clear(folio_memcg(folio), folio_pfn(folio));
+}
+
+/*
+ * Access hook: bump the per-PFN tier of a tracked folio; a no-op when Marie
+ * is off, not ready, or the PFN is out of range / untracked.
+ * marie_state_inc_tier handles both the non-saturated bump (WRITE_ONCE) and
+ * the saturated promote (marie_state_move_to_gen to head_gen + tier 0).
+ */
+void lru_marie_mark_accessed(struct folio *folio)
+{
+	unsigned long pfn;
+
+	if (!lru_marie_enabled() || !marie_state_ready())
+		return;
+
+	pfn = folio_pfn(folio);
+	if (pfn >= marie_state_size ||
+	    !(READ_ONCE(marie_state[pfn]) & MARIE_PFN_TRACKED))
+		return;
+
+	/* Raise the access tier toward MAX (hotter). */
+	marie_state_inc_tier(pfn);
+	/* A set PG_referenced is consumed and converted into PG_workingset. */
+	if (folio_test_clear_referenced(folio))
+		folio_set_workingset(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_mark_accessed);
+
+/*
+ * Per-cpu folio_batch LRU-op hooks (declared in <linux/lru_marie.h>).
+ * Each applies the op directly on the folio's per-PFN state and returns
+ * true so mm/swap.c skips the legacy folio_batch; false (Marie off / folio
+ * untracked) falls through to the legacy path. All run lock-free:
+ * marie_state_move_to_gen is CAS-based and its bitmap ops are atomic,
+ * matching the no-lru_lock contract of these entry points.
+ */
+
+/*
+ * Demote a tracked folio to the oldest occupied gen at tier 0 so the next
+ * Marie scan reclaims it promptly. Serves the EXPLICIT user "make cold"
+ * madvise (MADV_COLD -> folio_deactivate / deactivate_file_folio); the
+ * reclaim-internal hints (activate / rotate) do NOT demote -- see those hooks.
+ */
+static bool marie_folio_demote(struct folio *folio)
+{
+	int oldest_gen;
+
+	if (!(lru_marie_enabled() && folio_marie_test_tracked(folio)))
+		return false;
+	oldest_gen = marie_find_oldest_occupied(folio_is_file_lru(folio));
+	if (oldest_gen >= 0)
+		marie_state_move_to_gen(folio_pfn(folio), (u8)oldest_gen, 0);
+	return true;
+}
+
+/* Deactivate hook: public entry point that simply runs the demote above. */
+bool lru_marie_deactivate(struct folio *folio)
+{
+	return marie_folio_demote(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_deactivate);
+
+/*
+ * rotate: NO-OP for Marie folios; returning true skips the legacy batch.
+ * Like activate this is a reclaim-internal hint (folio_rotate_reclaimable
+ * fires on writeback completion of a PG_reclaim folio). An actively
+ * reclaimed Marie folio is isolated (PG_lru cleared) so this is rarely
+ * reached, and gen aging already orders reclaim -- no per-PFN change wanted.
+ */
+bool lru_marie_rotate(struct folio *folio)
+{
+	return lru_marie_enabled() && folio_marie_test_tracked(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_rotate);
+
+/*
+ * activate: NO-OP for Marie folios; true (enabled + tracked) skips the legacy
+ * batch, false falls through to it. folio_activate is driven mostly by
+ * shrink_folio_list's FOLIOREF_ACTIVATE during reclaim, and Marie already
+ * decides retention there via its tier vote in folio_check_references.
+ * Promoting to the head gen on top would pull referenced folios out of the
+ * oldest gen on every reclaim pass; under an all-hot workload that starves
+ * reclaim entirely (OOM with GBs of unreclaimable inactive_anon). The
+ * explicit-access channel is folio_mark_accessed -> lru_marie_mark_accessed
+ * (tier bump), which must not be double-counted here.
+ */
+bool lru_marie_activate(struct folio *folio)
+{
+	return lru_marie_enabled() && folio_marie_test_tracked(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_activate);
+
+/*
+ * MADV_FREE: make the anon folio reclaim-without-writeback. Clear the
+ * dirtiness signals synchronously (what the legacy lru_lazyfree move_fn
+ * does) and demote so Marie frees it promptly without swap on the next
+ * scan. The Marie byte keeps its anon TYPE, so the demote stays within the
+ * anon gen ring. Returns false when Marie is off or the folio is untracked.
+ */
+bool lru_marie_lazyfree(struct folio *folio)
+{
+	int ring, oldest_gen;
+
+	if (!(lru_marie_enabled() && folio_marie_test_tracked(folio)))
+		return false;
+	/* Read type first: clearing swapbacked flips folio_is_file_lru(). */
+	ring = folio_is_file_lru(folio);
+	folio_clear_active(folio);
+	folio_clear_referenced(folio);
+	folio_clear_swapbacked(folio);
+	count_vm_events(PGLAZYFREE, folio_nr_pages(folio));
+	oldest_gen = marie_find_oldest_occupied(ring);
+	if (oldest_gen >= 0)
+		marie_state_move_to_gen(folio_pfn(folio), (u8)oldest_gen, 0);
+	return true;
+}
+EXPORT_SYMBOL_GPL(lru_marie_lazyfree);
+
+/*
+ * folio_marie_get_tier (public API in <linux/lru_marie.h>): returns the
+ * folio's tier from its per-PFN state byte, or 0 when marie_state is not
+ * allocated, the PFN is out of range, or the folio is untracked. Note the
+ * gate is the marie_state pointer itself, not lru_marie_enabled().
+ */
+unsigned int folio_marie_get_tier(const struct folio *folio)
+{
+	unsigned long pfn = folio_pfn((struct folio *)folio);
+	u8 byte = 0;
+
+	if (marie_state && pfn < marie_state_size)
+		byte = READ_ONCE(marie_state[pfn]);
+	if (!(byte & MARIE_PFN_TRACKED))
+		return 0;
+	return (byte & MARIE_PFN_TIER_MASK) >> MARIE_PFN_TIER_SHIFT;
+}
+EXPORT_SYMBOL_GPL(folio_marie_get_tier);
+
+/*
+ * Public API (<linux/lru_marie.h>): exported folio_marie_test_tracked().
+ */
+bool lru_marie_test_tracked(const struct folio *folio)
+{
+	return folio_marie_test_tracked(folio);
+}
+EXPORT_SYMBOL_GPL(lru_marie_test_tracked);
+
+/*
+ * lru_marie_free_page_hook (public API in <linux/lru_marie.h>).
+ * Thin wrapper: forwards @pfn to marie_state_drop_pfn_at_free so the page
+ * allocator can call the hook without including the private state.h.
+ */
+void lru_marie_free_page_hook(unsigned long pfn)
+{
+	marie_state_drop_pfn_at_free(pfn);
+}
+EXPORT_SYMBOL_GPL(lru_marie_free_page_hook);
+
+enum marie_tier_inc_result marie_tier_inc(struct marie_lruvec *mlv,
+						struct folio *folio)
+{
+	/*
+	 * The saturated promote is folded into marie_state_inc_tier itself:
+	 * when current tier == MAX, the helper invokes
+	 * marie_state_move_to_gen(pfn, head, 0). The walker tier path is a
+	 * single per-PFN write here; @mlv is unused.
+	 *
+	 * The return value is always MARIE_TIER_INC_OK, kept for ABI symmetry;
+	 * the single caller (walker.c marie_walk_pmd_range) discards it.
+	 */
+	marie_state_inc_tier(folio_pfn(folio));
+	return MARIE_TIER_INC_OK;
+}
+
+/*
+ * marie_del_folio_locked - lru_marie_del_folio body.
+ *
+ * External-removal entry: if the folio is still Marie-tracked, do the
+ * full evict via marie_evict_locked, which routes through
+ * marie_account_evict and owns the ENTIRE counter wind-down -- including
+ * the single marie_nr_folios -1. The caller does no accounting of its
+ * own; an earlier caller-side -1 predated the account.h funnel and
+ * double-counted marie_nr_folios on every generic del of a tracked folio.
+ *
+ * Lock contract: caller holds lruvec->lru_lock. No Marie lock is taken
+ * here -- the lru_lock invariant already serialises every Marie state
+ * mutation. See the comment above the call site in lru_marie_del_folio
+ * for the full protection-model rationale.
+ *
+ * Returning true tells the dispatcher (lruvec_del_folio in
+ * include/linux/mm_inline.h) "Marie owns this folio, do not fall
+ * through to legacy".
+ *
+ * The not-tracked branch returns true defensively. Under the lru_lock
+ * invariant it is unreachable -- the caller's TRACKED fast-path test
+ * already gated entry here -- but returning true keeps the safe
+ * behaviour if the invariant ever regresses: a stray legacy
+ * update_lru_size on a folio Marie already accounted would double-
+ * decrement mz->lru_zone_size.
+ */
+bool marie_del_folio_locked(struct marie_lruvec *mlv, struct folio *folio)
+{
+	lockdep_assert_held(&mlv->lruvec->lru_lock);
+	lockdep_assert_irqs_disabled();
+
+	if (folio_marie_test_tracked(folio))
+		return marie_evict_locked(mlv, folio);	/* full evict */
+	return true;	/* untracked: still never fall through to legacy */
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  lruvec lifecycle
+ * ---------------------------------------------------------------------
+ */
+
+/*
+ * marie_type_init: caller-side scalar/lock initialisation only.
+ * @t->nr_pages (percpu_counter) is initialised separately in
+ * marie_alloc_lruvec because percpu_counter_init can fail and must be
+ * rolled back together with the lru_zone_size counters.
+ */
+static void marie_type_init(struct marie_type *t, int type)
+{
+	t->type = type;
+	spin_lock_init(&t->type_lock);
+}
+
+/*
+ * Hand off every Marie-tracked PFN from @child_mlv's lruvec to
+ * @parent_mlv's. Used at memcg reparenting (mm/memcontrol.c) so folios
+ * charged to a dying memcg appear under the parent for subsequent
+ * per-memcg targeted reclaim.
+ *
+ * Caller holds both lruvecs' lru_lock and IRQs disabled (the memcg
+ * offline path's reparent_locks), so concurrent del / install paths
+ * targeting either lruvec are pending behind those locks. Caller also
+ * holds @child_mlv's both type_locks via marie_both_mlv.
+ *
+ * No per-folio iteration: the per-PFN state byte stores (gen, tier,
+ * type, zone) which are memcg-agnostic and do not change on reparent.
+ * What changes is membership in the per-memcg L1/L2 bitmaps, which is
+ * an L2-pruned bitmap OR (marie_memcg_bitmap_merge) -- cost scales
+ * with the number of populated 32 MiB ranges in @child, not with the
+ * folio count. The accompanying per-type and per-(lru, zone) counters
+ * are percpu_counters, moved by sum / zero / add under the held locks.
+ *
+ * Returns 0: marie_nr_folios is unchanged (the state bytes still say
+ * TRACKED, the folios continue to count globally).
+ *
+ * @parent_mlv == NULL: child's bitmap and counters are zeroed out;
+ * folios fall back to global tracking with no per-memcg filter
+ * (equivalent to being charged to root_memcg from the bitmap's
+ * perspective). The next del path still cleans them up correctly
+ * because the state byte and global bitmaps remain consistent.
+ */
+long marie_reparent_locked(struct marie_lruvec *child_mlv,
+				   struct marie_lruvec *parent_mlv)
+{
+	struct mem_cgroup *src_memcg = lruvec_memcg(child_mlv->lruvec);
+	struct mem_cgroup *dst_memcg = NULL;
+	enum lru_list lru;
+	int type, zid;
+
+	if (parent_mlv)
+		dst_memcg = lruvec_memcg(parent_mlv->lruvec);
+	marie_memcg_bitmap_merge(dst_memcg, src_memcg);
+
+	/*
+	 * percpu_counter offers no atomic exchange. With lru_lock and the
+	 * type_locks held nobody else can write the child's counters, so
+	 * summing, zeroing and then crediting the parent is equivalent.
+	 */
+	for (type = 0; type < ANON_AND_FILE; type++) {
+		struct percpu_counter *src = &child_mlv->types[type].nr_pages;
+		s64 moved = percpu_counter_sum(src);
+
+		percpu_counter_set(src, 0);
+		if (parent_mlv && moved > 0)
+			marie_pc_add(&parent_mlv->types[type].nr_pages, moved);
+	}
+
+	for (lru = 0; lru < NR_LRU_LISTS; lru++) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			struct percpu_counter *src =
+				&child_mlv->marie_lru_zone_size[lru][zid];
+			s64 moved;
+
+			/*
+			 * Drain the child's deferred isolate mz delta into
+			 * its mz->lru_zone_size before sampling (both
+			 * lru_locks are held). Otherwise the shadow-based
+			 * transfer below would leave child mz at -pending
+			 * instead of exactly 0.
+			 */
+			marie_mz_drain_locked(child_mlv, lru, zid);
+
+			moved = percpu_counter_sum(src);
+			percpu_counter_set(src, 0);
+			if (moved == 0)
+				continue;
+
+			marie_update_lru_size(child_mlv->lruvec, lru, zid,
+					      -moved);
+			if (!parent_mlv)
+				continue;
+			marie_pc_add(&parent_mlv->marie_lru_zone_size[lru][zid],
+				     moved);
+			marie_update_lru_size(parent_mlv->lruvec, lru, zid,
+					      moved);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate and initialise the Marie side-structure for @lv.
+ *
+ * Returns the new marie_lruvec, or NULL if the allocation or any
+ * percpu_counter_init() fails; on failure every counter that was
+ * already initialised is destroyed again before @mlv is freed.
+ */
+struct marie_lruvec *marie_alloc_lruvec(struct lruvec *lv, gfp_t gfp)
+{
+	struct marie_lruvec *mlv = kzalloc(sizeof(*mlv), gfp);
+	int t, lru, z, i, j;
+
+	if (!mlv)
+		return NULL;
+
+	mlv->lruvec = lv;
+	mlv->memcg = lruvec_memcg(lv);
+	mlv->nid = lruvec_pgdat(lv)->node_id;
+
+	/* On failure t / lru / z stay at the slot that could not be set up. */
+	for (t = 0; t < ANON_AND_FILE; t++) {
+		marie_type_init(&mlv->types[t], t);
+		if (percpu_counter_init(&mlv->types[t].nr_pages, 0, gfp))
+			goto undo_types;
+	}
+
+	for (lru = 0; lru < NR_LRU_LISTS; lru++) {
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			if (percpu_counter_init(
+				&mlv->marie_lru_zone_size[lru][z], 0, gfp))
+				goto undo_zones;
+		}
+	}
+
+	return mlv;
+
+undo_zones:
+	/* Partial row [lru][0..z) first, then every complete row before it. */
+	for (i = 0; i < z; i++)
+		percpu_counter_destroy(&mlv->marie_lru_zone_size[lru][i]);
+	for (j = 0; j < lru; j++)
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			percpu_counter_destroy(
+				&mlv->marie_lru_zone_size[j][i]);
+undo_types:
+	/* t == ANON_AND_FILE when falling through from undo_zones. */
+	for (i = 0; i < t; i++)
+		percpu_counter_destroy(&mlv->types[i].nr_pages);
+	kfree(mlv);
+	return NULL;
+}
+
+void marie_free_lruvec(struct marie_lruvec *mlv)
+{
+	int type, lru, zid;
+
+	/* Destroy every counter embedded in @mlv before freeing @mlv itself. */
+	for (type = 0; type < ANON_AND_FILE; type++)
+		percpu_counter_destroy(&mlv->types[type].nr_pages);
+	for (lru = 0; lru < NR_LRU_LISTS; lru++)
+		for (zid = 0; zid < MAX_NR_ZONES; zid++)
+			percpu_counter_destroy(&mlv->marie_lru_zone_size[lru][zid]);
+	kfree(mlv);
+}
+
+int marie_counters_init(void)	/* returns percpu_counter_init()'s result */
+{
+	return percpu_counter_init(&marie_nr_folios, 0, GFP_KERNEL);
+}
diff --git a/mm/lru_marie/state.h b/mm/lru_marie/state.h
new file mode 100644
index 0000000000..50cafe1831
--- /dev/null
+++ b/mm/lru_marie/state.h
@@ -0,0 +1,1335 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_STATE_H
+#define _MM_LRU_MARIE_STATE_H
+
+#include <linux/percpu_counter.h>
+#include "bitmap.h"	/* struct marie_bitmap, MARIE_L2_BITS, marie_bm_* */
+
+/*
+ * Marie per-PFN state array — paradigm specification.
+ * ======================================================
+ *
+ * Marie represents every folio's reclaim state as a single byte in
+ * a flat per-PFN array allocated once at boot. Each Marie operation
+ * on a folio is a single byte read or write at marie_state[pfn] —
+ * there is no allocation anywhere in the fault / del / aging fast
+ * paths, no linked-list traversal, no per-CPU staging.
+ *
+ * The array is sized once at boot to cover totalram_pages PFNs
+ * (~4 MB on a 16 GiB box, ~16 MB on 64 GiB; the same scale as a
+ * 1/64-th miniature struct page) and never grows or shrinks. The
+ * 32-bit PFN gate (marie_init's MARIE_MAX_SUPPORTED_PFN check)
+ * caps the worst-case array size at 4 GiB.
+ *
+ *
+ * Byte layout
+ * -----------
+ *
+ *   bit 7     TRACKED      1 = folio is owned by Marie; 0 = ignore byte
+ *   bit 6     TYPE         1 = file LRU, 0 = anon LRU
+ *   bit 5..4  ZONE         folio_zonenum: 0=DMA, 1=DMA32, 2=NORMAL, 3=MOVABLE
+ *   bit 3..2  GEN          relative-position 0..3 in the cycling ring
+ *                          (0 = oldest, head = atomic_read(&marie_head_gen[type]))
+ *   bit 1..0  TIER         0=cold, 1=workingset, 2=active, 3=refault
+ *
+ * The 8 bits saturate the byte. Bits are laid out in reclaim filter
+ * hierarchy from MSB (root: existence) down to LSB (leaf: hotness),
+ * so the isolate scan can extend its (s & mask) == target test by
+ * widening @mask from the top:
+ *
+ *   (byte == 0)            -> untracked (single-cycle skip)
+ *   (byte & 0x80)          -> TRACKED
+ *   (byte & 0xC0)          -> TRACKED + type
+ *   (byte & 0xF0)          -> TRACKED + type + zone
+ *   (byte & 0xFC)          -> TRACKED + type + zone + gen
+ *   (byte & 0xFF)          -> all five dimensions
+ *
+ * The whole filter is a pure byte mask + compare with no pfn_folio()
+ * dereference required to make a candidate / skip decision — the
+ * inner loop scales to AVX-512 vpand+vpcmpeqb at 64 byte per cycle
+ * and reserves the struct page touch for confirmed candidates only.
+ *
+ * The zone field truncates to 2 bits. ZONE_DEVICE (when enabled)
+ * never reaches Marie because the dispatcher gates on regular LRU
+ * folios; ZONE_HIGHMEM is 32-bit-only and excluded by the 32-bit
+ * PFN gate. So the 4 zone codes cover every Marie-tracked folio in
+ * practice.
+ *
+ * Untracked PFNs read as 0. The TRACKED bit is the single source of
+ * truth — no separate folio->flags Marie bit is used.
+ *
+ *
+ * Aging — gen ring as a cycling counter (per type)
+ * ------------------------------------------------
+ *
+ *   atomic_t marie_head_gen[ANON_AND_FILE];           // 0..3 cycling per type
+ *   DEFINE_PER_CPU(long[4][ANON_AND_FILE], marie_gen_installs_pc);
+ *   atomic_long_t marie_gen_occupied[4][ANON_AND_FILE];
+ *
+ * install (pseudo-code):
+ *
+ *   u8 gen = atomic_read(&marie_head_gen[type]);
+ *   marie_state[pfn] = MARIE_PFN_TRACKED | (type<<6) | (zone<<4) |
+ *                      (gen<<2) | tier;
+ *   set PFN's bit in marie_track_bm[type][gen][tier];
+ *   this_cpu_inc(marie_gen_installs_pc[gen][type]);
+ *   atomic_long_inc(&marie_gen_occupied[gen][type]);
+ *
+ * head_gen advance is per-type, drain-wait gated (next gen empty for
+ * that type), and fired both by install cadence (gen_installs >
+ * MARIE_AGING_THRESHOLD) and by the reclaim-driven trigger (occupied
+ * gen count for that type < 2 at shrink_lruvec entry). See
+ * mm/lru_marie/design.h sections 3-5 for the binding contract.
+ *
+ *
+ * Del — single byte zero
+ * ----------------------
+ *
+ *   marie_state[pfn] = 0;
+ *
+ * No swap-pop, no list_del, no shard lock dance. External del
+ * (lru_marie_del_folio from compaction, folio_put, munmap) is the
+ * same single store.
+ *
+ *
+ * Isolate — cursor + SIMD scan
+ * ----------------------------
+ *
+ * Per-pgdat scan cursor walks the array; SIMD reads 64 byte / cycle
+ * (AVX-512) and tests for (TRACKED && gen == oldest && tier == 0)
+ * via a single AND + CMP mask. Cursor saves position across calls so
+ * batch-32 isolate typically scans only a few hundred PFNs.
+ *
+ *   for (pfn = cursor; n_batch < batch; pfn = next_or_wrap(pfn)) {
+ *       u8 s = marie_state[pfn];
+ *       if ((s & MARIE_PFN_FILTER) != MARIE_PFN_TARGET)
+ *           continue;
+ *       batch[n_batch++] = pfn_folio(pfn);
+ *   }
+ *   cursor = pfn;
+ *
+ * Worst-case (sparse) full sweep of the 4 MB array is ~0.5 ms at
+ * DRAM bandwidth, ~50 µs in L3. Cursor amortises across many
+ * batches, so typical batch cost is sub-µs.
+ *
+ *
+ * memcg scope
+ * -----------
+ *
+ * The array is global (single allocation system-wide), not per-memcg.
+ * memcg-targeted reclaim filters by AND-ing the scan bitmaps with the
+ * target's per-memcg L1/L2 bitmaps (marie_memcg_bitmap_*). This trades
+ * per-memcg locality for vastly simpler data structures — desktop and
+ * small-server cgroup trees (where Marie targets) are dominated by the
+ * root memcg anyway, so the locality loss is small in practice.
+ *
+ *
+ * Walker integration
+ * ------------------
+ *
+ * The PTE walker (marie_walker) inspects young bits as before but
+ * commits tier bumps to marie_state[pfn] instead of folio->flags.
+ * The same SIMD young-pte machinery from the prior implementation
+ * carries over unchanged.
+ *
+ *
+ * Disable / reparent
+ * ------------------
+ *
+ * Marie disable: write 0 to every TRACKED byte via SIMD bulk store,
+ * folio_put each one (an O(N) sweep, but rare). Memcg reparent does
+ * not touch the bytes: they are memcg-agnostic, so only the per-memcg
+ * bitmaps and counters are merged into the parent.
+ *
+ *
+ * Sizing & init
+ * -------------
+ *
+ * marie_state is kvmalloc'd at subsys_initcall with size
+ * `max_pfn` bytes. max_pfn is bounded by the 32-bit PFN gate
+ * (marie_init's MARIE_MAX_SUPPORTED_PFN check), so the array is at
+ * most 4 GiB on the maximum supported config. Realistic sizings:
+ *
+ *   16 GiB RAM  ->  4 MiB   (single kvmalloc, contiguous in vmalloc)
+ *   64 GiB RAM  -> 16 MiB
+ *  256 GiB RAM  -> 64 MiB
+ *
+ * The array is sparse-tolerant: NUMA holes and reserved regions read
+ * as 0 (untracked) and incur only sequential-read cost during scan.
+ */
+
+/*
+ * Field shifts and masks within each marie_state[] byte. Ordered
+ * MSB -> LSB by reclaim filter hierarchy: TRACKED, TYPE, ZONE, GEN,
+ * TIER. See the byte-layout block above for the rationale.
+ */
+#define MARIE_PFN_TIER_SHIFT		0	/* bits 1..0 */
+#define MARIE_PFN_TIER_BITS		2
+#define MARIE_PFN_TIER_MASK		(((1U << MARIE_PFN_TIER_BITS) - 1) << \
+					 MARIE_PFN_TIER_SHIFT)
+#define MARIE_PFN_NR_TIERS		(1U << MARIE_PFN_TIER_BITS)
+#define MARIE_PFN_TIER_MAX		(MARIE_PFN_NR_TIERS - 1)	/* highest tier value */
+
+#define MARIE_PFN_GEN_SHIFT		2	/* bits 3..2 */
+#define MARIE_PFN_GEN_BITS		2
+#define MARIE_PFN_GEN_MASK		(((1U << MARIE_PFN_GEN_BITS) - 1) << \
+					 MARIE_PFN_GEN_SHIFT)
+#define MARIE_PFN_NR_GENS		(1U << MARIE_PFN_GEN_BITS)
+
+#define MARIE_PFN_ZONE_SHIFT		4	/* bits 5..4 */
+#define MARIE_PFN_ZONE_BITS		2
+#define MARIE_PFN_ZONE_MASK		(((1U << MARIE_PFN_ZONE_BITS) - 1) << \
+					 MARIE_PFN_ZONE_SHIFT)
+#define MARIE_PFN_NR_ZONES_ENCODED	(1U << MARIE_PFN_ZONE_BITS)
+
+#define MARIE_PFN_TYPE_SHIFT		6	/* bit 6 */
+#define MARIE_PFN_TYPE_FILE		(1U << MARIE_PFN_TYPE_SHIFT)	/* set = file, clear = anon */
+#define MARIE_PFN_TYPE_MASK		MARIE_PFN_TYPE_FILE
+
+#define MARIE_PFN_TRACKED_SHIFT		7	/* bit 7 */
+#define MARIE_PFN_TRACKED		(1U << MARIE_PFN_TRACKED_SHIFT)
+
+/*
+ * Encode @zone (folio_zonenum result) into the byte's 2-bit zone field.
+ * Masks to the low MARIE_PFN_ZONE_BITS bits, so ZONE_DEVICE etc. alias a
+ * lower zone code instead of spilling into TYPE; in practice those
+ * zones do not reach Marie's install path.
+ */
+static inline u8 marie_pfn_zone_bits(unsigned int zone)
+{
+	zone &= MARIE_PFN_NR_ZONES_ENCODED - 1;	/* keep the low zone bits */
+	return (u8)(zone << MARIE_PFN_ZONE_SHIFT);
+}
+
+/* Forward declaration: struct marie_lruvec is defined further down in
+ * this file but referenced by some declarations below. */
+struct marie_lruvec;
+
+/* The base allocation (subsys_initcall) covers totalram_pages PFNs. */
+extern u8 *marie_state;
+extern unsigned long marie_state_size;
+
+/*
+ * Per-type head_gen (0..MARIE_PFN_NR_GENS - 1, cycling). anon and file
+ * have independent gen rings so the per-type pressure dial (swappiness)
+ * keeps its meaning and the scan / aging triggers operate on disjoint
+ * counters.
+ */
+extern atomic_t marie_head_gen[2 /* ANON_AND_FILE */];
+
+/*
+ * Per-(gen, type) install gauge; drives the install-cadence aging
+ * trigger (cross-CPU sum > MARIE_AGING_THRESHOLD => try_advance_head).
+ *
+ * PER-CPU (not a global atomic): the install hot path does this_cpu_inc
+ * with no shared cacheline, so concurrent installs from different
+ * lruvecs (different lru_locks) no longer contend a single global line.
+ * Advisory only -- a lost/raced bump merely nudges aging cadence -- so
+ * the approximate per-CPU sum read at the throttled advance check (gated
+ * by the per-CPU marie_aging_tick) is sufficient.
+ */
+DECLARE_PER_CPU(long[MARIE_PFN_NR_GENS][ANON_AND_FILE], marie_gen_installs_pc);
+DECLARE_PER_CPU(unsigned int[ANON_AND_FILE], marie_aging_tick);
+
+/*
+ * Per-(gen, type) live folio count. Bumped on install, decremented on
+ * del / promote-out. Drives:
+ *   - the drain-wait gate (next gen empty => advance allowed)
+ *   - the reclaim-driven aging trigger (occupied gen count < 2 at
+ *     shrink_lruvec entry => try_advance_head)
+ */
+extern atomic_long_t marie_gen_occupied[MARIE_PFN_NR_GENS][2 /* ANON_AND_FILE */];
+
+/*
+ * Per-(gen, type) walker visit counter. Walker pass-end bumps every
+ * gen of the visited type; gen advance resets the new head's slot to
+ * zero. Reclaim reads marie_gen_walker_visits[oldest][type] >= 1 to
+ * decide ignore_references=true on shrink_folio_list (cold-confirmed
+ * gens skip the rmap walk). Hint only — no correctness dependency.
+ */
+extern atomic_t marie_gen_walker_visits[MARIE_PFN_NR_GENS][2 /* ANON_AND_FILE */];
+
+/*
+ * Per-(type, gen, tier) tracking bitmap. One struct marie_bitmap per
+ * (type, gen, tier) tuple, each holding:
+ *   - L1: per-PFN bit (BITS_TO_LONGS(max_pfn) words, ~256 KiB / bitmap
+ *         on an 8 GiB system; 2 x 4 x 4 = 32 bitmaps = ~8 MiB total)
+ *   - L2: 512-bit summary over the same PFN space (64 B / bitmap)
+ *   - per-cell refcount: 512 atomic_t per bitmap (2 KiB / bitmap), so
+ *     the L2 bit transitions track L1 occupancy exactly via the 0 <->
+ *     1 refcount boundary.
+ *
+ * Scanners walk one (type, gen, tier) bitmap at a time; the L2 plane
+ * provides a 512-way fast-skip over empty 32 MiB ranges.
+ *
+ * struct + operations are defined in mm/lru_marie/bitmap.{h,c} and
+ * are also used by the per-memcg plane (struct marie_memcg_bm).
+ */
+extern struct marie_bitmap marie_track_bm[2 /* ANON_AND_FILE */]
+					 [MARIE_PFN_NR_GENS]
+					 [MARIE_PFN_NR_TIERS];
+
+/*
+ * clean_min_ratio: minimum file-pagecache reserve as percent of
+ * node_present_pages. Sysfs-tunable in core.c, read by reclaim.
+ * Default 15 (le9uo recommendation for desktop).
+ */
+extern unsigned int marie_clean_min_ratio;
+
+/*
+ * Aging trigger threshold: per (gen, type), once the install counter
+ * crosses this the head_gen cmpxchg-advances (gated on drain-wait:
+ * next gen must be empty). Mirrors the legacy reader's
+ * marie_gen_growth_threshold floor (SWAP_CLUSTER_MAX << 8).
+ */
+#define MARIE_AGING_THRESHOLD	8192
+
+/*
+ * marie_try_advance_head - cycle the per-type head_gen by one slot
+ * iff the next slot has been fully drained.
+ *
+ * Drain-wait: the advance fires only when marie_gen_occupied[next][type]
+ * reads zero. This is the workingset protection borrowed from legacy
+ * Marie -- reclaim must drain the oldest gen before aging can rotate
+ * its slot back to head.
+ *
+ * "Drained" means gen_occupied==0, which the reclaim isolate path reaches
+ * at isolate (marie_evict_counters_only retires the slot), NOT at free.
+ * So in-flight isolated folios may still carry @next in their per-PFN
+ * byte when this resets @next's bitmap. That is safe: their scan bits
+ * were already cleared at isolate, so bm_reset only touches already-clear
+ * bits, and their deferred byte teardown (marie_state_drop_pfn_at_free)
+ * is gated on marie_bm_test, so it will not double-decrement the slot's
+ * l2_count after the reset. New installs at the recycled @next therefore
+ * cannot collide with the retired old-life bits.
+ *
+ * Concurrency: the cmpxchg ensures exactly one writer advances per
+ * head transition; losers see the new head on their next read. The
+ * gen_installs / walker_visits resets for the new slot are
+ * benign-on-race (a concurrent install/walker bump can be lost --
+ * both counters are advisory hints, not correctness primitives).
+ *
+ * Triggered from both:
+ *   - install-cadence (gen_installs > MARIE_AGING_THRESHOLD)
+ *   - reclaim-driven (occupied gen count < 2 at shrink_lruvec entry,
+ *     see design.h section 4)
+ */
+static inline void marie_try_advance_head(int type)
+{
+	u8 head = (u8)atomic_read(&marie_head_gen[type]);
+	u8 next = (head + 1) & (MARIE_PFN_NR_GENS - 1);
+
+	if (atomic_long_read(&marie_gen_occupied[next][type]) != 0)
+		return;
+
+	/*
+	 * Reset each tier-bitmap of the slot we are about to recycle so
+	 * residue from its previous lifetime doesn't suppress the next
+	 * install's 0 -> 1 cell_count transition. [type][next][*] is a
+	 * per-type stripe, so the OTHER type's folios are not disturbed.
+	 *
+	 * Done BEFORE the cmpxchg: installs pick their slot from
+	 * head_gen[type], so none targets @next until a cmpxchg lands.
+	 * NOTE(review): a racing caller that already advanced to @next
+	 * would make this reset hit live bits -- confirm callers serialise.
+	 */
+	{
+		int tier_idx;
+
+		for (tier_idx = 0; tier_idx < MARIE_PFN_NR_TIERS; tier_idx++)
+			marie_bm_reset(&marie_track_bm[type][next][tier_idx]);
+		smp_wmb();
+	}
+
+	if (atomic_cmpxchg(&marie_head_gen[type], head, next) != head)
+		return;
+
+	/*
+	 * Zero the recycled slot's per-CPU install gauge across all CPUs.
+	 * Rare (only on a successful head advance) and benign-on-race with
+	 * concurrent this_cpu_inc on other CPUs (advisory hint). Iterates
+	 * possible (not merely online) CPUs so a parked CPU's residue can't
+	 * survive into the slot's next lifetime and re-trip the trigger.
+	 */
+	{
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			per_cpu(marie_gen_installs_pc, cpu)[next][type] = 0;
+	}
+	atomic_set(&marie_gen_walker_visits[next][type], 0);
+}
+
+/*
+ * marie_find_oldest_occupied - return the oldest live gen for @type.
+ *
+ * Walks the ring forward from (head + 1) & 3 to (head + 3) & 3 and
+ * returns the first slot with marie_gen_occupied > 0. This is the
+ * legitimate "oldest install epoch" -- the (head + 1) & 3 shorthand
+ * assumes a fully-occupied 4-gen ring, which does NOT hold during
+ * boot or any sparse-ring window. Returns -1 if only head (or
+ * nothing) is occupied.
+ *
+ * Concretely the bug a fixed (head + 1) & 3 hit: after the
+ * reclaim-driven trigger advances head into an empty slot, the
+ * previous gen still holds the only live data, but (head + 1) & 3
+ * points two slots past it -- scan returns zero forever despite
+ * GiB of reclaimable folios.
+ *
+ * 3 atomic_long_read per call; only runs at scan entry, never on a
+ * per-candidate hot path.
+ */
+static inline int marie_find_oldest_occupied(int type)
+{
+	int gen = atomic_read(&marie_head_gen[type]);
+	int step;
+
+	/* Probe the slots after head in ring order; head itself is skipped. */
+	for (step = 1; step < MARIE_PFN_NR_GENS; step++) {
+		gen = (gen + 1) & (MARIE_PFN_NR_GENS - 1);
+		if (atomic_long_read(&marie_gen_occupied[gen][type]) > 0)
+			return gen;
+	}
+	return -1;
+}
+
+/*
+ * Install-side aging trigger with dynamic threshold.
+ *
+ * Threshold = max(MARIE_AGING_THRESHOLD, total_occupied_for_type / 8)
+ *
+ * The dynamic leg scales the trigger with the workload's actual
+ * footprint -- with NR_GENS = 4 the "balanced ring" target is
+ * total / 4 per gen, so cutting at total / 8 (half of that) advances
+ * often enough that non-head gens always carry a healthy reclaim-
+ * candidate buffer.
+ *
+ * Sampled at every 1024th install via a PER-CPU tick (marie_aging_tick)
+ * rather than the old (cnt & 1023) gate on a global atomic: the gauge is
+ * now per-CPU, so there is no cheap global running count to mask on.
+ * this_cpu_inc_return touches only this CPU's line, adding no contention.
+ * The aggregate check rate is unchanged (~1 per 1024 installs), and only
+ * on that 1-in-1024 slow path do we pay the O(nr_cpus) cross-CPU sum of
+ * the gauge -- amortised to nothing on the install hot path. Sampling
+ * delays the first cross-threshold advance by at most ~1023 installs,
+ * invisible against the ~32 MiB+ threshold window.
+ *
+ * Below the floor (sum <= MARIE_AGING_THRESHOLD) the helper exits
+ * immediately -- advance is impossible regardless of total_occupied.
+ */
+static inline void marie_install_advance_hook(int type)
+{
+	unsigned long limit = MARIE_AGING_THRESHOLD;
+	long installs = 0, live = 0;
+	int gen, cpu, head;
+
+	/* Per-CPU gate: only every 1024th install on this CPU continues. */
+	if (likely(this_cpu_inc_return(marie_aging_tick[type]) & 1023))
+		return;
+
+	/*
+	 * Slow path (1 in 1024): approximate the install count of the
+	 * current head gen by summing the per-CPU gauge.
+	 */
+	head = atomic_read(&marie_head_gen[type]);
+	for_each_online_cpu(cpu)
+		installs += per_cpu(marie_gen_installs_pc, cpu)[head][type];
+
+	/* At or below the static floor no advance is possible. */
+	if (installs <= MARIE_AGING_THRESHOLD)
+		return;
+
+	for (gen = 0; gen < MARIE_PFN_NR_GENS; gen++)
+		live += atomic_long_read(&marie_gen_occupied[gen][type]);
+
+	/* Dynamic leg: raise the limit to live / 8 when that is larger. */
+	if (live > 0 && ((unsigned long)live >> 3) > limit)
+		limit = (unsigned long)live >> 3;
+
+	if ((unsigned long)installs > limit)
+		marie_try_advance_head(type);
+}
+
+/*
+ * Per-PFN isolate scan uses an L2-pruned, range-locked walk rather
+ * than a per-CPU cursor -- see marie_state_isolate_scan_l2lock below
+ * for the parallelism model (try-lock on per-L2-bit range locks gives
+ * 512-way exclusion across concurrent reclaimers).
+ */
+#include <linux/percpu.h>
+/* One-shot init from marie_init(). Allocates marie_state with kvmalloc. */
+int marie_state_init(void);
+/* Detect CPUID-based prefetch ring parameters. Call before marie_state_init(). */
+void marie_prefetch_params_init(void);
+
+struct pglist_data;
+struct folio;
+struct lruvec;
+struct scan_control;
+struct mem_cgroup;
+
+/*
+ * L2-lock parallel isolate scan: collapses the 512-bit outer L2
+ * walk to an 8-word loop, word-ANDing the global (type, gen, tier)
+ * L2 with the per-memcg L2 to skip empty PFN ranges in one cycle
+ * each; surviving L2 bits are taken under a try_lock for exclusive
+ * PFN-range ownership before the inner producer extracts
+ * candidates via __ffs/blsr + the L1 word-AND with mbm.
+ *
+ * @target_memcg: non-NULL only for cgroup-targeted (child memcg)
+ * reclaim; NULL means root reclaim and the memcg word-AND is
+ * skipped (every Marie folio in scope).
+ */
+unsigned long marie_state_isolate_scan_l2lock(struct pglist_data *pgdat,
+					      int type, int max_zone,
+					      unsigned int tier,
+					      struct mem_cgroup *target_memcg,
+					      struct folio **batch,
+					      unsigned long batch_size,
+					      unsigned long nr_to_scan);
+
+/*
+ * Per-PFN-array reclaim driver. Walks (type, tier) via
+ * marie_state_isolate_scan_l2lock, claims each candidate via
+ * try_get + test_clear_lru, hands the resulting folio_list to
+ * shrink_folio_list, and putbacks any survivors. Sole reclaim
+ * driver in PFN-only Marie.
+ */
+unsigned int marie_state_shrink_lruvec(struct lruvec *lruvec,
+				       struct scan_control *sc);
+
+/*
+ * Marie type-pick return codes for marie_swap_pick_type().
+ *
+ *   MARIE_PICK_FILE_STRICT  swappiness=0:   FILE only, no ANON fallback;
+ *                                           caller proceeds to OOM if FILE
+ *                                           is depleted.
+ *   MARIE_PICK_ANON_STRICT  swappiness=MAX: ANON only, no FILE fallback.
+ *   MARIE_PICK_FILE_THEN_ANON  swappiness=1: FILE first; ANON engages
+ *                                            ONLY when skip_file is set
+ *                                            (clean_min_ratio breached).
+ *                                            Per-call transient FILE
+ *                                            failures do not promote to
+ *                                            ANON -- the floor itself is
+ *                                            the sole depletion signal.
+ *   MARIE_PICK_ANON_FIRST   Proportional regime (s=2..199), bias picks
+ *                           ANON. SINGLE type per call -- scanning the
+ *                           other side would dissolve the s:(MAX-s)
+ *                           page-flow ratio. Bias gets updated from
+ *                           this call's outcome, possibly flipping the
+ *                           pick for the next shrink_lruvec call.
+ *   MARIE_PICK_FILE_FIRST   Symmetric to ANON_FIRST: bias picks FILE,
+ *                           single type per call.
+ */
+enum marie_pick_kind {
+	MARIE_PICK_FILE_STRICT,		/* swappiness == 0 */
+	MARIE_PICK_ANON_STRICT,		/* swappiness == MAX */
+	MARIE_PICK_FILE_THEN_ANON,	/* swappiness == 1 */
+	MARIE_PICK_ANON_FIRST,		/* proportional regime, bias picks ANON */
+	MARIE_PICK_FILE_FIRST,		/* proportional regime, bias picks FILE */
+};
+
+/*
+ * Resolve the type-pick policy for one shrink_lruvec invocation.
+ *
+ * Pure read of the controller state: looks at @swappiness to detect
+ * the {0, 1, MAX_SWAPPINESS} special values, otherwise reads
+ * mlv->swap_bias sign to pick the primary type for the proportional
+ * regime. Does not modify any state.
+ *
+ * @mlv may be NULL when the lruvec has no marie_lruvec yet (alloc
+ * failure path); the helper falls back to MARIE_PICK_ANON_FIRST so
+ * the caller iterates both types in the legacy order without any
+ * bias tracking.
+ */
+enum marie_pick_kind marie_swap_pick_type(struct marie_lruvec *mlv,
+					  u8 swappiness);
+
+/*
+ * Apply the bias-controller update for one ATTEMPTED pick.
+ *
+ *   nr_reclaimed > 0  -> bias += sign * nr_reclaimed * weight
+ *                        Page-flow proportional: long-run
+ *                        pages(anon):pages(file) -> s:(MAX-s) even
+ *                        when per-pick batches differ between types.
+ *
+ *   nr_reclaimed == 0 -> no-op (bias unchanged)
+ *                        Failure carries no back-pressure. The
+ *                        picked side stays the picked side
+ *                        indefinitely under sustained failure;
+ *                        anon is not surrendered just because file
+ *                        is transiently or persistently stuck.
+ *
+ *   sign   = -1 for picked=ANON (push toward FILE)
+ *            +1 for picked=FILE (push toward ANON)
+ *   weight = MAX_SWAPPINESS - s   for picked=ANON
+ *          = s                    for picked=FILE
+ *
+ * Bypassed entirely under special-value swappiness (0, 1, MAX),
+ * where the pick is deterministic and the bias is not consulted;
+ * also a no-op when @mlv is NULL (lruvec alloc-failure path).
+ *
+ * Caller MUST only invoke when the pick was actually attempted;
+ * do NOT call when an external override (skip_file from
+ * clean_min_ratio) blocked the picked type before the scan ran.
+ */
+void marie_swap_bias_update(struct marie_lruvec *mlv,
+			    int picked_type,
+			    unsigned long nr_reclaimed,
+			    u8 swappiness);
+
+/*
+ * Per-memcg L1/L2 bitmap pair. Allocated for every non-root memcg
+ * at memcg create; install/del maintain (type, gen, tier)-agnostic
+ * occupancy at PFN granularity (L1) and 32 MiB-range granularity
+ * (L2 via per-bit l2_count refcounter). Scan AND's both planes
+ * with the global (type, gen, tier) L1/L2 to restrict iteration
+ * to (type, gen, tier) ∩ memcg at source.
+ */
+void marie_memcg_bitmap_free(struct mem_cgroup *memcg);
+void marie_memcg_bitmap_set(struct mem_cgroup *memcg, unsigned long pfn);
+void marie_memcg_bitmap_clear(struct mem_cgroup *memcg, unsigned long pfn);
+unsigned long *marie_memcg_bitmap_get(struct mem_cgroup *memcg);
+unsigned long *marie_memcg_bitmap_get_l2(struct mem_cgroup *memcg);
+void marie_memcg_bitmap_merge(struct mem_cgroup *parent,
+			      struct mem_cgroup *child);
+
+/*
+ * Saturating tier increment for a Marie-tracked folio's per-PFN byte.
+ *
+ * Non-saturated bump (tier < MAX) is a best-effort race-tolerant
+ * WRITE_ONCE; losing a bump to a concurrent racer is benign because
+ * tier is a hotness hint, not a correctness primitive.
+ *
+ * Saturated bump (tier == MAX) is in-place promote: the folio's GEN
+ * field is CAS-moved to atomic_read(&marie_head_gen[type]) with TIER
+ * reset to 0. The CAS guards against concurrent del and against
+ * another walker promoting the same PFN.
+ *
+ * Skips quietly if the folio is not (or no longer) tracked, or if a
+ * saturated folio is already encoded on the head gen.
+ */
+void marie_state_inc_tier(unsigned long pfn);
+
+/*
+ * marie_state_move_to_gen - relocate a tracked PFN to (@target_gen,
+ * @target_tier) in the per-PFN byte, with matched bitmap / occupied
+ * counter updates on both source and destination (gen, type) planes.
+ *
+ * Single point of policy for any operation that needs to move a folio
+ * between gens. Two callers in design.h:
+ *   - walker tier saturate (section 7):
+ *       marie_state_move_to_gen(pfn, head, 0)
+ *   - shrink_folio_list residue putback (section 13):
+ *       marie_state_move_to_gen(pfn, (head + 2) & 3, max(prev, w))
+ *
+ * The state-byte cmpxchg defeats races against del / another
+ * concurrent move. The bitmap / counter shuffle uses "new first, then
+ * old" ordering so the folio remains visible to scan on at least one
+ * (gen, type) plane throughout the transition.
+ *
+ * No-op if the folio is no longer tracked or already encodes the
+ * target (gen, tier).
+ */
+void marie_state_move_to_gen(unsigned long pfn, u8 target_gen, u8 target_tier);
+
+struct folio;
+/*
+ * marie_state_drop_pfn - wipe every per-PFN tracking artifact
+ * (state byte, (type, gen, tier) L1 bit, occupancy counter, per-
+ * memcg L1/L2/l2_count, global L2 range counter with bulk L2 clear
+ * on 0) for @folio. Shared by the normal evict path
+ * (marie_evict_locked) and the enable=0 drain path
+ * (marie_drain_one_lruvec) so disable->enable cycles never leave
+ * ghost per-PFN state behind. No-op when the byte is not TRACKED.
+ */
+void marie_state_drop_pfn(struct folio *folio);
+
+
+/* --- per-lruvec residency state and install/evict surface --- */
+#ifdef CONFIG_LRU_MARIE
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/gfp_types.h>
+#include <linux/hash.h>
+#include <linux/irqflags.h>
+#include <linux/list.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+
+#include <linux/mm_inline.h>
+#include <linux/mmzone.h>
+#include <linux/percpu.h>
+#include <linux/xarray.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>		/* SWAP_CLUSTER_MAX, ANON_AND_FILE */
+#include <linux/types.h>
+#include <linux/vmstat.h>
+
+struct folio;
+struct lruvec;
+struct mem_cgroup;
+struct marie_lruvec;
+struct marie_gen;
+
+/*
+ * ---------------------------------------------------------------------
+ *  Per-folio state inspection (internal)
+ * ---------------------------------------------------------------------
+ *
+ * Reads of the per-PFN state byte are lock-free (READ_ONCE). Writes
+ * go through state.c helpers (marie_state_inc_tier,
+ * marie_state_move_to_gen, marie_state_drop_pfn); the byte is the
+ * single source of truth for Marie's per-folio state.
+ *
+ * folio->lru is not interpreted as part of Marie's state -- folios
+ * are never linked from a Marie-owned list. It exists only so legacy
+ * LRU can attach drained folios via lruvec->lists[lru] (handed off by
+ * marie_drain_pfn_locked when Marie is disabled).
+ */
+
+/**
+ * folio_marie_test_tracked - is @folio claimed by Marie?
+ *
+ * Reads the per-PFN state byte (the single source of truth in the
+ * per-PFN paradigm). folio->flags carries no Marie state.
+ */
+static inline bool folio_marie_test_tracked(const struct folio *folio)
+{
+	const unsigned long idx = folio_pfn((struct folio *)folio);
+
+	if (marie_state && idx < marie_state_size)
+		return !!(READ_ONCE(marie_state[idx]) & MARIE_PFN_TRACKED);
+	return false;	/* state array absent, or PFN beyond its end */
+}
+
+/*
+ * folio_marie_get_tier is declared in <linux/lru_marie.h>
+ * so callers outside mm/lru_marie/ (e.g. mm/vmscan.c
+ * folio_check_references) can read tier without including this
+ * private header.
+ *
+ * Tier bumps go through marie_state_inc_tier (defined in state.c) -- the
+ * per-PFN state byte is the only place tier lives.
+ */
+
+/*
+ * marie_folio_lruvec_rcu - RCU-bracketed folio_lruvec() for Marie hot paths.
+ *
+ * folio_lruvec() reaches obj_cgroup_memcg() which has a lockdep predicate
+ * requiring rcu_read_lock or cgroup_mutex. Marie's drain and walker paths
+ * run under mlv->lock (which disables preemption) but NOT under
+ * rcu_read_lock(); preempt-disable does not satisfy the lockdep
+ * predicate. The brief RCU bracket avoids the WARN trip; the returned
+ * pointer is used only for equality comparison or as an xarray key,
+ * never dereferenced after rcu_read_unlock().
+ */
+static inline struct lruvec *marie_folio_lruvec_rcu(struct folio *folio)
+{
+	/*
+	 * Scope-bound RCU read section: released after folio_lruvec()
+	 * has produced the return value.
+	 */
+	guard(rcu)();
+	return folio_lruvec(folio);
+}
+
+/*
+ * marie_update_lru_size - Marie counterpart to legacy update_lru_size().
+ *
+ * Updates the shared global vmstat / zone counters AND the per-memcg
+ * mz->lru_zone_size, exactly mirroring the legacy update_lru_size()
+ * wrapper (__update_lru_size + mem_cgroup_update_lru_size). Marie
+ * credits mz->lru_zone_size at install and debits it at evict, so a
+ * Marie-tracked folio is counted in mz the same way a legacy/MGLRU
+ * folio is. lruvec_lru_size() therefore reads mz directly with no
+ * Marie-specific summing.
+ *
+ * Unified accounting (was: Marie owned a private mlv->marie_lru_zone_size
+ * and skipped mz): keeping mz authoritative for Marie folios too means
+ * any del path -- Marie's own evict, or a legacy lruvec_del_folio that
+ * fires on a folio whose TRACKED bit was already cleared -- finds the
+ * matching +nr Marie credited at install. The old split made
+ * "added via Marie (no mz +1), del'd via legacy (mz -1)" an
+ * mz->lru_zone_size underflow; crediting mz at install closes that
+ * asymmetry structurally. mlv->marie_lru_zone_size survives only as
+ * Marie's internal per-bucket tally (reparent transfer); it no longer
+ * feeds size reads.
+ *
+ * Marie<->legacy list transitions (drain to legacy, adopt from legacy)
+ * are mz-neutral: the folio stays counted in mz across the list_move,
+ * so those paths must NOT re-credit/re-debit mz (see
+ * marie_drain_pfn_locked).
+ *
+ * Caller MUST hold lruvec->lru_lock. mod_lruvec_state's per-CPU fold
+ * and __mod_zone_page_state's per-zone counter are documented as
+ * lru_lock-protected against concurrent updaters of the same lruvec.
+ */
+static inline void marie_update_lru_size(struct lruvec *lruvec,
+				       enum lru_list lru,
+				       enum zone_type zid,
+				       long nr_pages)
+{
+	struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+
+	/* Per-lruvec LRU stat first, then the matching per-zone counter. */
+	mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+	__mod_zone_page_state(zone, NR_ZONE_LRU_BASE + lru, nr_pages);
+#ifdef CONFIG_MEMCG
+	/* The same +/- nr_pages also goes to the memcg per-zone size (mz). */
+	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
+#endif
+}
+
+/*
+ * Global folio counter, lives in mm/lru_marie/core.c for stats_show;
+ * the install/evict helpers in state.c percpu_counter_add it during
+ * Marie's TRACKED 0<->1 transitions.
+ */
+extern struct percpu_counter marie_nr_folios;
+
+/*
+ * marie_pc_add - Marie-private percpu_counter add that elides the
+ * outer preempt_disable / preempt_enable bracket of
+ * percpu_counter_add_batch() while preserving its IRQ safety.
+ *
+ * percpu_counter_add_batch() wraps the whole body in
+ * preempt_disable/enable. Under DEBUG_PREEMPT that bracket shows up in
+ * perf under 16-thread memhog as ~4 % of total CPU (preempt_count_add +
+ * check_preemption_disabled). We drop it because the individual
+ * this_cpu_* primitives used here are each self-contained: this_cpu_add
+ * is a single atomic RMW (one instruction on x86), and the slow-path
+ * fbc->lock section takes raw_spin_lock_irqsave, so correctness does
+ * not depend on the caller's preempt or IRQ state.
+ *
+ * IRQ safety is MANDATORY, not optional: not every caller holds
+ * lru_lock. The reclaim isolate path (marie_evict_counters_only) and
+ * the survivor putback in marie_state_shrink_lruvec update the GLOBAL
+ * marie_nr_folios counter with IRQs ENABLED (preempt_disable only).
+ * The same counter is also bumped from IRQ/softirq context when a
+ * Marie-tracked LRU folio's last reference is dropped
+ * (folio_put -> __page_cache_release -> lruvec_del_folio ->
+ * lru_marie_del_folio -> marie_evict_locked). If the flush path used a
+ * plain raw_spin_lock, a softirq landing on the CPU that already holds
+ * fbc->lock would spin forever on it -> hard lockup. Hence
+ * raw_spin_lock_irqsave below, exactly as percpu_counter_add_batch does.
+ *
+ * The fast path uses this_cpu_add (atomic against same-CPU IRQ
+ * reentrancy); the earlier __this_cpu_read + __this_cpu_write pair was
+ * a non-atomic RMW that could lose an IRQ-context update.
+ */
+static inline void marie_pc_add(struct percpu_counter *fbc, s64 amount)
+{
+	s64 count = this_cpu_read(*fbc->counters) + amount;	/* unlocked peek */
+
+	if (unlikely(abs(count) >= percpu_counter_batch)) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&fbc->lock, flags);
+		count = __this_cpu_read(*fbc->counters) + amount; /* re-read, IRQs off */
+		fbc->count += count;	/* fold local delta + amount into global */
+		__this_cpu_sub(*fbc->counters, count - amount);	/* local part is now folded */
+		raw_spin_unlock_irqrestore(&fbc->lock, flags);
+	} else {
+		this_cpu_add(*fbc->counters, amount);	/* below batch: stay per-CPU */
+	}
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  Install / evict — per-folio TRACKED 0 <-> 1 with lru_lock held
+ * ---------------------------------------------------------------------
+ *
+ * Marie's per-folio state is one bit: TRACKED in marie_state[pfn].
+ * Synchronous install/evict helpers own all the bookkeeping (per-PFN
+ * state byte, global per-(type, gen, tier) and per-memcg bitmaps,
+ * per-mlv / global percpu_counters, lru_size mirror, PG_active /
+ * PG_lru hygiene):
+ *
+ *   marie_folio_install:            TRACKED 0 -> 1
+ *                                   unified fresh install for both small
+ *                                   folios and THP; declared in
+ *                                   pfn_install.h
+ *   marie_state_publish_at_gen:     TRACKED stays, (gen, tier) refreshed
+ *                                   reclaim survivor putback
+ *   marie_evict_locked:             TRACKED 1 -> 0
+ *                                   called from marie_del_folio_locked
+ *
+ * folio_marie_test_tracked() is the lock-free state inspector: it
+ * reads marie_state[pfn] & MARIE_PFN_TRACKED, returning whether
+ * Marie owns @folio. The binary state is checked directly at each
+ * callsite -- no intermediate dispatch machinery.
+ */
+bool marie_evict_locked(struct marie_lruvec *mlv, struct folio *folio);
+
+/*
+ * Reclaim isolate path: counters-only decrement at claim time. The per-
+ * PFN state byte's TRACKED bit intentionally stays set throughout
+ * shrink_folio_list so marie_folio_install's TRACKED early-out blocks any
+ * concurrent install from setting PG_lru on a folio currently in the
+ * reclaim list. The scan-bitmap slot + gen_occupied ARE retired here (an
+ * isolated folio is no longer a scan candidate); the TRACKED byte is
+ * wiped later -- at the buddy free hook (marie_state_drop_pfn_at_free)
+ * for a reclaimed folio, or re-published by marie_state_publish_at_gen at
+ * putback for a survivor. See state.c body for the full rationale.
+ */
+void marie_evict_counters_only(struct folio *folio);
+
+/*
+ * Canonical per-PFN state teardown invoked from
+ * mm/page_alloc.c::free_pages_prepare at every page's buddy handoff.
+ * Wipes the per-PFN state byte / bitmap / gen_occupied slot whenever
+ * the byte still carries TRACKED. No-op on already-cleared state.
+ * Counters are NOT touched (they were balanced upstream by Marie's
+ * del path or by marie_evict_counters_only).
+ *
+ * Lock-free; safe from any context.
+ */
+void marie_state_drop_pfn_at_free(unsigned long pfn);
+
+/* marie_folio_install lives in pfn_install.h. */
+
+/* Lazy lookup-or-allocate of the Marie state for @lv. */
+struct marie_lruvec *marie_get_lruvec(struct lruvec *lv);
+
+/*
+ * Adaptive batch threshold. Returns the per-call page accumulator cap,
+ * lerped between MARIE_PFN_BATCH_FLOOR (low pressure,
+ * sc->priority == DEF_PRIORITY) and MARIE_PFN_SHRINK_BATCH (max
+ * pressure, sc->priority == 0). Defined in state.c as
+ * marie_pfn_batch_threshold; this declaration is the public name.
+ */
+struct scan_control;
+unsigned long marie_adaptive_batch_threshold(struct scan_control *sc);
+
+/**
+ * marie_del_folio_locked - lru_marie_del_folio body.
+ * @mlv:               residency set
+ * @folio:            folio to remove (any Marie-tracked state)
+ *
+ * Universal external-removal handler called from lru_marie_del_folio when
+ * lruvec_del_folio fires from outside Marie (compaction, lru_activate
+ * batch drain, __page_cache_release after the last folio_put). If the
+ * folio is TRACKED, calls marie_evict_locked to run the full eviction
+ * (per-PFN state wipe + counter decrements + lru_size mirror). If the
+ * folio is no longer TRACKED, returns true defensively (treated as
+ * "Marie already removed it").
+ *
+ * Returns true for a tracked folio and (per the paragraph above; confirm in
+ * state.c) for an already-untracked one, so the caller can fall through.
+ * Counter wind-down, including the single marie_nr_folios -1, is owned by
+ * marie_evict_locked via marie_account_evict; the caller adds none.
+ */
+bool marie_del_folio_locked(struct marie_lruvec *mlv, struct folio *folio);
+
+/*
+ * Walker hot-path tier promotion result codes. Kept for ABI; the
+ * current implementation always returns MARIE_TIER_INC_OK because
+ * marie_state_inc_tier handles saturation in-place (synchronous
+ * move-to-head_gen). The walker discards the return value.
+ */
+enum marie_tier_inc_result {
+	MARIE_TIER_INC_OK = 0,		/* only value currently returned */
+	MARIE_TIER_INC_SATURATED,	/* reserved, no longer produced */
+	MARIE_TIER_INC_FAILED,		/* reserved, no longer produced */
+};
+
+/**
+ * marie_tier_inc - walker-side hot path: bump @folio's tier.
+ * @mlv:    residency set (unused by the current implementation; kept
+ *         for ABI compatibility)
+ * @folio: subject folio (must be Marie-tracked or the call is a no-op)
+ *
+ * Thin wrapper around marie_state_inc_tier(folio_pfn(folio)). The
+ * per-PFN saturate path is synchronous -- when tier == MAX the helper
+ * calls marie_state_move_to_gen(pfn, head, 0) directly, so there is no
+ * deferred promote queue and no enqueue allocation that could fail.
+ * The single caller (walker.c marie_walk_pmd_range) discards the
+ * return value.
+ */
+enum marie_tier_inc_result marie_tier_inc(struct marie_lruvec *mlv,
+						struct folio *folio);
+
+/*
+ * Tier count: alias to the per-PFN state byte's tier field width.
+ * Tier lives entirely in marie_state[pfn]'s MARIE_PFN_TIER field
+ * (see state.h: MARIE_PFN_NR_TIERS / MARIE_PFN_TIER_MAX). The
+ * aliases keep call sites (overflow buffer sizing, tier-loop bounds)
+ * readable without rewriting them all.
+ *
+ * Tier 0 = "never touched since added"; tier MARIE_TIER_MAX = saturated
+ * (further young hits trigger a sync promote to head_gen via
+ * marie_state_inc_tier).
+ */
+#define MARIE_NR_TIERS  MARIE_PFN_NR_TIERS
+#define MARIE_TIER_MAX  MARIE_PFN_TIER_MAX
+
+/*
+ * Reclaim-side batch size — fallback compile-time constant used by
+ * a few non-hot-path call sites. The per-PFN scan path uses
+ * MARIE_PFN_FALLBACK_BATCH / MARIE_PFN_SHRINK_BATCH (see state.c).
+ */
+#define MARIE_ISOLATE_BATCH SWAP_CLUSTER_MAX
+
+/*
+ * Allocation-side aging trigger threshold (per head gen installs)
+ * lives in mm/lru_marie/core.c as marie_gen_growth_threshold and is
+ * runtime-tunable via /sys/kernel/mm/lru_marie/gen_growth_threshold.
+ * Default 8192 pages (= MARIE_ISOLATE_BATCH << 8, i.e. 32 MiB).
+ * marie_install_advance_hook combines this with a dynamic
+ * total_occupied / 8 leg to drive marie_try_advance_head.
+ */
+
+/*
+ * ---------------------------------------------------------------------
+ *  data structures
+ * ---------------------------------------------------------------------
+ *
+ * Per-type independence is fundamental: anon and file each have their
+ * own per-type lock and their own slice of the global per-(type, gen,
+ * tier) bitmap / counter arrays. vm.swappiness controls only the
+ * eviction proportion between types; aging on one type never forces
+ * work on the other.
+ *
+ * The per-PFN state byte carries the zone field, so per-zone filtering
+ * is part of the scan mask -- no per-zone data structure is needed
+ * (matching the existing NR_LRU_LISTS / zone semantics).
+ */
+
+struct marie_type {
+	/*
+	 * @type_lock serialises per-type operations that need to be
+	 * mutually exclusive across CPUs (drain, reparent, fill-from-
+	 * legacy). Hot install/del do not take it -- they update the
+	 * per-PFN state byte and the unified bitmap lock-free.
+	 *
+	 * Cross-type sections take both types' locks in canonical order
+	 * (anon first, file second) via the marie_both_mlv guard.
+	 *
+	 * @type: 0 = anon, 1 = file. Set once at marie_type_init time so
+	 * scoped_guard(marie_type_lock, ...) can recover the type index
+	 * (needed for the per-CPU drain-depth counter) from a bare
+	 * struct marie_type * without an extra argument.
+	 *
+	 * @nr_pages is the total page count for this type on this lruvec,
+	 * read by stats / pick callers. percpu_counter so per-folio writes
+	 * hit the local CPU's diff (no global cache line bouncing) and
+	 * only flush to the global every percpu_counter_batch additions.
+	 */
+	spinlock_t		type_lock;	/* drain / reparent / fill exclusion */
+	int			type;		/* 0 = anon, 1 = file; fixed after init */
+	struct percpu_counter	nr_pages;	/* pages of this type on the lruvec */
+};
+
+struct marie_lruvec {
+	/*
+	 * ---- CL0: small hot read fields ----
+	 *
+	 * Layout intent: pack the few small fields the per-fault and
+	 * per-shrink-batch paths touch into one cacheline at the top, so
+	 * that a fault hitting marie_folio_install or the
+	 * swap-bias pick path pulls @lruvec / @swap_bias / @nid in a
+	 * single line read. The bulky @types[] subtree follows, keeping
+	 * the small hot fields off the same cachelines as the per-type
+	 * locks that another CPU may be hammering.
+	 *
+	 * @lruvec: back-pointer to the legacy lruvec. Read in every path
+	 *   that needs lv->lru_lock — install, del,
+	 *   marie_state_shrink_lruvec, walker pass. Read-only after
+	 *   marie_alloc_lruvec.
+	 *
+	 * @nid: node id. Read-only after init; consumed by walker /
+	 *   shrink stat paths.
+	 *
+	 * @swap_bias: signed bias counter that drives anon-vs-file pick in
+	 *   marie_state_shrink_lruvec under proportional swappiness (2..199).
+	 *   Sign decides the type to scan (>=0 -> ANON, <0 -> FILE).
+	 *
+	 *   Update rule for one ATTEMPTED pick:
+	 *
+	 *     nr_reclaimed > 0  -> bias += sign * nr_reclaimed * weight
+	 *                          (page-flow proportional accumulation)
+	 *     nr_reclaimed = 0  -> bias unchanged
+	 *                          (failure carries NO back-pressure)
+	 *
+	 *   where sign = -1 for picked=ANON, +1 for picked=FILE, and
+	 *   weight = MAX_SWAPPINESS - s for ANON, s for FILE.
+	 *
+	 *   Long-run page-flow under healthy operation:
+	 *     pages(anon) : pages(file) = s : (MAX_SWAPPINESS - s)
+	 *
+	 *   Stubborn protection under failure: zero-reclaim cycles leave
+	 *   the bias untouched, so the picked side stays the picked side
+	 *   indefinitely. Low-swappiness configurations on ZRAM systems
+	 *   depend on this -- anon's working set must remain resident
+	 *   even when file is transiently or persistently stuck on
+	 *   dirty / locked / writeback / depleted state. If file truly
+	 *   cannot be reclaimed, the caller escalates priority or OOM
+	 *   intervenes; the controller does not surrender protection.
+	 *
+	 *   Reset to 0 by lru_marie_swappiness_changed() on sysctl write
+	 *   so stale bias from a previous swappiness regime does not
+	 *   steer the first picks under the new value.
+	 *
+	 *   Special-value swappiness (0, 1, MAX_SWAPPINESS) bypasses the
+	 *   bias entirely at pick time; the field is not consulted, and
+	 *   the update path is short-circuited so the value never drifts.
+	 *   No CAP is applied -- per-cycle delta is bounded by batch_max
+	 *   (~8192) * MAX_SWAPPINESS (200) ~ 1.6e6, far below S64_MAX.
+	 */
+	struct lruvec		*lruvec;
+	int			nid;
+	bool			offline;	/* set under lv->lru_lock at css_offline */
+	atomic64_t		swap_bias;
+
+	/*
+	 * ---- Per-type subtrees ----
+	 *
+	 * types[0] = anon, types[1] = file. Each marie_type carries its
+	 * own per-type exclusive lock (type_lock) and its own
+	 * nr_pages percpu_counter. Cross-type sections acquire both
+	 * locks in canonical order via marie_both_mlv.
+	 */
+	struct marie_type		types[ANON_AND_FILE];
+
+	/*
+	 * ---- Hot write band: per-(lru, zone) Marie page counters ----
+	 *
+	 * Per-(lru, zone) count of Marie-tracked pages on this lruvec,
+	 * written only by Marie. It is Marie's internal tally (e.g. the
+	 * reparent transfer), not the source for LRU size reads: Marie
+	 * also credits/debits the legacy mz->lru_zone_size through
+	 * marie_update_lru_size(), and lruvec_lru_size() reads mz
+	 * directly (see the marie_update_lru_size() comment).
+	 *
+	 * Update sites (paired ±nr): marie_folio_install (install),
+	 * marie_state_publish_at_gen + survivor putback (reclaim
+	 * survivor at non-head gen), marie_evict_locked (evict),
+	 * marie_reparent_locked (reparent),
+	 * marie_fill_one_lruvec (legacy->Marie transition),
+	 * marie_drain_one_lruvec (Marie->legacy transition).
+	 *
+	 * percpu_counter handles concurrent updates without atomic on
+	 * every write: per-folio +/- nr lands in this CPU's local diff,
+	 * and only the periodic flush to the global s64 (every
+	 * percpu_counter_batch operations) touches a shared cacheline.
+	 *
+	 * percpu_counter so per-folio writes hit the local CPU's diff
+	 * instead of contending on the (lru, zone) cell across all CPUs.
+	 * The intra-CL false-sharing concern that justified the previous
+	 * atomic_long array is dissolved: each percpu_counter's hot per-CPU
+	 * storage is allocated separately, the struct itself only holds
+	 * the spinlock + s64 global which is touched only on batch flush.
+	 */
+	struct percpu_counter	marie_lru_zone_size[NR_LRU_LISTS][MAX_NR_ZONES];
+
+	/*
+	 * Deferred legacy mz->lru_zone_size delta from the LOCK-FREE isolate
+	 * paths (marie_account_{install,evict}_isolate). mz->lru_zone_size is a
+	 * non-atomic, lru_lock-protected counter; the isolate paths hold no
+	 * lru_lock, so they MUST NOT RMW it directly (concurrent reclaimers lose
+	 * updates and drift mz negative -- the underflow root cause). Instead
+	 * they accumulate here atomically, and the next LOCKED install/evict on
+	 * the same (lru, zone) drains it into mz under lru_lock
+	 * (marie_mz_drain_locked). The per-CPU-safe shadow (marie_lru_zone_size)
+	 * stays authoritative for the Marie count; mz lags by the un-drained
+	 * pending only, and is reconciled exactly, never raced.
+	 */
+	atomic_long_t		mz_pending[NR_LRU_LISTS][MAX_NR_ZONES];
+
+	/*
+	 * ---- Cold: only walked at memcg teardown ----
+	 *
+	 * @memcg: only used by lru_marie_exit_memcg to enumerate every
+	 * marie_lruvec under a dying memcg for reparent. Never read on
+	 * the per-fault or per-shrink path.
+	 */
+	struct mem_cgroup	*memcg;
+};
+
+/* lifecycle (called from mm/lru_marie/core.c xa lookup path) */
+struct marie_lruvec *marie_alloc_lruvec(struct lruvec *lv, gfp_t gfp);
+void marie_free_lruvec(struct marie_lruvec *mlv);
+
+/*
+ * Per-type re-entrant-drain detection. Caller (lru_marie_del_folio in
+ * mm/lru_marie/core.c) uses marie_in_drain_type(folio's type) to detect "we
+ * are already inside a per-type-locked drain for this folio's type on
+ * this CPU" and skip the scoped_guard re-acquire. The depth counters
+ * are per-CPU statics inside the ADT, mutated by the scoped_guard
+ * lock/unlock body (S5 / per-CPU encapsulation).
+ */
+bool marie_in_drain_type(int type);
+void marie_drain_enter_type(int type);
+void marie_drain_exit_type(int type);
+
+/*
+ * ---------------------------------------------------------------------
+ *  drain helpers
+ * ---------------------------------------------------------------------
+ *
+ * No promote-queue or per-CPU staging drain remains: every install /
+ * evict / tier bump is synchronous (install_local / install_locked
+ * publish per-PFN state inline, evict_locked wipes it inline,
+ * marie_state_inc_tier handles saturation via marie_state_move_to_gen
+ * directly).
+ *
+ * The remaining drain entry, marie_drain_one_lruvec (in core.c), is
+ * only the enable/disable transition: it walks the per-(type, gen,
+ * tier) bitmap and hands every TRACKED folio back to the legacy LRU.
+ */
+
+/**
+ * marie_reparent_locked - merge @child_mlv's per-memcg tracking
+ *                                   into @parent_mlv's via L2-pruned
+ *                                   bitmap OR + atomic counter transfer.
+ * @child_mlv:  source residency set (a memcg being reparented)
+ * @parent_mlv: destination residency set (parent memcg's lruvec mlv); may
+ *             be NULL, in which case @child's per-memcg bitmap and
+ *             counters are zeroed and folios fall back to global
+ *             tracking (no per-memcg filter).
+ *
+ * Per-folio iteration free: the per-PFN state byte (gen / tier / type /
+ * zone) and the global (type, gen, tier) bitmaps are memcg-agnostic
+ * and stay in place. Only the per-memcg L1/L2 bitmap merges into
+ * @parent and the per-type / per-(lru, zone) counters move via
+ * atomic_long_xchg. Cost scales with the number of populated L2 ranges
+ * in @child's per-memcg bitmap, not with the tracked folio count.
+ *
+ * Returns 0: marie_nr_folios is unchanged because the per-PFN state
+ * bytes remain set. Caller need not adjust the global folio counter.
+ *
+ * Caller MUST hold @child_mlv's both type_locks (marie_both_mlv
+ * scoped_guard) plus both lruvecs' lru_lock with IRQs off (the memcg
+ * offline path's reparent_locks).
+ */
+long marie_reparent_locked(struct marie_lruvec *child_mlv,
+				   struct marie_lruvec *parent_mlv);
+
+/*
+ * scoped_guard(marie_type_lock, &mlv->types[type]) — per-type lock acquisition.
+ *
+ * Equivalent to the handwritten dance:
+ *
+ *   spin_lock_irqsave(&t->type_lock, flags);
+ *   marie_drain_enter_type(t->type);
+ *   ... critical section touching mlv->types[t->type] ...
+ *   marie_drain_exit_type(t->type);
+ *   spin_unlock_irqrestore(&t->type_lock, flags);
+ *
+ * The cleanup attribute on the guard variable makes the unlock +
+ * depth-counter pair a structural property of the scope, not a
+ * discipline the caller must remember on every early return / goto.
+ *
+ * Re-entry inside the scope is handled by the per-CPU per-type
+ * marie_drain_depth contract — drain helpers' folio_put recursion that
+ * lands in lru_marie_del_folio observes marie_in_drain_type(folio's
+ * type) > 0 and skips the spin_lock_irqsave for that type only. Recursion
+ * involving the *other* type lands on a depth-0 counter and proceeds to
+ * take the corresponding per-type lock as usual (the outer guard holds
+ * only one type's lock, so this is not a self-deadlock).
+ */
+DEFINE_LOCK_GUARD_1(marie_type_lock, struct marie_type,
+	/* lock: IRQs off + type_lock, then enter this type's drain scope */ ({
+		spin_lock_irqsave(&_T->lock->type_lock, _T->flags);
+		marie_drain_enter_type(_T->lock->type);
+	}),
+	/* unlock: exact reverse -- leave the drain scope, then unlock */ ({
+		marie_drain_exit_type(_T->lock->type);
+		spin_unlock_irqrestore(&_T->lock->type_lock, _T->flags);
+	}),
+	unsigned long flags	/* saved IRQ state, accessed as _T->flags */
+)
+
+/*
+ * scoped_guard(marie_both_mlv, mlv) — take both types' locks.
+ *
+ * Used by cross-type call sites: drain_pending, drain_one_lruvec,
+ * fill_one_lruvec, walker pass-end visit-counter bump, residency
+ * reparent. Putback_batch and isolate_folios use marie_type_lock
+ * (single-type variant) since their input list is type-pure.
+ *
+ * Canonical order: anon (type 0) first, file (type 1) nested with
+ * SINGLE_DEPTH_NESTING to tell lockdep the two locks are different
+ * instances despite sharing a lock class. Every cross-type site uses
+ * this same guard, so the lock order is uniform and AB-BA is
+ * structurally impossible.
+ *
+ * Both per-type drain-depth counters are incremented so that any
+ * recursive lru_marie_del_folio (regardless of folio type) sees the
+ * depth > 0 fast path for its own type and avoids re-locking.
+ */
+DEFINE_LOCK_GUARD_1(marie_both_mlv, struct marie_lruvec,
+	/* lock: anon (types[0]) first, then file (types[1]) */ ({
+		spin_lock_irqsave(&_T->lock->types[0].type_lock, _T->flags);
+		spin_lock_nested(&_T->lock->types[1].type_lock,
+				 SINGLE_DEPTH_NESTING);	/* IRQs already off */
+		marie_drain_enter_type(0);
+		marie_drain_enter_type(1);
+	}),
+	/* unlock: mirror image of the lock sequence above */ ({
+		marie_drain_exit_type(1);
+		marie_drain_exit_type(0);
+		spin_unlock(&_T->lock->types[1].type_lock);
+		spin_unlock_irqrestore(&_T->lock->types[0].type_lock, _T->flags);
+	}),
+	unsigned long flags	/* IRQ state saved by the types[0] irqsave */
+)
+
+/**
+ * marie_counters_init - one-shot init for per-mlv internals.
+ *
+ * Called from marie_init() (subsys_initcall in mm/lru_marie/core.c).
+ * Currently just initialises the global marie_nr_folios percpu_counter
+ * (the per-CPU bucket pool, slab caches, and cpuhp callbacks that
+ * earlier revisions needed have all been retired together with the
+ * staging machinery).
+ *
+ * Returns 0 on success, negative errno on failure (in which case the
+ * caller propagates the error up to the initcall machinery).
+ */
+int marie_counters_init(void);
+
+/*
+ * ---------------------------------------------------------------------
+ *  Cross-file glue (lifecycle xarray + walker entry points)
+ * ---------------------------------------------------------------------
+ *
+ * These declarations connect mm/lru_marie/core.c (dispatch / lifecycle) and
+ * mm/lru_marie/walker.c (PTE walker). They live here to keep mm/
+ * private headers down to a single file.
+ */
+
+/*
+ * Per-lruvec mlv lives in lv->marie_mlv (the single source of truth;
+ * lazy-allocated on first hit via cmpxchg, freed at memcg teardown via
+ * marie_drop_lruvec()). No side xarray, no RCU.
+ */
+
+/*
+ * marie_walk_pgdat - run one walker pass for @pgdat.
+ *
+ * Called from lru_marie_age_node() (kswapd hook) and
+ * lru_marie_shrink_lruvec() (direct-reclaim hook). Internally
+ * rate-limited per pgdat via a jiffies deadline so calling on every
+ * reclaim/kswapd cycle is fine.
+ */
+void marie_walk_pgdat(struct pglist_data *pgdat);
+
+/*
+ * marie_walker_init - one-shot init for the walker subsystem.
+ *
+ * Initialises per-pgdat bloom-filter spinlocks. Bitmaps themselves
+ * are lazily allocated on first Producer hit. Called from
+ * marie_init().
+ */
+void marie_walker_init(void);
+
+
+#endif /* CONFIG_LRU_MARIE */
+#endif /* _MM_LRU_MARIE_STATE_H */
diff --git a/mm/lru_marie/version.h b/mm/lru_marie/version.h
new file mode 100644
index 0000000000..d52a5ba340
--- /dev/null
+++ b/mm/lru_marie/version.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_LRU_MARIE_VERSION_H
+#define _MM_LRU_MARIE_VERSION_H
+
+/*
+ * Marie LRU — version identifiers.
+ *
+ * Kept in mm/ rather than include/linux/lru_marie.h so that bumping
+ * MARIE_VERSION (the only string that changes from one release to the
+ * next) does not invalidate the ccache entry for every translation
+ * unit that includes <linux/lru_marie.h> (mm/mm.h, <linux/mm_inline.h>,
+ * mm/vmscan.c, mm/swap.c, mm/rmap.c, mm/memcontrol.c, etc.). Only
+ * Marie's own .c files include this header, so a version bump rebuilds
+ * just the objects under mm/lru_marie/.
+ */
+
+#define MARIE_PROGNAME	"Marie LRU"
+#define MARIE_AUTHOR	"Masahito Suzuki"
+
+#define MARIE_VERSION	"0.3.5"
+
+#endif /* _MM_LRU_MARIE_VERSION_H */
diff --git a/mm/lru_marie/walker.c b/mm/lru_marie/walker.c
new file mode 100644
index 0000000000..846d3787d4
--- /dev/null
+++ b/mm/lru_marie/walker.c
@@ -0,0 +1,961 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/lru_marie/walker.c — Marie's SIMD-accelerated PTE walker.
+ *
+ * The walker is Marie's hot signal harvester. Per pgdat,
+ * rate-limited via marie_walker_interval() (HZ/30..HZ depending on
+ * pressure), marie_walk_pgdat() snapshots the running mm_struct's,
+ * walks each page table to PMD granularity, and at every PMD invokes
+ * lru_marie_simd_young_pte_mask_raw() to extract the young-bit bitmap
+ * of the entire 512-PTE page in one SIMD pass (AVX-512F / AVX2 / SSE2
+ * on x86; scalar fallback on arm64 and other arches via the generic
+ * variant -- NEON acceleration is a future optimisation, see
+ * mm/lru_marie/simd_generic.c).
+ * The FPU bracket around the SIMD call is held across MARIE_FPU_BATCH
+ * consecutive bloom-hit PMDs and flushed by a scoped_guard around the
+ * per-mm walk_page_range, amortising kernel_fpu_begin/end overhead.
+ * For each PTE flagged young, the walker bumps the corresponding
+ * folio's tier via marie_tier_inc() (thin wrapper around
+ * marie_state_inc_tier on the per-PFN byte). Folios that saturate to
+ * MARIE_TIER_MAX trigger an in-place synchronous promote
+ * (marie_state_move_to_gen to head_gen at tier 0) inside the same
+ * helper -- no per-mlv promote queue, no pass-end drain.
+ *
+ * A per-pgdat bloom filter (marie_bloom_*) feeds back from
+ * lru_marie_look_around() (rmap-side, called from
+ * folio_referenced_one()) to the walker: rmap flags PMDs whose target
+ * folio was young, the walker reads that bitmap and skips PMDs the
+ * rmap path has not flagged. The bloom is double-buffered (active /
+ * inactive) and rotated at pass end so the walker reads the feedback
+ * accumulated during the previous reclaim window.
+ *
+ * Bloom is the *only* coupling between rmap and the walker.
+ * lru_marie_look_around() does NOT promote (no tier++, no
+ * PG_referenced) on the surrounding folios; the walker handles tier++
+ * via young-bit detection on bloom-hit PMDs. This split keeps the
+ * rmap path PTL-bounded and lock-free, while the walker pays the
+ * SIMD scan + tier++ cost only for hot PMDs.
+ */
+
+#define pr_fmt(fmt) "lru_marie: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/cleanup.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/lru_marie.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmzone.h>
+#include <linux/pagewalk.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/rmap.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/xarray.h>
+
+#include "../internal.h"	/* folio_pte_batch_flags / FPB_MERGE_YOUNG_DIRTY */
+#include "state.h"
+#include "simd.h"
+
+/*
+ * ---------------------------------------------------------------------
+ *  look-around (rmap-side opportunistic PMD scan)
+ * ---------------------------------------------------------------------
+ *
+ * lru_marie_look_around() is called from rmap.c::folio_referenced_one() while
+ * the rmap caller already holds the page table lock for the target
+ * folio's PTE.  We piggyback on that PTL to scan up to
+ * MARIE_LOOK_AROUND_BATCH PTEs of the surrounding PMD and clear young bits
+ * found there in batch — what would otherwise cost one rmap walk per
+ * neighbouring folio amortises into a single PMD pass, and subsequent
+ * folio_referenced() calls on those folios get a more accurate "young
+ * since last reclaim cycle" answer.
+ *
+ * Crucially, we do NOT call folio_set_referenced() on the surrounding
+ * folios.  Doing so would cascade into a reclaim-side promote and
+ * starve reclaim under fault-heavy workloads (memhog, browser tab
+ * churn) where every recently-faulted PTE has its young bit set —
+ * see the comment above the test_and_clear loop below.
+ *
+ * Returns true iff the target folio's own PTE(s) were young.  That's the
+ * value folio_referenced_one() folds into its referenced count, exactly
+ * mirroring what test_and_clear_young_ptes_notify() would have returned
+ * from the bare clear_flush_young_ptes_notify branch.
+ *
+ * Lock contract: caller holds the PTL and (via rmap_walk) one of the
+ * anon_vma / i_mmap rwsems. We DO NOT take any Marie lock here.
+ * Promotion of folios with an external hotness signal happens out-of-
+ * band: folio_mark_accessed -> lru_marie_mark_accessed bumps the
+ * per-PFN tier (marie_state_inc_tier), and tier saturation triggers a
+ * synchronous marie_state_move_to_gen(pfn, head, 0) on the same path
+ * -- both operations are lock-free byte writes. No new lock-ordering
+ * relationship between rmap and Marie state is introduced.
+ */
+#define MARIE_LOOK_AROUND_BATCH BITS_PER_LONG	/* PTEs scanned per call */
+
+/*
+ * As of 7.1-rc1 both test_and_clear_young_ptes_notify() (mm/internal.h)
+ * and lazy_mmu_mode_enable()/disable() (include/linux/pgtable.h) are
+ * upstream, so the per-PTE emulation shim and the arch_*_lazy_mmu_mode
+ * aliases that the 6.12/6.18/7.0 backports carried are no longer needed
+ * here -- we call the upstream APIs directly.
+ */
+
+/* Forward decl: bloom Producer used by look_around. Definition lives in
+ * the walker helpers section alongside the walker-side Consumer. */
+static void marie_bloom_set(int nid, unsigned long pmd_addr);
+
+bool lru_marie_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
+{
+	pte_t *pte = pvmw->pte;
+	unsigned long addr = pvmw->address;
+	unsigned long start, end;
+	struct vm_area_struct *vma = pvmw->vma;
+	struct folio *target = pfn_folio(pvmw->pfn);
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat = folio_pgdat(target);
+
+	lockdep_assert_held(pvmw->ptl);
+
+	/* Always clear the target folio's own young bit and propagate the
+	 * result to the caller, regardless of whether we go on to scan the
+	 * surrounding PMD. */
+	if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr))
+		return false;
+
+	/*
+	 * Producer: feed the per-pgdat bloom. The target folio was young,
+	 * so this PMD has at least one hot PTE worth visiting on the next
+	 * walker pass. This is the *only* rmap-side signal Marie gives the
+	 * walker -- look_around does NOT promote (no tier++, no
+	 * PG_referenced) on the surrounding folios; the walker handles
+	 * tier++ via young-bit detection on bloom-hit PMDs.
+	 */
+	marie_bloom_set(pgdat->node_id, addr & PMD_MASK);
+
+	/* If the PTL is contended skip the surrounding scan — somebody else
+	 * is waiting and we shouldn't extend our hold time. */
+	if (spin_is_contended(pvmw->ptl))
+		return true;
+
+	/* PFN-mapped VMAs don't carry struct folio backings on every PTE;
+	 * skip them rather than feed garbage to pfn_folio(). */
+	if (vma->vm_flags & VM_SPECIAL)
+		return true;
+
+	/* Compute a PMD-bounded surrounding range centred on @addr.  We
+	 * scan at most MARIE_LOOK_AROUND_BATCH PTEs and never cross either
+	 * the PMD or the VMA boundary. */
+	start = max(addr & PMD_MASK, vma->vm_start);
+	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+
+	if (end - start == PAGE_SIZE)
+		return true;
+
+	if (end - start > MARIE_LOOK_AROUND_BATCH * PAGE_SIZE) {
+		if (addr - start < MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2)
+			end = start + MARIE_LOOK_AROUND_BATCH * PAGE_SIZE;
+		else if (end - addr < MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2)
+			start = end - MARIE_LOOK_AROUND_BATCH * PAGE_SIZE;
+		else {
+			start = addr - MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2;
+			end = addr + MARIE_LOOK_AROUND_BATCH * PAGE_SIZE / 2;
+		}
+	}
+
+	memcg = get_mem_cgroup_from_folio(target);
+
+	lazy_mmu_mode_enable();
+
+	pte -= (addr - start) / PAGE_SIZE;
+
+	for (addr = start; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+		unsigned long pfn;
+		pte_t ptent = ptep_get(pte);
+		struct folio *folio;
+
+		/* @nr doubles as the loop stride: one PTE per step unless
+		 * the large-folio batch below widens it. */
+		nr = 1;
+
+		/* Inline minimal get_pte_pfn — vmscan.c's version is
+		 * MGLRU-static and we only need a subset of its checks. */
+		if (!pte_present(ptent))
+			continue;
+		if (pte_special(ptent))
+			continue;
+		pfn = pte_pfn(ptent);
+		if (is_zero_pfn(pfn))
+			continue;
+		if (!pfn_valid(pfn))
+			continue;
+		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+			continue;
+
+		folio = pfn_folio(pfn);
+		if (folio_nid(folio) != pgdat->node_id)
+			continue;
+
+		rcu_read_lock();
+		if (folio_memcg(folio) != memcg)
+			folio = NULL;
+		rcu_read_unlock();
+		if (!folio)
+			continue;
+
+		if (folio_test_large(folio)) {
+			const unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+			/*
+			 * folio_pte_batch_flags() (mm/internal.h) takes a
+			 * pointer to a COPY of the first pte (ptentp) and the
+			 * FPB_* flag set directly. FPB_MERGE_YOUNG_DIRTY
+			 * merges across young/dirty differences, matching the
+			 * young-bit-agnostic batching this look-around wants.
+			 */
+			nr = folio_pte_batch_flags(folio, NULL, pte, &ptent,
+						   max_nr, FPB_MERGE_YOUNG_DIRTY);
+		}
+
+		/* The target folio's young bit was already cleared above and
+		 * its referenced status will be re-derived by the caller from
+		 * our return value — don't double-clear it here. */
+		if (folio == target)
+			continue;
+
+		/*
+		 * Clear young bits across the surrounding PMD in batch. We
+		 * deliberately do NOT touch any tier / PG_referenced state on
+		 * the neighbours here: under a fault-heavy allocator (memhog,
+		 * browser tab churn) every recently-faulted PTE has its young
+		 * bit set, and amplifying that into a hot signal on
+		 * ~MARIE_LOOK_AROUND_BATCH neighbours per rmap call cascades
+		 * through promote-in-place and starves the reclaim path of
+		 * evictable folios. The only signal look_around emits for the
+		 * neighbours is the per-pgdat bloom (set above) — that tells
+		 * the next walker pass "this PMD had at least one hot PTE",
+		 * and the walker itself does per-PTE tier++ from young-bit
+		 * detection, preserving per-folio cardinality in the "hot"
+		 * signal that drives MARIE_TIER promotions.
+		 */
+		test_and_clear_young_ptes_notify(vma, addr, pte, nr);
+	}
+
+	lazy_mmu_mode_disable();
+	mem_cgroup_put(memcg);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(lru_marie_look_around);
+
+/*
+ * ---------------------------------------------------------------------
+ *  Walker helpers: adaptive walker rate, per-pgdat state
+ * ---------------------------------------------------------------------
+ */
+
+/*
+ * Adaptive walker rate. High pressure -> short interval
+ * (frequent walks -> fresh tier signal); idle -> long interval (don't
+ * burn CPU). Returns jiffies until the next walker pass for this pgdat.
+ *
+ * Node-wide free pages are compared against the min/low/high watermarks
+ * summed over every populated zone of @pgdat, so both sides of the
+ * comparison cover the same set of pages whatever the zone layout is.
+ *
+ * All four stage intervals are runtime-tunable via
+ * /sys/kernel/mm/lru_marie/walker_interval_{critical,low,normal,idle}_ms;
+ * defaults preserve the original HZ/30, HZ/10, HZ/4, HZ cadence.
+ */
+static unsigned long marie_walker_interval(struct pglist_data *pgdat)
+{
+	unsigned long free, high = 0, low = 0, min = 0;
+	bool populated = false;
+	int zid;
+
+	/* NR_FREE_PAGES below is node-wide; summing per-zone watermarks
+	 * keeps the comparison like-for-like (a lone ZONE_DMA's watermarks
+	 * would otherwise make pressure undetectable). */
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *z = &pgdat->node_zones[zid];
+
+		if (!populated_zone(z))
+			continue;
+		populated = true;
+		high += high_wmark_pages(z);
+		low  += low_wmark_pages(z);
+		min  += min_wmark_pages(z);
+	}
+	/* No populated zone on this node: fall back to the idle cadence. */
+	if (!populated)
+		return READ_ONCE(marie_walker_interval_idle);
+
+	free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+
+	if (free < min)
+		return READ_ONCE(marie_walker_interval_critical);
+	if (free < low)
+		return READ_ONCE(marie_walker_interval_low);
+	if (free < high)
+		return READ_ONCE(marie_walker_interval_normal);
+	return READ_ONCE(marie_walker_interval_idle);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ *  Bloom filter -- rmap → walker forward feedback
+ * ---------------------------------------------------------------------
+ *
+ * Per-pgdat probabilistic set of "PMDs in which the rmap path saw a young
+ * PTE since the last walker pass." Keyed by PMD index (>>PMD_SHIFT),
+ * m=1<<15 bits (4 KiB per filter, 8 KiB per pgdat), k=2.
+ *
+ * Producer: lru_marie_look_around() (rmap-side, runs under PTL during eviction
+ *   folio_referenced walks). Sets bits in @inactive.
+ *
+ * Consumer: marie_walk_pmd_range() (walker hot path, before the PTL is
+ *   taken). Tests bits in @active. Bloom miss -> skip the PMD entirely.
+ *
+ * Pass-end (marie_walk_pgdat): swap active/inactive under @lock and clear
+ * the new inactive. The walker therefore reads the rmap feedback that
+ * accumulated during the previous reclaim window.
+ *
+ * @warmed_up is set sticky-true on the first Producer write per pgdat and
+ * is readable via marie_bloom_warmed(). It was meant to gate a force_scan
+ * bypass of the bloom, but marie_walk_pgdat() currently hard-codes
+ * force_scan = false, so an unprimed bloom makes the walker skip all PMDs.
+ *
+ * Lazy alloc with GFP_ATOMIC -- look_around runs under PTL, so the
+ * allocation must not sleep. Allocation failure leaves @inactive
+ * NULL; the next look_around call retries. While @active is NULL,
+ * marie_bloom_test() reports a miss for every PMD.
+ */
+#define MARIE_BLOOM_SHIFT		15
+#define MARIE_BLOOM_SIZE		(1U << MARIE_BLOOM_SHIFT)	/* 32K bits */
+
+struct marie_bloom {
+	spinlock_t	lock;		/* serialises swap + alloc */
+	unsigned long	*active;	/* read by walker */
+	unsigned long	*inactive;	/* written by look_around */
+	bool		warmed_up;	/* sticky: true after first Producer set */
+};
+
+static struct marie_bloom marie_blooms[MAX_NUMNODES];
+
+static inline void marie_bloom_keys(unsigned long pmd_addr, int *key)
+{
+	u32 h = hash_long(pmd_addr >> PMD_SHIFT, 2 * MARIE_BLOOM_SHIFT);
+
+	key[0] = h % MARIE_BLOOM_SIZE;			/* low half of the hash */
+	key[1] = (h >> MARIE_BLOOM_SHIFT) % MARIE_BLOOM_SIZE;	/* high half */
+}
+
+static unsigned long *marie_bloom_alloc_atomic(void)
+{
+	/* Failure is tolerated (the next Producer call retries), so keep a
+	 * GFP_ATOMIC miss under reclaim from spamming the log. */
+	return bitmap_zalloc(MARIE_BLOOM_SIZE, GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/*
+ * Producer: feed @pmd_addr into pgdat @nid's inactive bloom. Idempotent.
+ * Bitops are lock-free; only the lazy-alloc slow path takes b->lock.
+ * An out-of-range @nid or a failed allocation is silently ignored: the
+ * PMD is simply not flagged for the next walker pass.
+ */
+static void marie_bloom_set(int nid, unsigned long pmd_addr)
+{
+	struct marie_bloom *b;
+	unsigned long *filter;
+	unsigned long flags;
+	int key[2];
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;
+	b = &marie_blooms[nid];
+
+	filter = READ_ONCE(b->inactive);
+	if (!filter) {
+		/*
+		 * Slow path: lazy allocate both bitmaps. The pointers are read
+		 * locklessly (READ_ONCE) by the fast path and by the walker, so
+		 * publish with release: whoever sees one sees it zeroed.
+		 */
+		spin_lock_irqsave(&b->lock, flags);
+		if (!b->inactive)
+			smp_store_release(&b->inactive, marie_bloom_alloc_atomic());
+		if (!b->active)
+			smp_store_release(&b->active, marie_bloom_alloc_atomic());
+		filter = b->inactive;
+		spin_unlock_irqrestore(&b->lock, flags);
+		if (!filter)
+			return;	/* allocation failed: retried on the next call */
+	}
+
+	marie_bloom_keys(pmd_addr, key);
+	/* test_bit first: skip the atomic RMW when the bit is already set. */
+	if (!test_bit(key[0], filter))
+		set_bit(key[0], filter);
+	if (!test_bit(key[1], filter))
+		set_bit(key[1], filter);
+	if (!READ_ONCE(b->warmed_up))
+		WRITE_ONCE(b->warmed_up, true);
+}
+
+/*
+ * Consumer (walker hot path): probabilistic membership test of @pmd_addr
+ * against pgdat @nid's active bloom. An out-of-range @nid or a not-yet-
+ * allocated active filter counts as a miss.
+ */
+static bool marie_bloom_test(int nid, unsigned long pmd_addr)
+{
+	unsigned long *bits = NULL;
+	int k[2];
+
+	if (nid >= 0 && nid < MAX_NUMNODES)
+		bits = READ_ONCE(marie_blooms[nid].active);
+	if (!bits)
+		return false;
+
+	marie_bloom_keys(pmd_addr, k);
+	if (!test_bit(k[0], bits))
+		return false;
+	return test_bit(k[1], bits);
+}
+
+/*
+ * Pass-end rotation (marie_walk_pgdat, no other lock held): the Producer's
+ * filter becomes active; the old active is wiped and becomes inactive.
+ */
+static void marie_bloom_swap(int nid)
+{
+	struct marie_bloom *bloom;
+	unsigned long *recycled;
+	unsigned long irqflags;
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;
+	bloom = &marie_blooms[nid];
+
+	spin_lock_irqsave(&bloom->lock, irqflags);
+	recycled = bloom->active;
+	bloom->active = bloom->inactive;
+	bloom->inactive = recycled;
+	if (recycled)
+		bitmap_zero(recycled, MARIE_BLOOM_SIZE);
+	spin_unlock_irqrestore(&bloom->lock, irqflags);
+}
+
+static inline bool marie_bloom_warmed(int nid)
+{
+	bool valid = nid >= 0 && nid < MAX_NUMNODES;	/* else: not warmed */
+
+	return valid && READ_ONCE(marie_blooms[nid].warmed_up);
+}
+
+/*
+ * Per-CPU walk context: a preallocated mm snapshot buffer so the walker
+ * doesn't kmalloc inside its hot entry path (the walker can be entered
+ * from direct reclaim, where allocator recursion is disallowed).
+ *
+ * Ownership is established by marie_walker_busy below: the pass owner
+ * pins to its CPU via migrate_disable() and claims the per-CPU ctx
+ * with this_cpu_cmpxchg(marie_walker_busy, 0, 1). Another reclaimer
+ * that reaches marie_walk_pgdat on the same CPU while the owner is
+ * preempted finds the flag set and bails, preventing concurrent reuse
+ * of the snapshot buffer. The walker pass itself stays preemptible so
+ * cond_resched() inside marie_walk_pmd_range remains effective.
+ *
+ * marie_walker_next[] lives in the walker section below alongside the
+ * rest of the walker state.
+ */
+#define MARIE_WALK_MAX_MMS	256
+
+struct marie_walk_ctx {
+	struct mm_struct	*mms[MARIE_WALK_MAX_MMS];	/* pinned snapshot */
+	int			n_mms;	/* number of valid entries in mms[] */
+};
+
+static DEFINE_PER_CPU(struct marie_walk_ctx, marie_walker_ctx);
+static DEFINE_PER_CPU(unsigned int, marie_walker_busy);	/* 1 = ctx claimed */
+
+/*
+ * ---------------------------------------------------------------------
+ *  Walker -- SIMD + adaptive + per-pgdat
+ * ---------------------------------------------------------------------
+ *
+ * Per pgdat, rate-limited via marie_walker_interval (HZ/30 .. HZ).
+ * Each PMD scans young bits in batch via lru_marie_simd_young_pte_mask_raw
+ * (AVX-512F / AVX2 / SSE2 on x86; scalar fallback on arm64 and other
+ * arches), with an enclosing FPU bracket batched across MARIE_FPU_BATCH
+ * scans. Cross-node folios are filtered out so each pgdat owns its
+ * work cleanly.
+ *
+ * Walker tier promotion is synchronous: marie_tier_inc
+ * (a thin wrapper around marie_state_inc_tier) handles both the
+ * non-saturated bump and the saturate -> in-place promote inside
+ * the same call, so there is no per-CPU promote queue and no
+ * pass-end promote drain.
+ *
+ * Lock contract:
+ *   per-PMD:         holds the existing pte_offset_map_lock ptl
+ *   per-PTE body:    lock-free -- marie_state_inc_tier mutates only the
+ *                    per-PFN state byte
+ *   walker_visits:   lock-free atomic_inc on the global
+ *                    marie_gen_walker_visits[gen][type] counter (read as a
+ *                    >= 1 boolean; reset in marie_try_advance_head)
+ *   bloom rotation:  per-pgdat marie_blooms[nid].lock (irqsave), taken only
+ *                    for lazy alloc / pass-end swap
+ *   per-pgdat deadline: cmpxchg on marie_walker_next[nid]
+ *
+ * Lock ordering: the pte ptl may nest the per-pgdat bloom lock (a leaf;
+ * rmap-side lazy alloc), while the pass-end swap takes it with no ptl
+ * held. The walker takes NO lru_lock and NO per-type lock anywhere, so it
+ * cannot invert Marie's lru_lock -> type_lock hierarchy.
+ */
+
+/*
+ * Per-pgdat walker deadline (jiffies). One pass per pgdat per
+ * marie_walker_interval(pgdat) is allowed; concurrent reclaimers /
+ * kswapd cycles atomic-cmpxchg to claim the slot.
+ *
+ * MARIE_WALK_MAX_MMS bounds the per-pass task snapshot (see the per-CPU
+ * marie_walker_ctx definition earlier in this file).
+ */
+static atomic_long_t marie_walker_next[MAX_NUMNODES];
+
+/*
+ * MARIE_FPU_BATCH — number of consecutive bloom-hit PMDs scanned under
+ * a single FPU bracket before flushing.
+ *
+ * Trade-off: larger batch amortises kernel_fpu_begin/end (~100 ns each
+ * on x86 xsave/xrstor) across more scans, but extends the preempt-
+ * disabled window proportionally because the bitmap iteration runs
+ * inside the same PTL window with FPU still held. With per-PMD total
+ * cost ~1-25 µs (dominated by marie_state_inc_tier on set bits), a
+ * batch of 16 gives a worst-case preempt window of ~400 µs -- well
+ * within tolerance and acceptable for desktop-grade preemption.
+ *
+ * Bloom misses do NOT advance the counter (the FPU bracket has not been
+ * opened on those PMDs), so the batch is purely "PMDs we actually
+ * scanned in a row".
+ */
+#define MARIE_FPU_BATCH		16
+
+/*
+ * Per-walk FPU batch state.  Lives inside marie_walk_arg so it is reset
+ * automatically each marie_walk_pgdat() invocation; the scoped_guard
+ * around walk_page_range() in marie_walk_one_mm() flushes any partial
+ * batch when the per-mm walk exits, guaranteeing FPU is released
+ * before mmap_read_unlock and before the next mm starts.
+ */
+struct marie_fpu_batch {
+	unsigned int	count;		/* scans in current bracket */
+	bool		held;		/* FPU bracket currently open */
+};
+
+static inline void marie_fpu_batch_open(struct marie_fpu_batch *b)
+{
+	if (b->held)
+		return;	/* bracket already open: stay in the current batch */
+	lru_marie_simd_batch_begin();
+	b->held = true;
+}
+
+static inline void marie_fpu_batch_step(struct marie_fpu_batch *b)
+{
+	/* Account one scan; keep the bracket open until the batch is full. */
+	if (++b->count < MARIE_FPU_BATCH)
+		return;
+	lru_marie_simd_batch_end();
+	*b = (struct marie_fpu_batch){ .count = 0, .held = false };
+}
+
+static inline void marie_fpu_batch_flush(struct marie_fpu_batch *b)
+{
+	/* Nothing to release unless a bracket is still open. */
+	if (!b->held)
+		return;
+	lru_marie_simd_batch_end();
+	*b = (struct marie_fpu_batch){ .count = 0, .held = false };
+}
+
+/*
+ * scoped_guard(marie_fpu_batch, &arg->fpu) — entry is a no-op (the
+ * walker opens the bracket lazily on the first bloom-hit PMD); exit
+ * flushes any in-flight bracket.  Wrapped around walk_page_range() in
+ * marie_walk_one_mm() so an early return / fault from the underlying
+ * mm walker still releases FPU before mmap_read_unlock.
+ */
+DEFINE_GUARD(marie_fpu_batch, struct marie_fpu_batch *,
+	     (void)_T,
+	     marie_fpu_batch_flush(_T))
+
+struct marie_walk_arg {
+	struct pglist_data	*pgdat;
+	struct marie_lruvec	*mlv_anon_cache;	/* one-deep lruvec cache */
+	struct marie_lruvec	*mlv_file_cache;
+	bool			force_scan;	/* bypass bloom gate */
+	struct marie_fpu_batch	fpu;		/* per-walk FPU bracket state */
+};
+
+/*
+ * pmd_entry callback: for a bloom-flagged PMD (or any PMD under force_scan),
+ * SIMD-scan its PTE table for young bits under the PTL and, for every young
+ * PTE mapping a Marie-tracked folio of arg->pgdat, clear the young bit and
+ * bump the folio's tier. Always returns 0 so the page walk continues.
+ */
+static int marie_walk_pmd_range(pmd_t *pmd, unsigned long start,
+			      unsigned long end, struct mm_walk *walk)
+{
+	struct marie_walk_arg *arg = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte_base;
+	spinlock_t *ptl;
+	unsigned long pmd_addr;
+	unsigned long bitmap[MARIE_SIMD_PTE_BITMAP_LONGS] = { 0 };
+	int bit;
+
+	if (!vma)
+		return 0;
+
+	pmd_addr = start & PMD_MASK;
+
+	/*
+	 * Bloom gate (Consumer side of rmap → walker forward feedback).
+	 * Skip PMDs the rmap path has not flagged as recently-young; the
+	 * SIMD scan + tier++ work is paid only for hot PMDs. force_scan
+	 * bypasses the gate during cold-boot / freshly-online pgdats where
+	 * the bloom has yet to be primed by look_around. Returning before
+	 * pte_offset_map_lock() avoids the PTL cost on misses too.
+	 */
+	if (!arg->force_scan &&
+	    !marie_bloom_test(arg->pgdat->node_id, pmd_addr))
+		return 0;
+
+	/*
+	 * pte_offset_map_lock returns pte_base + pte_index(start), which may
+	 * not be the start of the page table. The SIMD kernel needs the
+	 * page-table base (index 0) so the 512-entry scan stays inside the
+	 * table and bit N of the bitmap maps to pte_base[N]. orig_pte is
+	 * kept only for pte_unmap_unlock.
+	 */
+	orig_pte = pte_offset_map_lock(walk->mm, pmd, start, &ptl);
+	if (!orig_pte)
+		return 0;
+	pte_base = orig_pte - pte_index(start);
+
+	/*
+	 * Open the FPU bracket lazily on the first bloom-hit PMD of a batch
+	 * and run the raw scan inside it. marie_fpu_batch_step() closes it
+	 * after MARIE_FPU_BATCH scans; the scoped_guard in
+	 * marie_walk_one_mm() flushes any partial batch on walker exit.
+	 */
+	marie_fpu_batch_open(&arg->fpu);
+	lru_marie_simd_young_pte_mask_raw(pte_base, bitmap);
+	marie_fpu_batch_step(&arg->fpu);
+
+	for_each_set_bit(bit, bitmap, 512) {
+		unsigned long addr = pmd_addr + bit * PAGE_SIZE;
+		pte_t *pte = pte_base + bit;
+		pte_t ptent;
+		unsigned long pfn;
+		struct folio *folio;
+		struct marie_lruvec *mlv;
+		struct lruvec *lv;
+		int type;
+		int next_bit;
+
+		/* Peek ahead at the next set bit and prefetch its
+		 * struct page into L1. The body chain below does multiple
+		 * folio-struct accesses (folio_pgdat, marie_test_tracked,
+		 * folio_is_file_lru, folio_lruvec, marie_state_inc_tier).
+		 * Sparse bitmap iteration defeats the hardware prefetcher,
+		 * so an explicit lookahead hides struct page L2/L3 latency. */
+		next_bit = find_next_bit(bitmap, 512, bit + 1);
+		if (next_bit < 512) {
+			pte_t next_ptent = ptep_get(pte_base + next_bit);
+			unsigned long next_pfn = pte_pfn(next_ptent);
+
+			if (pte_present(next_ptent) && pfn_valid(next_pfn))
+				__builtin_prefetch(pfn_to_page(next_pfn), 0, 3);
+		}
+
+		/* Only process PTEs within the [start, end) walk range. */
+		if (addr < start || addr >= end)
+			continue;
+
+		ptent = ptep_get(pte);
+		if (!pte_present(ptent) || pte_special(ptent))
+			continue;
+
+		pfn = pte_pfn(ptent);
+		if (is_zero_pfn(pfn) || !pfn_valid(pfn))
+			continue;
+
+		folio = pfn_folio(pfn);
+
+		/* Skip cross-node folios -- this pass is per pgdat. */
+		if (folio_pgdat(folio) != arg->pgdat)
+			continue;
+
+		/* Lock-free pre-filter: act only on folios Marie is currently
+		 * tracking. folio->lru is no longer a Marie-state signal
+		 * (tracked folios sit on a self-loop or on legacy
+		 * lruvec->lists[lru]); only the per-PFN TRACKED bit says
+		 * "Marie has a live tier value for this folio." */
+		if (!folio_marie_test_tracked(folio))
+			continue;
+
+		if (!ptep_test_and_clear_young(vma, addr, pte))
+			continue;
+
+		type = folio_is_file_lru(folio);
+		lv = marie_folio_lruvec_rcu(folio);
+
+		/* One-deep cache: most consecutive PTEs in an mm walk hit
+		 * the same lruvec, so reuse the last mlv seen per type. */
+		if (type == 0 && arg->mlv_anon_cache &&
+		    arg->mlv_anon_cache->lruvec == lv) {
+			mlv = arg->mlv_anon_cache;
+		} else if (type == 1 && arg->mlv_file_cache &&
+			   arg->mlv_file_cache->lruvec == lv) {
+			mlv = arg->mlv_file_cache;
+		} else {
+			/*
+			 * lv->marie_mlv is authoritative (no side xarray) and
+			 * stays valid for the rest of this pass without RCU:
+			 * the folios being walked are mapped in this mmget'd mm
+			 * and therefore charged, which pins their memcg ->
+			 * lruvec -> mlv alive (mlv is freed only at
+			 * mem_cgroup_free).
+			 */
+			mlv = READ_ONCE(lv->marie_mlv);
+			if (!mlv)
+				continue;
+			if (type == 0)
+				arg->mlv_anon_cache = mlv;
+			else
+				arg->mlv_file_cache = mlv;
+		}
+
+		/*
+		 * Walker tier++: marie_tier_inc is a thin wrapper over
+		 * marie_state_inc_tier on the per-PFN byte; non-saturated
+		 * bumps are a lock-free WRITE_ONCE and saturation triggers a
+		 * synchronous in-place promote (marie_state_move_to_gen to
+		 * head_gen at tier 0). No per-tier counter: the helper moves
+		 * the per-(type, gen, tier) bitmap alongside the byte itself.
+		 */
+		(void)marie_tier_inc(mlv, folio);
+	}
+
+	pte_unmap_unlock(orig_pte, ptl);
+	/* An open FPU bracket keeps preemption disabled (see MARIE_FPU_BATCH):
+	 * only offer to reschedule once the bracket has been closed. */
+	if (!arg->fpu.held)
+		cond_resched();
+	return 0;
+}
+
+static const struct mm_walk_ops marie_walk_ops = {
+	.pmd_entry	= marie_walk_pmd_range,
+	.walk_lock	= PGWALK_RDLOCK,
+};
+
+static void marie_walk_one_mm(struct mm_struct *mm, struct marie_walk_arg *arg)
+{
+	if (mmap_read_trylock(mm)) {	/* trylock failed: skip this mm */
+		scoped_guard(marie_fpu_batch, &arg->fpu)
+			walk_page_range(mm, 0, TASK_SIZE, &marie_walk_ops, arg);
+		mmap_read_unlock(mm);
+	}
+}
+
+/**
+ * marie_walk_pgdat - run one walker pass for @pgdat.
+ * @pgdat: node to run the pass for.
+ *
+ * Atomically claims the per-pgdat deadline; callers that arrive early or
+ * lose the claim race return without walking. The winner snapshots up to
+ * MARIE_WALK_MAX_MMS mm_struct's via for_each_process under RCU, walks
+ * each via the SIMD pmd_entry handler, then marks every (gen, type) as
+ * walker-visited and rotates the pgdat's bloom filters.
+ *
+ * Safe from any context that allows brief sleeping (cond_resched in walk).
+ */
+void marie_walk_pgdat(struct pglist_data *pgdat)
+{
+	int nid = pgdat->node_id;
+	unsigned long deadline;
+	struct marie_walk_ctx *ctx;
+	struct task_struct *p;
+	struct marie_walk_arg arg = {
+		.pgdat = pgdat,
+		/*
+		 * force_scan disabled: the cold-bloom force_scan was the
+		 * dominant kswapd startup latency under fault-burst (full
+		 * PMD scan = ~45 ms on memhog 2.5 GB). The walker's role is
+		 * purely tier promotion; folio_check_references' Marie gate
+		 * (vmscan.c) is independent of walker state because
+		 * lru_marie_mark_accessed funnels external access into tier rather
+		 * than PG_referenced, so reclaim functions correctly even
+		 * with an unprimed bloom. Bloom is warmed lazily by
+		 * lru_marie_look_around during the first reclaim cycle's rmap walk.
+		 */
+		.force_scan = false,
+	};
+	int i;
+
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;	/* defensive */
+
+	/* Atomic test-and-claim deadline for this pgdat. */
+	deadline = (unsigned long)atomic_long_read(&marie_walker_next[nid]);
+	if (time_before(jiffies, deadline))
+		return;
+	if ((unsigned long)atomic_long_cmpxchg(&marie_walker_next[nid],
+					       (long)deadline,
+					       (long)(jiffies + marie_walker_interval(pgdat))) != deadline)
+		return;	/* lost race to another reclaimer */
+
+	/*
+	 * Pin to this CPU and reentrancy-claim its per-CPU walker ctx.
+	 * The walker iterates up to MARIE_WALK_MAX_MMS mm_struct's per pass
+	 * and walks each up to TASK_SIZE; running the entire pass with
+	 * preempt_disable() makes cond_resched() inside marie_walk_pmd_range
+	 * a no-op and starves the rest of the system to RCU stall under
+	 * sustained memory pressure (observed as desktop stutter then
+	 * freeze on real hardware). migrate_disable() keeps us on the
+	 * CPU whose marie_walker_ctx we own, while marie_walker_busy stops a
+	 * preempted-and-resumed reclaimer from reaching marie_walk_pgdat
+	 * for a different pgdat on the same CPU and clobbering the
+	 * in-flight snapshot.
+	 */
+	migrate_disable();
+	if (this_cpu_cmpxchg(marie_walker_busy, 0, 1) != 0) {
+		migrate_enable();
+		return;
+	}
+
+	ctx = this_cpu_ptr(&marie_walker_ctx);
+	ctx->n_mms = 0;
+
+	/*
+	 * Snapshot under RCU. get_task_mm() reads p->mm under task_lock(),
+	 * so the mm cannot be torn down between the load and the mmget; a
+	 * bare READ_ONCE(p->mm) + mmget_not_zero() could touch a freed mm.
+	 */
+	rcu_read_lock();
+	for_each_process(p) {
+		struct mm_struct *mm;
+
+		if (ctx->n_mms >= MARIE_WALK_MAX_MMS)
+			break;	/* buffer full: stop scanning the task list */
+		mm = get_task_mm(p);
+		if (mm)
+			ctx->mms[ctx->n_mms++] = mm;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Walk preemptibly.  FPU bracket is held across MARIE_FPU_BATCH
+	 * consecutive bloom-hit PMDs (see marie_fpu_batch_* helpers above)
+	 * and flushed by the scoped_guard around walk_page_range() inside
+	 * marie_walk_one_mm(), so the per-mm walk always exits with FPU
+	 * released.  Whole-pass FPU scope is avoided because the bitmap
+	 * iteration runs inside the bracket and would extend the
+	 * preempt-disabled window by the full iteration (~100 ms on memhog);
+	 * the batched-per-PMD scope keeps preempt windows bounded at
+	 * MARIE_FPU_BATCH x per-PMD time.
+	 */
+	for (i = 0; i < ctx->n_mms; i++) {
+		marie_walk_one_mm(ctx->mms[i], &arg);
+		/*
+		 * mmput_async, not mmput: if our get_task_mm() above pinned the
+		 * last reference (the owning task exited mid-walk), a plain mmput
+		 * here drops to zero and enters __mmput -> exit_mmap, which takes
+		 * mm->mmap_lock. marie_walk_pgdat runs from kswapd's balance_pgdat
+		 * with fs_reclaim held; taking mmap_lock under fs_reclaim closes
+		 * the cycle against the execve path that takes mmap_lock then
+		 * allocates (fs_reclaim) via mas_alloc_nodes. Caught by lockdep
+		 * as a circular dependency and reproduced as a desktop hang under
+		 * memory pressure with concurrent fork/exec. MGLRU solves the
+		 * same problem the same way in iterate_mm_list (mm/vmscan.c).
+		 */
+		mmput_async(ctx->mms[i]);
+	}
+
+	/*
+	 * Pass-end housekeeping: mark every (gen, type) as walker-visited.
+	 *
+	 * marie_gen_walker_visits is a GLOBAL [gen][type] counter read only
+	 * as a >= 1 boolean -- marie_state_shrink_lruvec gates
+	 * ignore_references on "walker has visited this gen at least once",
+	 * and the slot is reset to 0 in marie_try_advance_head on recycle.
+	 * A single bump per pass is therefore enough, and there is no
+	 * per-mlv state to touch here: walker tier saturate is materialised
+	 * inline by marie_state_inc_tier during the per-PMD walk, so no
+	 * promote-queue drain accumulates, and no mlv traversal (or RCU
+	 * section around one) is needed.
+	 */
+	{
+		int t, g;
+
+		for (t = 0; t < ANON_AND_FILE; t++)
+			for (g = 0; g < MARIE_PFN_NR_GENS; g++)
+				atomic_inc(&marie_gen_walker_visits[g][t]);
+	}
+
+	/*
+	 * Pass-end bloom rotation: the inactive filter has accumulated
+	 * Producer (look_around) feedback during this reclaim window;
+	 * promote it to active so the next pass scans those PMDs. The
+	 * old active is recycled as the new inactive, cleared of stale
+	 * bits.
+	 */
+	marie_bloom_swap(nid);
+
+	/* Release the per-CPU ctx claim before allowing migration. */
+	this_cpu_write(marie_walker_busy, 0);
+	migrate_enable();
+}
+
+/**
+ * lru_marie_age_node - kswapd's pre-reclaim aging hook.
+ * @pgdat: node to age; handed straight to marie_walk_pgdat().
+ * @sc: reclaim scan control; currently unused.
+ *
+ * MGLRU's `lru_gen_age_node()` analogue, called from kswapd_age_node() to
+ * refresh hot/cold ordering before pressure builds. Delegates to the
+ * per-pgdat walker; rate-limiting is internal, so every cycle may call.
+ */
+void lru_marie_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	marie_walk_pgdat(pgdat);
+}
+EXPORT_SYMBOL_GPL(lru_marie_age_node);
+
+/**
+ * marie_walker_init - one-shot init for the walker.
+ *
+ * Runs spin_lock_init() on the bloom lock of every marie_blooms[] slot.
+ * The filters themselves are not allocated here: marie_bloom_set()
+ * creates them on the first Producer hit. Called from marie_init().
+ */
+void marie_walker_init(void)
+{
+	unsigned int nid;
+
+	for (nid = 0; nid < ARRAY_SIZE(marie_blooms); nid++)
+		spin_lock_init(&marie_blooms[nid].lock);
+}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 433bba9dfe..bed1c07fe4 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -11,6 +11,7 @@
 #include <linux/sort.h>
 #include <linux/file.h>
 #include <linux/seq_buf.h>
+#include <linux/lru_marie.h>
 
 #include "internal.h"
 #include "swap.h"
@@ -2000,6 +2001,18 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	} else
 		WRITE_ONCE(vm_swappiness, val);
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Notify Marie so its per-lruvec swap_bias controllers reset to
+	 * neutral under the new value. lru_marie_swappiness_changed
+	 * walks every lruvec once -- the reset is unconditional, not
+	 * scoped to @memcg, because the controller's only state is the
+	 * bias counter and an extra reset on unaffected lruvecs is
+	 * harmless. See lru_marie.h.
+	 */
+	lru_marie_swappiness_changed();
+#endif
+
 	return 0;
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 749c128b4f..9f8f52c8ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -28,6 +28,7 @@
 #include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
+#include <linux/lru_marie.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/sched/mm.h>
@@ -292,6 +293,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
 			lru_reparent_memcg(memcg, parent, nid);
 		}
 
+#ifdef CONFIG_LRU_MARIE
+		/*
+		 * Marie tracks folios in its own per-PFN / per-mlv structures,
+		 * independent of MGLRU and legacy, so its reparent must run
+		 * regardless of which branch above fired (the lru_gen branch
+		 * never calls lru_reparent_memcg, where the helper is otherwise
+		 * documented to live). reparent_locks() above holds objcg_lock
+		 * plus both lruvecs' lru_lock with IRQs disabled, exactly
+		 * matching lru_marie_reparent_lruvec's contract; the call
+		 * short-circuits when lru_marie is off or the child has no mlv.
+		 *
+		 * This is mandatory on 7.1: cgroup_rstat_exit() now runs before
+		 * css_free(), so a Marie counter update on the child after this
+		 * point (e.g. from lru_marie_exit_memcg) would hit a freed
+		 * css_rstat_cpu and crash in css_rstat_updated(). Draining the
+		 * child's Marie state into the parent here prevents that.
+		 */
+		lru_marie_reparent_lruvec(mem_cgroup_lruvec(memcg, NODE_DATA(nid)),
+					  mem_cgroup_lruvec(parent, NODE_DATA(nid)));
+#endif
+
 		objcg = __memcg_reparent_objcgs(memcg, parent, nid);
 
 		reparent_unlocks(memcg, parent, nid);
@@ -3967,6 +3989,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	lru_gen_exit_memcg(memcg);
+#ifdef CONFIG_LRU_MARIE
+	lru_marie_exit_memcg(memcg);
+#endif
 	memcg_wb_domain_exit(memcg);
 	__mem_cgroup_free(memcg);
 }
@@ -4040,6 +4065,15 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	lru_gen_init_memcg(memcg);
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Per-memcg bitmap (G) alloc: no-op unless lru_marie.memcg_bitmap=1
+	 * is set on the kernel cmdline. Alloc failure is non-fatal -- the
+	 * scan path falls back to per-candidate folio_memcg() lookup
+	 * (hybrid E semantics).
+	 */
+	(void)lru_marie_memcg_alloc(memcg);
+#endif
 	return memcg;
 fail:
 	mem_cgroup_private_id_remove(memcg);
@@ -4199,6 +4233,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
 	lru_gen_offline_memcg(memcg);
+	lru_marie_offline_memcg(memcg);
 
 	drain_all_stock(memcg);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af92..5855f9ef89 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1395,6 +1395,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 	pgdat_init_kcompactd(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+	init_waitqueue_head(&pgdat->kcompmari_wait);
+#endif
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 
 	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23c7298d3b..1978c03b10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/lru_marie.h>
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -1314,6 +1315,28 @@ __always_inline bool __free_pages_prepare(struct page *page,
 	trace_mm_page_free(page, order);
 	kmsan_free_page(page, order);
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Wipe Marie's per-PFN state at the buddy handoff. Marie's reclaim
+	 * isolate path intentionally leaves marie_state[pfn]'s TRACKED bit
+	 * set across shrink_folio_list (so install_local's TRACKED early-
+	 * out keeps blocking concurrent installs on the in-flight folio);
+	 * this hook is the canonical point at which that stale bit must
+	 * disappear so the next allocation at this PFN starts clean. No-op
+	 * when TRACKED is already 0 (normal Marie del path cleared it).
+	 * Only the head PFN ever carries TRACKED for compound folios.
+	 *
+	 * Gated on marie_state_ready() (latched at marie_state[] alloc),
+	 * NOT lru_marie_enabled(): a Marie disable transition flips the
+	 * enable key false while marie_drain is still walking the bitmaps,
+	 * and freed pages in that window must still have their stale
+	 * TRACKED bits wiped here -- otherwise the drain walk dereferences
+	 * a re-allocated folio's poisoned list head and oopses.
+	 */
+	if (marie_state_ready())
+		lru_marie_free_page_hook(page_to_pfn(page));
+#endif
+
 	if (memcg_kmem_online() && PageMemcgKmem(page))
 		__memcg_kmem_uncharge_page(page, order);
 
@@ -4595,6 +4618,45 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
 		goto out;
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Marie swap-backend-failure OOM trigger.
+	 *
+	 * Catches "get_nr_swap_pages() > 0 but writes still fail" — primarily
+	 * ZRAM/zswap zs_malloc starvation when free RAM cannot satisfy the
+	 * compression buffer, but also disk swap I/O errors. In this state
+	 * can_reclaim_anon_pages() still reports true (slots appear free), so
+	 * the pick driver keeps attempting anon swapout that never completes;
+	 * left to run it grinds the file working set down to the clean_min_ratio
+	 * floor before the no-progress path finally OOMs. Trip OOM as soon as
+	 * the backend has rejected more than MAX_SWAP_WRITE_FAIL_RETRIES writes
+	 * during this allocation, well before that grind. The threshold
+	 * tolerates a handful of transient failures (concurrent ZRAM ops, brief
+	 * retry windows).
+	 *
+	 * The free+swap exhaustion case needs no early watermark here: the pick
+	 * driver and the legacy drain both withhold file reclaim at the
+	 * clean_min_ratio floor (marie_file_floor_protect), so once file is at
+	 * the floor and anon is unreclaimable reclaim returns no progress and
+	 * the stock no_progress_loops path OOMs promptly — mirroring le9uo's
+	 * reliance on the no-progress path at any floor size.
+	 *
+	 * Skipped for reserve / OOM-victim allocations (ALLOC_OOM,
+	 * ALLOC_NO_WATERMARKS, tsk_is_oom_victim): those contexts exist to let a
+	 * dying system make forward progress. MGLRU/Legacy builds
+	 * (lru_marie_enabled()=false) keep vanilla retry semantics so this does
+	 * not leak into baseline comparisons.
+	 */
+	if (lru_marie_enabled() &&
+	    likely(!(alloc_flags & (ALLOC_OOM | ALLOC_NO_WATERMARKS))) &&
+	    likely(!tsk_is_oom_victim(current))) {
+		long swap_fail_delta = atomic_long_read(&nr_swap_write_failed) -
+				       ac->initial_swap_write_failed;
+
+		if (swap_fail_delta > MAX_SWAP_WRITE_FAIL_RETRIES)
+			goto out;
+	}
+#endif
 
 	/*
 	 * Keep reclaiming pages while there is a chance this will lead
@@ -4718,6 +4780,20 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 	}
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Snapshot the global swap-write-fail counter at the start of this
+	 * allocation. should_reclaim_retry compares against this baseline so
+	 * "swap backend rejected N writes since I started trying" can short-
+	 * circuit the MAX_RECLAIM_RETRIES wait. See include/linux/swap.h.
+	 *
+	 * Snapshot unconditionally under CONFIG_LRU_MARIE so the field is in a
+	 * defined state even if the lru_marie_enabled() gate flips between
+	 * here and should_reclaim_retry's read.
+	 */
+	ac->initial_swap_write_failed = atomic_long_read(&nr_swap_write_failed);
+#endif
+
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
diff --git a/mm/page_io.c b/mm/page_io.c
index 70cea9e24d..3c958fbda1 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -25,8 +25,19 @@
 #include <linux/sched/task.h>
 #include <linux/delayacct.h>
 #include <linux/zswap.h>
+#include <linux/kfifo.h>
+#include <linux/lru_marie.h>
 #include "swap.h"
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * Counter consumed by the early-OOM gate in
+ * mm/page_alloc.c:should_reclaim_retry. Declared in include/linux/swap.h.
+ * Marie-only: omitted entirely under CONFIG_LRU_MARIE=n.
+ */
+atomic_long_t nr_swap_write_failed = ATOMIC_LONG_INIT(0);
+#endif
+
 static void __end_swap_bio_write(struct bio *bio)
 {
 	struct folio *folio = bio_first_folio_all(bio);
@@ -39,7 +50,21 @@ static void __end_swap_bio_write(struct bio *bio)
 		 * very quickly.
 		 *
 		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
+		 *
+		 * Bump nr_swap_write_failed so the early-OOM gate in
+		 * should_reclaim_retry can short-circuit the
+		 * MAX_RECLAIM_RETRIES wait when the swap backend (most
+		 * commonly ZRAM/zswap zs_malloc, or a real disk error) has
+		 * stopped accepting writes — anon reclaim is doomed in that
+		 * state regardless of get_nr_swap_pages() reporting free
+		 * entries. Marie-only signal; vanilla MGLRU/Legacy builds
+		 * (lru_marie_enabled()=false) skip the counter bump so the
+		 * baseline allocator sees vanilla retry behaviour.
 		 */
+#ifdef CONFIG_LRU_MARIE
+		if (lru_marie_enabled())
+			atomic_long_inc(&nr_swap_write_failed);
+#endif
 		folio_mark_dirty(folio);
 		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
 				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
@@ -233,6 +258,120 @@ static void swap_zeromap_folio_clear(struct folio *folio)
 	}
 }
 
+/*
+ * do_swapout() - Write a folio to swap space and drop one reference
+ * @folio: The folio to write out
+ * @swap_plug: Plug forwarded to __swap_writepage(); may be NULL
+ *
+ * Tries zswap first and falls back to __swap_writepage() otherwise. Either
+ * way the folio is unlocked and its reference count is decremented once.
+ */
+static inline void do_swapout(struct folio *folio, struct swap_iocb **swap_plug)
+{
+	if (!zswap_store(folio)) {
+		/* Not taken by zswap: __swap_writepage() unlocks the folio */
+		__swap_writepage(folio, swap_plug);
+	} else {
+		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
+		folio_unlock(folio);
+	}
+	folio_put(folio);	/* drop one folio reference */
+}
+
+#ifdef CONFIG_LRU_MARIE
+/*
+ * kcompmari_store() - Off-load folio compression to kcompmari
+ * @folio: The folio to compress
+ * @swap_plug: Plug handed to do_swapout() if a queued folio is written inline
+ *
+ * Queues @folio on the fifo of the node the caller is running on. All
+ * fifo accesses take kcompmari_fifo_lock: the node is picked with
+ * numa_node_id(), so several kswapd threads may target the same fifo.
+ *
+ * Returns true if @folio was queued, false if the caller must write it.
+ */
+static bool kcompmari_store(struct folio *folio, struct swap_iocb **swap_plug)
+{
+	pg_data_t *pgdat = NODE_DATA(numa_node_id());
+	unsigned int ret;
+	struct folio *head = NULL;
+
+	/* Only kswapd can use kcompmari */
+	if (!current_is_kswapd())
+		return false;
+
+	/* Mode 0, or mode 1 with Marie off — short-circuit on the static branches. */
+	if (!kcompmari_active())
+		return false;
+
+	/* kthread must be running */
+	if (unlikely(!pgdat->kcompmari))
+		return false;
+
+	/* We can only off-load anon folios */
+	if (!folio_test_anon(folio))
+		return false;
+
+	/* No zswap writeback for this memcg: leave it to the synchronous
+	 * path. folio_memcg -> obj_cgroup_memcg requires RCU read-side held
+	 * to keep objcg from being freed by a concurrent memcg teardown
+	 * (lockdep_assert_once in obj_cgroup_memcg). */
+	{
+		bool zswap_wb_ok;
+
+		rcu_read_lock();
+		zswap_wb_ok = mem_cgroup_zswap_writeback_enabled(folio_memcg(folio));
+		rcu_read_unlock();
+		if (!zswap_wb_ok)
+			return false;
+	}
+
+	/* Swap device must be sync-efficient */
+	if (!zswap_is_enabled() &&
+		!data_race(__swap_entry_to_info(folio->swap)->flags & SWP_SYNCHRONOUS_IO))
+		return false;
+
+	/*
+	 * The kfifo backing storage is sized at KCOMPMARI_FIFO_SIZE (the
+	 * compile-time max). The effective queue depth is |vm_kcompmari|;
+	 * when current depth meets or exceeds that, treat the queue as
+	 * full and swap out the head folio synchronously to make space.
+	 */
+	scoped_guard(spinlock_irqsave, &pgdat->kcompmari_fifo_lock)
+		if (kfifo_len(&pgdat->kcompmari_fifo) >=
+			abs(READ_ONCE(vm_kcompmari)) * sizeof(struct folio *) &&
+			unlikely(!kfifo_out(&pgdat->kcompmari_fifo,
+					&head, sizeof(head))))
+			return false;
+
+	/* Increment the folio reference count to avoid it being freed */
+	folio_get(folio);
+
+	/* Enqueue under the fifo lock: this may not be the only producer */
+	ret = kfifo_in_locked(&pgdat->kcompmari_fifo, &folio, sizeof(folio),
+			      &pgdat->kcompmari_fifo_lock);
+	if (likely(ret))
+		/* We successfully enqueued the folio. wake up kcompmari */
+		wake_up_interruptible(&pgdat->kcompmari_wait);
+	else
+		/* Enqueue failed, so we must cancel the reference count */
+		folio_put(folio);
+
+	/* If the queue was full, the popped head folio is written here,
+	 * synchronously in the caller's context. */
+	if (head)
+		do_swapout(head, swap_plug);
+
+	return ret;
+}
+#else  /* !CONFIG_LRU_MARIE */
+static inline bool kcompmari_store(struct folio *folio,
+				   struct swap_iocb **swap_plug)
+{
+	return false;	/* nothing queued: swap_writeout() carries on itself */
+}
+#endif
+
 /*
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
@@ -272,6 +411,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	 */
 	swap_zeromap_folio_clear(folio);
 
+	/*
+	 * Compression within zswap and zram might block rmap, unmap
+	 * of both file and anon pages, try to do compression async
+	 * if possible
+	 */
+	if (kcompmari_store(folio, swap_plug))
+		return 0;
+
 	if (zswap_store(folio)) {
 		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
 		goto out_unlock;
@@ -292,6 +439,46 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	return ret;
 }
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * kcompmari() - Kernel thread for compressing folios
+ * @p: Pointer to pg_data_t structure
+ *
+ * Sleeps until folios are queued or kthread_stop() is requested, then drains
+ * the fifo through do_swapout(). The stop test is part of the wait condition
+ * because kthread_stop() only wakes the thread; it does not fill the fifo.
+ */
+int kcompmari(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct folio *folio;
+
+	/*
+	 * Run with PF_MEMALLOC | PF_KSWAPD so compression can allocate memory
+	 * without being restricted by the current allocation context.
+	 * PF_KSWAPD also prevents the Intel Graphics driver from crashing the
+	 * system in i915_gem_shrinker.c:i915_gem_shrinker_scan().
+	 */
+	current->flags |= PF_MEMALLOC | PF_KSWAPD;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(pgdat->kcompmari_wait,
+				kthread_should_stop() ||
+				!kfifo_is_empty(&pgdat->kcompmari_fifo));
+
+		while (kfifo_out_locked(&pgdat->kcompmari_fifo,
+				&folio, sizeof(folio), &pgdat->kcompmari_fifo_lock))
+			/*
+			 * kcompmari is async reclaim writeback; a NULL swap_plug
+			 * makes __swap_writepage submit each folio's bio at once
+			 * instead of batching it on a plug nobody would unplug.
+			 */
+			do_swapout(folio, NULL);
+	}
+	return 0;
+}
+#endif /* CONFIG_LRU_MARIE */
+
 static inline void count_swpout_vm_event(struct folio *folio)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/rmap.c b/mm/rmap.c
index 78b7fb5f36..0cfaf5bcce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -75,6 +75,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/mm_inline.h>
 #include <linux/oom.h>
+#include <linux/lru_marie.h>
 
 #include <asm/tlb.h>
 
@@ -981,6 +982,11 @@ static bool folio_referenced_one(struct folio *folio,
 		if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) {
 			if (lru_gen_look_around(&pvmw, nr))
 				referenced++;
+#ifdef CONFIG_LRU_MARIE
+		} else if (lru_marie_enabled() && pvmw.pte) {
+			if (lru_marie_look_around(&pvmw, nr))
+				referenced++;
+#endif
 		} else if (pvmw.pte) {
 			if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
 				referenced++;
diff --git a/mm/swap.c b/mm/swap.c
index 5cc44f0de9..5696ddebbd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -37,6 +37,7 @@
 #include <linux/page_idle.h>
 #include <linux/local_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/lru_marie.h>
 
 #include "internal.h"
 
@@ -73,11 +74,39 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
 static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
 		unsigned long *flagsp)
 {
-	if (folio_test_lru(folio)) {
-		folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
-		lruvec_del_folio(*lruvecp, folio);
-		__folio_clear_lru_flags(folio);
+	/*
+	 * PG_lru is the "on an LRU list, still holding +nr LRU accounting"
+	 * signal. A folio that Marie's reclaim isolate already claimed has
+	 * PG_lru clear and its Marie counters already wound down
+	 * (marie_account_evict_isolate); only its per-PFN TRACKED byte stays
+	 * set, until the buddy handoff (marie_state_drop_pfn_at_free). Gating
+	 * the Marie del path on PG_lru -- not TRACKED alone -- keeps such an
+	 * isolated folio from being evict-accounted a SECOND time here: that
+	 * double count (a TRACKED-only gate over-firing on the isolate path)
+	 * drove marie_nr_folios and the per-mlv scan counters negative, which
+	 * in turn fed a runaway reclaim scan. Legacy del is already
+	 * PG_lru-gated, so an off-LRU folio was always a no-op here anyway.
+	 *
+	 * For an on-LRU folio, TRACKED then selects Marie del over legacy
+	 * del. Both debit mz->lru_zone_size now (marie_update_lru_size is
+	 * unified with legacy update_lru_size), so the choice is about the
+	 * LIST, not the count: a TRACKED folio sits on a Marie self-loop, so
+	 * legacy lruvec_del_folio's list_del would corrupt it -- it must go
+	 * through lru_marie_release_folio, which unlinks the self-loop and
+	 * debits mz. See lru_marie_release_folio's contract in
+	 * <linux/lru_marie.h>.
+	 */
+	if (!folio_test_lru(folio))
+		return;
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled() && lru_marie_test_tracked(folio)) {
+		lru_marie_release_folio(folio, lruvecp, flagsp);
+		return;
 	}
+#endif
+	folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
+	lruvec_del_folio(*lruvecp, folio);
+	__folio_clear_lru_flags(folio);
 }
 
 /*
@@ -171,6 +200,27 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 		folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
 		move_fn(lruvec, folio);
 
+#ifdef CONFIG_LRU_MARIE
+		/*
+		 * lru_add's move_fn routes through lruvec_add_folio ->
+		 * lru_marie_add_folio, which (on success) installs the folio
+		 * into Marie and sets PG_lru itself. Marie's reclaim isolate
+		 * path claims folios with a lock-free folio_test_clear_lru and
+		 * does NOT hold lru_lock, so it can clear PG_lru in the window
+		 * between the install above and this trailing folio_set_lru.
+		 * Re-setting PG_lru here would then stamp PG_lru back onto a
+		 * folio the isolate path already owns and is about to free,
+		 * tripping "Bad page state |lru|" at free_unref_folios. When
+		 * the folio is Marie-tracked, install already published PG_lru;
+		 * skip the redundant (and racy) re-set. The non-lru_add move_fns
+		 * (rotate/activate/deactivate/lazyfree) are gated away from
+		 * Marie folios at their swap.c entry points, so they never
+		 * reach here for a tracked folio.
+		 */
+		if (lru_marie_enabled() && lru_marie_test_tracked(folio))
+			continue;
+#endif
+
 		folio_set_lru(folio);
 	}
 
@@ -215,6 +265,44 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
 	if (folio_test_unevictable(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * This rotate-batch move_fn can run in hardirq: the lru_move_tail
+	 * batch is flushed from folio_end_writeback() in the block-completion
+	 * IRQ (e.g. nvme_irq -> blk_mq_end_request_batch). Marie's
+	 * lruvec_del_folio / lruvec_add_folio_tail hooks must not run there:
+	 * they assert !in_hardirq(), and lru_marie_add_folio() would ADOPT the
+	 * folio into Marie (which never credits mz->lru_zone_size) right after
+	 * the legacy del already did mz -nr, underflowing mz->lru_zone_size.
+	 *
+	 * Under Marie, handle the rotate without those hooks:
+	 *   - a tracked folio does not sit on the legacy list and ages by gen
+	 *     rotation, so rotate-to-tail is a no-op -- skip it;
+	 *   - a non-tracked folio is on the legacy lruvec list (mz-accounted),
+	 *     so rotate it with pure legacy list ops (this mirrors
+	 *     lruvec_del_folio + lruvec_add_folio_tail for an evictable,
+	 *     non-tracked folio, minus the Marie/lru_gen hooks).
+	 */
+	if (lru_marie_enabled()) {
+		long nr_pages = folio_nr_pages(folio);
+		int zid = folio_zonenum(folio);
+		enum lru_list lru;
+
+		if (lru_marie_test_tracked(folio))
+			return;
+
+		lru = folio_lru_list(folio);
+		list_del(&folio->lru);
+		update_lru_size(lruvec, lru, zid, -nr_pages);
+		folio_clear_active(folio);
+		lru = folio_lru_list(folio);
+		update_lru_size(lruvec, lru, zid, nr_pages);
+		list_add_tail(&folio->lru, &lruvec->lists[lru]);
+		__count_vm_events(PGROTATED, nr_pages);
+		return;
+	}
+#endif
+
 	lruvec_del_folio(lruvec, folio);
 	folio_clear_active(folio);
 	lruvec_add_folio_tail(lruvec, folio);
@@ -234,6 +322,14 @@ void folio_rotate_reclaimable(struct folio *folio)
 	    folio_test_unevictable(folio) || !folio_test_lru(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Marie folios bypass legacy LRU lists; apply the rotate on the
+	 * per-PFN state (demote toward prompt reclaim) instead of queueing
+	 * the legacy lru_move_tail batch. See lru_marie_rotate(). */
+	if (lru_marie_rotate(folio))
+		return;
+#endif
+
 	folio_batch_add_and_move(folio, lru_move_tail);
 }
 
@@ -304,6 +400,12 @@ void lru_note_cost_refault(struct folio *folio)
 				folio_nr_pages(folio), 0);
 }
 
+/*
+ * lru_marie_orphan_add() (the non-adopting legacy add for untracked orphans
+ * inside a del+add move_fn) lives in mm/lru_marie/core.c so vmscan.c's reclaim
+ * putback can share it; declared in <linux/lru_marie.h>.
+ */
+
 static void lru_activate(struct lruvec *lruvec, struct folio *folio)
 {
 	long nr_pages = folio_nr_pages(folio);
@@ -311,10 +413,24 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
 	if (folio_test_active(folio) || folio_test_unevictable(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Tracked Marie folios are never on legacy lists (the swap.c entry
+	 * gates divert them); guard defensively, and route the untracked
+	 * orphan's re-add away from lru_marie_add_folio()'s adopt path.
+	 */
+	if (lru_marie_enabled() && lru_marie_test_tracked(folio))
+		return;
+#endif
 
 	lruvec_del_folio(lruvec, folio);
 	folio_set_active(folio);
-	lruvec_add_folio(lruvec, folio);
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled())
+		lru_marie_orphan_add(lruvec, folio, false);
+	else
+#endif
+		lruvec_add_folio(lruvec, folio);
 	trace_mm_lru_activate(folio);
 
 	__count_vm_events(PGACTIVATE, nr_pages);
@@ -336,6 +452,13 @@ void folio_activate(struct folio *folio)
 	    !folio_test_lru(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Marie folios bypass legacy LRU lists; apply the promote on the
+	 * per-PFN state instead of queueing the legacy lru_activate batch. */
+	if (lru_marie_activate(folio))
+		return;
+#endif
+
 	folio_batch_add_and_move(folio, lru_activate);
 }
 
@@ -351,6 +474,15 @@ void folio_activate(struct folio *folio)
 	if (!folio_test_clear_lru(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Re-publish the PG_lru this path speculatively cleared above; the
+	 * promote happened on the per-PFN state in lru_marie_activate(). */
+	if (lru_marie_activate(folio)) {
+		folio_set_lru(folio);
+		return;
+	}
+#endif
+
 	lruvec = folio_lruvec_lock_irq(folio);
 	lru_activate(lruvec, folio);
 	lruvec_unlock_irq(lruvec);
@@ -466,6 +598,32 @@ void folio_mark_accessed(struct folio *folio)
 		lru_gen_inc_refs(folio);
 		return;
 	}
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Marie: do NOT feed the explicit access signal into the tier.
+	 *
+	 * Tier has no decay -- marie_state_inc_tier only ever raises it
+	 * (reset to 0 happens solely on the saturate->head promotion), and
+	 * survivor re-publish preserves it (target_tier = max(prev, w)). So
+	 * a per-access bump, which fires on essentially every read / pagecache
+	 * hit / fault (filemap_read, __filemap_get_folio, shmem, gup, ...),
+	 * is unbounded and monotonic: folios pin at hot_votes >= 1 in
+	 * folio_check_references (permanent KEEP) or churn on the head-gen
+	 * promotion treadmill. Under memory pressure that starves reclaim --
+	 * file stalls above the clean_min_ratio floor and anon is never
+	 * swapped -- and the machine OOMs with swap free.
+	 *
+	 * Marie's hotness instead comes from the (rate-limited, kswapd-driven)
+	 * walker young-bit scan plus the rmap young bits read at reclaim time
+	 * in folio_check_references; both self-pace and clear, so they do not
+	 * accumulate. Routing folio_mark_accessed here can be revisited only
+	 * once tier gains a decay/aging mechanism. lru_marie_mark_accessed()
+	 * is kept (dormant) for that future. Return without falling through to
+	 * the legacy activate path, which would re-tier via lruvec_add_folio.
+	 */
+	if (lru_marie_enabled())
+		return;
+#endif
 
 	if (!folio_test_referenced(folio)) {
 		folio_set_referenced(folio);
@@ -510,9 +668,24 @@ void folio_add_lru(struct folio *folio)
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
 	/* see the comment in lru_gen_folio_seq() */
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Marie bypass: Marie tracks folios via per-PFN state bytes,
+	 * not on legacy/MGLRU lists, and does not use PG_active. If
+	 * we set it here, the folio enters Marie with PG_active=1;
+	 * later marie_state_shrink_lruvec -> shrink_folio_list trips
+	 * VM_BUG_ON_FOLIO(folio_test_active(folio), folio) in
+	 * mm/vmscan.c. Skip the MGLRU fault hint when Marie owns
+	 * the LRU. (See also defensive clear in lru_marie_add_folio.)
+	 */
+	if (!lru_marie_enabled() && lru_gen_enabled() && !folio_test_unevictable(folio) &&
+	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+		folio_set_active(folio);
+#else
 	if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
 	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
 		folio_set_active(folio);
+#endif
 
 	folio_batch_add_and_move(folio, lru_add);
 }
@@ -569,6 +742,11 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
 	if (folio_mapped(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled() && lru_marie_test_tracked(folio))
+		return;
+#endif
+
 	lruvec_del_folio(lruvec, folio);
 	folio_clear_active(folio);
 	folio_clear_referenced(folio);
@@ -580,14 +758,24 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
 		 * race window is _really_ small and  it's not a critical
 		 * problem.
 		 */
-		lruvec_add_folio(lruvec, folio);
+#ifdef CONFIG_LRU_MARIE
+		if (lru_marie_enabled())
+			lru_marie_orphan_add(lruvec, folio, false);
+		else
+#endif
+			lruvec_add_folio(lruvec, folio);
 		folio_set_reclaim(folio);
 	} else {
 		/*
 		 * The folio's writeback ended while it was in the batch.
 		 * We move that folio to the tail of the inactive list.
 		 */
-		lruvec_add_folio_tail(lruvec, folio);
+#ifdef CONFIG_LRU_MARIE
+		if (lru_marie_enabled())
+			lru_marie_orphan_add(lruvec, folio, true);
+		else
+#endif
+			lruvec_add_folio_tail(lruvec, folio);
 		__count_vm_events(PGROTATED, nr_pages);
 	}
 
@@ -605,10 +793,20 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
 	if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled() && lru_marie_test_tracked(folio))
+		return;
+#endif
+
 	lruvec_del_folio(lruvec, folio);
 	folio_clear_active(folio);
 	folio_clear_referenced(folio);
-	lruvec_add_folio(lruvec, folio);
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled())
+		lru_marie_orphan_add(lruvec, folio, false);
+	else
+#endif
+		lruvec_add_folio(lruvec, folio);
 
 	__count_vm_events(PGDEACTIVATE, nr_pages);
 	count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
@@ -622,6 +820,11 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
 	    folio_test_swapcache(folio) || folio_test_unevictable(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled() && lru_marie_test_tracked(folio))
+		return;
+#endif
+
 	lruvec_del_folio(lruvec, folio);
 	folio_clear_active(folio);
 	if (lru_gen_enabled())
@@ -634,7 +837,12 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
 	 * anonymous folios
 	 */
 	folio_clear_swapbacked(folio);
-	lruvec_add_folio(lruvec, folio);
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled())
+		lru_marie_orphan_add(lruvec, folio, false);
+	else
+#endif
+		lruvec_add_folio(lruvec, folio);
 
 	__count_vm_events(PGLAZYFREE, nr_pages);
 	count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
@@ -698,6 +906,13 @@ void deactivate_file_folio(struct folio *folio)
 	if (lru_gen_enabled() && lru_gen_clear_refs(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Marie folios bypass legacy LRU lists; apply the demote on the
+	 * per-PFN state instead of queueing the legacy batch. */
+	if (lru_marie_deactivate(folio))
+		return;
+#endif
+
 	folio_batch_add_and_move(folio, lru_deactivate_file);
 }
 
@@ -717,6 +932,13 @@ void folio_deactivate(struct folio *folio)
 	if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Marie folios bypass legacy LRU lists; apply the demote on the
+	 * per-PFN state instead of queueing the legacy batch. */
+	if (lru_marie_deactivate(folio))
+		return;
+#endif
+
 	folio_batch_add_and_move(folio, lru_deactivate);
 }
 
@@ -734,6 +956,14 @@ void folio_mark_lazyfree(struct folio *folio)
 	    folio_test_swapcache(folio) || folio_test_unevictable(folio))
 		return;
 
+#ifdef CONFIG_LRU_MARIE
+	/* Marie folios bypass legacy LRU lists; lru_marie_lazyfree() clears
+	 * PG_swapbacked synchronously (MADV_FREE: free-without-writeback) and
+	 * demotes on the per-PFN state instead of queueing the legacy batch. */
+	if (lru_marie_lazyfree(folio))
+		return;
+#endif
+
 	folio_batch_add_and_move(folio, lru_lazyfree);
 }
 
diff --git a/mm/swap.h b/mm/swap.h
index a77016f242..7f499ac84c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -227,6 +227,10 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
 void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+#ifdef CONFIG_LRU_MARIE
+/* CONFIG_SWAP is implied by this branch (mm/swap.h's CONFIG_SWAP gate). */
+int kcompmari(void *p);
+#endif
 
 /* linux/mm/swap_state.c */
 extern struct address_space swap_space __read_mostly;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa125..3f09c89868 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/lru_marie.h>
 #include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
@@ -181,6 +182,36 @@ struct scan_control {
 	struct reclaim_state reclaim_state;
 };
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * Accessors for reclaim code living outside this file (mm/lru_marie).
+ * Declared in mm/internal.h; struct scan_control itself stays private
+ * to vmscan.c. Two plain field reads (priority, reclaim_idx), one
+ * "target reached" test (nr_reclaimed >= nr_to_reclaim) and one
+ * nr_reclaimed += nr update. EXPORT_SYMBOL_GPL is unnecessary --
+ * Marie is built into vmlinux when CONFIG_LRU_MARIE is on, never a module.
+ */
+int sc_priority(const struct scan_control *sc)
+{
+	return sc->priority;
+}
+
+int sc_reclaim_idx(const struct scan_control *sc)
+{
+	return sc->reclaim_idx;
+}
+
+bool sc_reclaim_target_reached(const struct scan_control *sc)
+{
+	return sc->nr_reclaimed >= sc->nr_to_reclaim;
+}
+
+void sc_add_reclaimed(struct scan_control *sc, unsigned long nr)
+{
+	sc->nr_reclaimed += nr;
+}
+#endif
+
 #ifdef ARCH_HAS_PREFETCHW
 #define prefetchw_prev_lru_folio(_folio, _base, _field)			\
 	do {								\
@@ -197,8 +228,19 @@ struct scan_control {
 
 /*
  * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
+ *
+ * CONFIG_LRU_MARIE=y: default 1. Under Marie's per-PFN reclaim
+ * driver this maps to MARIE_PICK_FILE_THEN_ANON -- anon is fully
+ * protected until clean_min_ratio is breached, matching the
+ * ZRAM-era assumption that file pagecache is cheaper to refault
+ * than anon is to swap. See the storage-tier rationale at the top
+ * of mm/lru_marie/core.c.
  */
+#ifdef CONFIG_LRU_MARIE
+int vm_swappiness = 1;
+#else
 int vm_swappiness = 60;
+#endif
 
 #ifdef CONFIG_MEMCG
 
@@ -397,10 +439,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
 	struct zone *zone;
 
 	for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
-		if (!mem_cgroup_disabled())
+		if (!mem_cgroup_disabled()) {
+			/*
+			 * mz->lru_zone_size accounts Marie-tracked folios too
+			 * (marie_update_lru_size credits/debits mz like legacy
+			 * update_lru_size), so no Marie-specific summing here.
+			 */
 			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
-		else
+		} else {
 			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
+		}
 	}
 	return size;
 }
@@ -458,6 +506,45 @@ static int reclaimer_offset(struct scan_control *sc)
 	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * Wrapper for mm/lru_marie, which is handed @sc but cannot call the
+ * file-local (static) reclaimer_offset() above. Forwards @sc
+ * unchanged and returns reclaimer_offset()'s result as-is.
+ */
+int vmscan_reclaimer_offset(struct scan_control *sc)
+{
+	return reclaimer_offset(sc);
+}
+
+/*
+ * cgroup_reclaim() is static above and struct scan_control is private
+ * to vmscan.c. Marie needs the same predicate to gate its PGSCAN_* /
+ * PGSTEAL_* event accounting (cgroup-scoped reclaim must not bump the
+ * global vm events). Exposed as an sc_* accessor like sc_priority();
+ * the cast only drops const to match cgroup_reclaim()'s parameter type.
+ */
+bool sc_cgroup_reclaim(const struct scan_control *sc)
+{
+	return cgroup_reclaim((struct scan_control *)sc);
+}
+
+/*
+ * can_reclaim_anon_pages() is static above. Marie's pick driver needs
+ * the same predicate: when anon cannot be reclaimed at all (no free
+ * swap slots, cgroup swap limit hit, no demotion target) every ANON
+ * pick reclaims nothing and the swappiness/bias controller is
+ * meaningless, so Marie must force FILE reclaim -- mirroring
+ * get_scan_count()'s "!can_reclaim_anon_pages -> SCAN_FILE" forcing.
+ * @memcg, @nid and @sc are forwarded unchanged to the static helper.
+ */
+bool vmscan_can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid,
+				   struct scan_control *sc)
+{
+	return can_reclaim_anon_pages(memcg, nid, sc);
+}
+#endif
+
 /*
  * We detected a synchronous write error writing a folio out.  Probably
  * -ENOSPC.  We need to propagate that into the address_space for a subsequent
@@ -885,7 +972,11 @@ static enum folio_references folio_check_references(struct folio *folio,
 	if (referenced_ptes == -1)
 		return FOLIOREF_KEEP;
 
+#ifdef CONFIG_LRU_MARIE
+	if (!lru_marie_enabled() && lru_gen_enabled() && !lru_gen_switching()) {
+#else
 	if (lru_gen_enabled() && !lru_gen_switching()) {
+#endif
 		if (!referenced_ptes)
 			return FOLIOREF_RECLAIM;
 
@@ -894,6 +985,30 @@ static enum folio_references folio_check_references(struct folio *folio,
 
 	referenced_folio = folio_test_clear_referenced(folio);
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled()) {
+		unsigned int tier = folio_marie_get_tier(folio);
+		int hot_votes;
+
+		hot_votes = (tier > 0) + (referenced_ptes > 0) + !!referenced_folio;
+
+		if (hot_votes >= 2 || referenced_ptes > 1)
+			return FOLIOREF_ACTIVATE;
+
+		if (referenced_ptes > 0 && (vm_flags & VM_EXEC) &&
+		    						folio_is_file_lru(folio))
+			return FOLIOREF_ACTIVATE;
+
+		if (hot_votes == 1 && referenced_folio && folio_is_file_lru(folio))
+			return FOLIOREF_RECLAIM_CLEAN;
+
+		if (hot_votes >= 1)
+			return FOLIOREF_KEEP;
+
+		return FOLIOREF_RECLAIM;
+	}
+#endif
+
 	if (referenced_ptes) {
 		/*
 		 * All mapped folios start out with page table
@@ -1053,9 +1168,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
 }
 
 /*
- * shrink_folio_list() returns the number of reclaimed pages
+ * shrink_folio_list() returns the number of reclaimed pages.
+ *
+ * Exposed via mm/internal.h so that mm/lru_marie can drive its own
+ * isolate->shrink->putback loop without duplicating the per-folio
+ * reclaim machinery.
  */
-static unsigned int shrink_folio_list(struct list_head *folio_list,
+unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
 		struct reclaim_stat *stat, bool ignore_references,
 		struct mem_cgroup *memcg)
@@ -1916,7 +2035,34 @@ static unsigned int move_folios_to_lru(struct list_head *list)
 			continue;
 		}
 
-		lruvec_add_folio(lruvec, folio);
+		/*
+		 * All pages were isolated from the same lruvec (and isolation
+		 * inhibits memcg migration).
+		 */
+		VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+#ifdef CONFIG_LRU_MARIE
+		/*
+		 * Legacy reclaim putback. Under Marie this is reached for the
+		 * untracked orphans that legacy shrink_{in,}active_list isolates
+		 * off legacy lists (e.g. workingset-refault activations routed
+		 * through folio_activate). Routing their re-add through
+		 * lruvec_add_folio() -> lru_marie_add_folio() would ADOPT them
+		 * into Marie -- the exact adopt asymmetry fixed for swap.c's
+		 * move_fns. Mirror that fix here: a tracked folio must never
+		 * have been on a legacy list (WARN; hand it back to Marie, whose
+		 * install early-out re-asserts ownership), and an untracked
+		 * orphan gets a pure non-adopting legacy add.
+		 */
+		if (lru_marie_enabled()) {
+			if (unlikely(lru_marie_test_tracked(folio))) {
+				VM_WARN_ON_ONCE_FOLIO(1, folio);
+				lruvec_add_folio(lruvec, folio);
+			} else {
+				lru_marie_orphan_add(lruvec, folio, false);
+			}
+		} else
+#endif
+			lruvec_add_folio(lruvec, folio);
 		nr_pages = folio_nr_pages(folio);
 		nr_moved += nr_pages;
 		if (folio_test_active(folio))
@@ -5307,6 +5453,55 @@ static bool drain_evictable(struct lruvec *lruvec)
 	return true;
 }
 
+/*
+ * lru_gen_fill_lruvec - hand off legacy LRU residue to MGLRU.
+ *
+ * Move every folio currently on lruvec->lists[lru] into lrugen via
+ * the canonical lru_gen_add_folio path. Symmetric counterpart to
+ * lru_gen_drain_lruvec below; exported so external LRU drivers
+ * (mm/lru_marie) can call it after their own drain pass to keep
+ * MGLRU's state_is_valid invariant ("lrugen enabled => legacy
+ * lists empty") intact across enable/disable cycles of the other
+ * driver.
+ *
+ * Caller must hold @lruvec->lru_lock with IRQs disabled; it is held
+ * again on return. While fill_evictable() returns false the lock is
+ * dropped around cond_resched() and retaken, as lru_gen_change_state
+ * does, so lru_lock-protected state may change during the call.
+ */
+void lru_gen_fill_lruvec(struct lruvec *lruvec)
+{
+	while (!fill_evictable(lruvec)) {
+		spin_unlock_irq(&lruvec->lru_lock);
+		cond_resched();
+		spin_lock_irq(&lruvec->lru_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(lru_gen_fill_lruvec);
+
+/*
+ * lru_gen_drain_lruvec - evacuate lrugen via the canonical add path.
+ *
+ * Inverse of lru_gen_fill_lruvec: empty lrugen by removing every folio
+ * via lru_gen_del_folio and re-adding via lruvec_add_folio. With
+ * another LRU driver's gate on (e.g. Marie) the re-add routes through
+ * that driver's install path -- sparing Marie a reimplementation of
+ * MGLRU's accounting and giving the folio Marie's canonical per-PFN
+ * install. With no other driver the folios land on lruvec->lists[lru].
+ *
+ * Caller holds @lruvec->lru_lock, IRQs off; it is dropped around
+ * cond_resched() while drain_evictable() returns false, then retaken.
+ */
+void lru_gen_drain_lruvec(struct lruvec *lruvec)
+{
+	while (!drain_evictable(lruvec)) {
+		spin_unlock_irq(&lruvec->lru_lock);
+		cond_resched();
+		spin_lock_irq(&lruvec->lru_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(lru_gen_drain_lruvec);
+
 static void lru_gen_change_state(bool enabled)
 {
 	static DEFINE_MUTEX(state_mutex);
@@ -5827,7 +6022,15 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
-	lrugen->enabled = lru_gen_enabled();
+	/*
+	 * lrugen->enabled mirrors the raw MGLRU core key, not the
+	 * Marie-masked lru_gen_enabled() view: it must stay true when MGLRU
+	 * is configured-on even while Marie masks MGLRU off, so the
+	 * Marie-disable ownership handoff (lru_gen_fill_lruvec ->
+	 * fill_evictable -> lru_gen_add_folio, which bails on !lrugen->enabled)
+	 * can migrate folios back onto lrugen.
+	 */
+	lrugen->enabled = lru_gen_core_enabled();
 
 	for (i = 0; i <= MIN_NR_GENS + 1; i++)
 		lrugen->timestamps[i] = jiffies;
@@ -5927,7 +6130,23 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	bool proportional_reclaim;
 	struct blk_plug plug;
+#ifdef CONFIG_LRU_MARIE
+	unsigned int marie_drain_mask = MARIE_DRAIN_ANON | MARIE_DRAIN_FILE;
+#endif
 
+#ifdef CONFIG_LRU_MARIE
+	if (lru_marie_enabled()) {
+		marie_drain_mask = lru_marie_shrink_lruvec(lruvec, sc);
+		/*
+		 * Fall through to the legacy reclaim path below to drain orphan
+		 * folios (failed Marie install, drain/reparent handoffs) that
+		 * landed on lruvec->lists; MGLRU is bypassed under Marie. The
+		 * drain is constrained by marie_drain_mask below so it touches
+		 * only the type(s) Marie scanned. Common case: the lists are
+		 * empty and this is a cheap no-op.
+		 */
+	} else
+#endif
 	if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) {
 		lru_gen_shrink_lruvec(lruvec, sc);
 
@@ -5938,6 +6157,29 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 
 	get_scan_count(lruvec, sc, nr);
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Constrain the legacy orphan drain to the type(s) Marie's pick driver
+	 * actually scanned this call (marie_drain_mask). Stock get_scan_count's
+	 * policy (SCAN_EQUAL at sc->priority==0, SCAN_ANON on file_is_tiny)
+	 * ignores Marie's swappiness / clean_min_ratio / ANON_STRICT decisions
+	 * and would otherwise cut the protected type behind the driver's back
+	 * (e.g. evicting file at vm.swappiness=200, or swapping at swappiness=0).
+	 * Zero the nr[] of any type Marie did not scan. Marie-only; the
+	 * legacy/MGLRU nr[] is left byte-identical.
+	 */
+	if (lru_marie_enabled()) {
+		if (!(marie_drain_mask & MARIE_DRAIN_FILE)) {
+			nr[LRU_ACTIVE_FILE] = 0;
+			nr[LRU_INACTIVE_FILE] = 0;
+		}
+		if (!(marie_drain_mask & MARIE_DRAIN_ANON)) {
+			nr[LRU_ACTIVE_ANON] = 0;
+			nr[LRU_INACTIVE_ANON] = 0;
+		}
+	}
+#endif
+
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
 
@@ -6193,14 +6435,32 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	struct lruvec *target_lruvec;
 	bool reclaimable = false;
 
-	if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) {
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * MGLRU's root-reclaim shortcut bypasses shrink_node_memcgs entirely,
+	 * which is where shrink_lruvec — and therefore Marie — gets invoked.
+	 * When lru_marie_enabled() that bypass would leave kswapd walking empty
+	 * MGLRU gens (since folios live in Marie gens) and never touching Marie
+	 * at all.  Gate the shortcut on !lru_marie_enabled() so kswapd takes the
+	 * standard shrink_node_memcgs path under Marie.
+	 */
+	if (!lru_marie_enabled() &&
+	    (lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) {
 		memset(&sc->nr, 0, sizeof(sc->nr));
 		lru_gen_shrink_node(pgdat, sc);
 
 		if (!lru_gen_switching())
 			return;
+	}
+#else
+	if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) {
+		memset(&sc->nr, 0, sizeof(sc->nr));
+		lru_gen_shrink_node(pgdat, sc);
 
+		if (!lru_gen_switching())
+			return;
 	}
+#endif
 
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
@@ -6469,6 +6729,17 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
 	struct lruvec *target_lruvec;
 	unsigned long refaults;
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Marie has no equivalent of legacy refault tracking yet, and the
+	 * legacy WORKINGSET_* counters don't reflect Marie state — skip the
+	 * snapshot to avoid feeding MGLRU/legacy-tuned heuristics with stale
+	 * data.
+	 */
+	if (lru_marie_enabled())
+		return;
+#endif
+
 	if (lru_gen_enabled() && !lru_gen_switching())
 		return;
 
@@ -6859,6 +7130,21 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
+#ifdef CONFIG_LRU_MARIE
+	/*
+	 * Marie: drive proactive aging from kswapd's pre-reclaim hook so the
+	 * gen ring has accurate hot/cold ordering by the time direct reclaim
+	 * picks the tail.  lru_marie_age_node() walks running tasks' PTEs
+	 * (rate-limited internally); the early return then skips the legacy
+	 * active-list deactivation below -- under Marie the legacy lists only
+	 * hold orphans (e.g. mempool failures), not worth aging.
+	 */
+	if (lru_marie_enabled()) {
+		lru_marie_age_node(pgdat, sc);
+		return;
+	}
+#endif
+
 	if (lru_gen_enabled() || lru_gen_switching()) {
 		lru_gen_age_node(pgdat, sc);
 
@@ -7632,6 +7918,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 void __meminit kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+	int ret;
+#endif
 
 	pgdat_kswapd_lock(pgdat);
 	if (!pgdat->kswapd) {
@@ -7645,7 +7934,32 @@ void __meminit kswapd_run(int nid)
 		} else {
 			wake_up_process(pgdat->kswapd);
 		}
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+		ret = kfifo_alloc(&pgdat->kcompmari_fifo,
+				KCOMPMARI_FIFO_SIZE * sizeof(struct folio *),
+				GFP_KERNEL);
+		if (ret) {
+			pr_err("%s: fail to kfifo_alloc\n", __func__);
+			goto out;
+		}
+
+		pr_info("kcompmari (forked from kcompressd-unofficial by Masahito Suzuki, originally Kcompressd by Qun-Wei Lin from MediaTek)\n");
+		spin_lock_init(&pgdat->kcompmari_fifo_lock);
+		pgdat->kcompmari = kthread_create_on_node(kcompmari, pgdat, nid,
+				"kcompmari%d", nid);
+		if (IS_ERR(pgdat->kcompmari)) {
+			pr_err("Failed to start kcompmari on node %d，ret=%ld\n",
+					nid, PTR_ERR(pgdat->kcompmari));
+			pgdat->kcompmari = NULL;
+			kfifo_free(&pgdat->kcompmari_fifo);
+		} else {
+			wake_up_process(pgdat->kcompmari);
+		}
+#endif
 	}
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+out:
+#endif
 	pgdat_kswapd_unlock(pgdat);
 }
 
@@ -7664,16 +7978,60 @@ void __meminit kswapd_stop(int nid)
 		kthread_stop(kswapd);
 		pgdat->kswapd = NULL;
 	}
+#if defined(CONFIG_LRU_MARIE) && defined(CONFIG_SWAP)
+	if (pgdat->kcompmari) {
+		kthread_stop(pgdat->kcompmari);
+		pgdat->kcompmari = NULL;
+		kfifo_free(&pgdat->kcompmari_fifo);
+	}
+#endif
 	pgdat_kswapd_unlock(pgdat);
 }
 
+#ifdef CONFIG_LRU_MARIE
+/*
+ * vm.swappiness sysctl handler for the Marie LRU controller. Calls
+ * the default proc_dointvec_minmax to perform range-checked storage
+ * into vm_swappiness, then, on a successful write, notifies Marie so
+ * it can reset every per-lruvec swap_bias counter. The notification
+ * is skipped on read or when proc_dointvec_minmax returns an error;
+ * the handler's return value is proc_dointvec_minmax's, unchanged.
+ *
+ * Note: we always notify on a successful write even when the new
+ * value equals the old one. The cost is one xa walk; the alternative
+ * (snapshot+compare) would require atomicity guarantees that
+ * proc_dointvec_minmax does not provide, and gives no practical
+ * benefit since reset-to-zero is idempotent.
+ */
+static int marie_swappiness_sysctl_handler(const struct ctl_table *table,
+					   int write, void *buffer,
+					   size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (write && !ret)
+		lru_marie_swappiness_changed();
+	return ret;
+}
+#endif
+
 static const struct ctl_table vmscan_sysctl_table[] = {
 	{
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
 		.maxlen		= sizeof(vm_swappiness),
 		.mode		= 0644,
+#ifdef CONFIG_LRU_MARIE
+		/*
+		 * Marie wraps the default minmax handler so that a sysctl
+		 * write resets every per-lruvec swap_bias counter to zero.
+		 * See mm/lru_marie/state.c::marie_swap_bias_update for the
+		 * controller this notification clears.
+		 */
+		.proc_handler	= marie_swappiness_sysctl_handler,
+#else
 		.proc_handler	= proc_dointvec_minmax,
+#endif
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_TWO_HUNDRED,
 	},
-- 
2.34.1