From fc4200cf9268485188fd162800ed04a07da54969 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Fri, 1 May 2026 19:47:25 +0900 Subject: [PATCH] linux7.1-rc1-Re-swappiness-v1.3 --- include/linux/mm_inline.h | 16 +- include/linux/mmzone.h | 29 ++- mm/vmscan.c | 426 ++++++++++++++++++++++++-------------- mm/workingset.c | 2 +- 4 files changed, 288 insertions(+), 185 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a171070e15..3b9f4f56cb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -162,9 +162,9 @@ static inline int folio_lru_gen(const struct folio *folio) return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } -static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) +static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen, int type) { - unsigned long max_seq = lruvec->lrugen.max_seq; + unsigned long max_seq = lruvec->lrugen.max_seq[type]; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); @@ -194,7 +194,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli /* addition */ if (old_gen < 0) { - if (lru_gen_is_active(lruvec, new_gen)) + if (lru_gen_is_active(lruvec, new_gen, type)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; @@ -202,20 +202,20 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli /* deletion */ if (new_gen < 0) { - if (lru_gen_is_active(lruvec, old_gen)) + if (lru_gen_is_active(lruvec, old_gen, type)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ - if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + if (!lru_gen_is_active(lruvec, old_gen, type) && lru_gen_is_active(lruvec, new_gen, type)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ - VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) 
&& !lru_gen_is_active(lruvec, new_gen)); + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen, type) && !lru_gen_is_active(lruvec, new_gen, type)); } static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, @@ -249,7 +249,7 @@ static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, else gen = MAX_NR_GENS - folio_test_workingset(folio); - return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); + return max(READ_ONCE(lrugen->max_seq[type]) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) @@ -294,7 +294,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ - flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; + flags = !reclaiming && lru_gen_is_active(lruvec, gen, folio_is_file_lru(folio)) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9adb2ad21d..9bc1dfb832 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -547,23 +547,20 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file - * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types so that they can be - * incremented independently. Ideally min_seq[] are kept in sync when both anon - * and file types are evictable. However, to adapt to situations like extreme - * swappiness, they are allowed to be out of sync by at most - * MAX_NR_GENS-MIN_NR_GENS-1. + * types so that they can be aged independently. 
The oldest generation numbers + * are stored in min_seq[] separately for anon and file types so that they can + * be incremented independently. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { - /* the aging increments the youngest generation number */ - unsigned long max_seq; + /* the aging increments the youngest generation numbers */ + unsigned long max_seq[ANON_AND_FILE]; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ - unsigned long timestamps[MAX_NR_GENS]; + unsigned long timestamps[ANON_AND_FILE][MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ @@ -599,12 +596,12 @@ enum { #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { - /* synced with max_seq after each iteration */ - unsigned long seq; + /* synced with max_seq[] after each iteration */ + unsigned long seq[ANON_AND_FILE]; /* where the current iteration continues after */ - struct list_head *head; + struct list_head *head[ANON_AND_FILE]; /* where the last iteration ended before */ - struct list_head *tail; + struct list_head *tail[ANON_AND_FILE]; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ @@ -614,8 +611,10 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* max_seq from lru_gen_folio: can be out of date */ - unsigned long seq; + /* max_seq[] from lru_gen_folio: can be out of date */ + unsigned long seq[ANON_AND_FILE]; + /* which type is being aged (LRU_GEN_ANON or LRU_GEN_FILE) */ + int aging_type; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ diff 
--git a/mm/vmscan.c b/mm/vmscan.c index bd1b1aa125..eafe77374d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -836,6 +836,12 @@ enum folio_references { }; #ifdef CONFIG_LRU_GEN + +#define RESWAPPINESS_PROGNAME "Re-swappiness: MGLRU anon/file independent aging" +#define RESWAPPINESS_AUTHOR "Masahito Suzuki" + +#define RESWAPPINESS_VERSION "1.3" + /* * Only used on a mapped folio in the eviction (rmap walk) path, where promotion * needs to be done by taking the folio off the LRU list and then adding it back @@ -2648,7 +2654,10 @@ static bool should_clear_pmd_young(void) ******************************************************************************/ #define DEFINE_MAX_SEQ(lruvec) \ - unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + unsigned long max_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.max_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.max_seq[LRU_GEN_FILE]), \ + } #define DEFINE_MIN_SEQ(lruvec) \ unsigned long min_seq[ANON_AND_FILE] = { \ @@ -2711,7 +2720,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) static int get_nr_gens(struct lruvec *lruvec, int type) { - return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; + return lruvec->lrugen.max_seq[type] - lruvec->lrugen.min_seq[type] + 1; } static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) @@ -2856,7 +2865,7 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); - mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + mm = list_entry(mm_state->head[walk->aging_type], struct mm_struct, lru_gen.list); key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) @@ -2886,8 +2895,10 @@ void lru_gen_add_mm(struct mm_struct *mm) struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ 
- if (mm_state->tail == &mm_list->fifo) - mm_state->tail = &mm->lru_gen.list; + for (int t = 0; t < ANON_AND_FILE; t++) { + if (mm_state->tail[t] == &mm_list->fifo) + mm_state->tail[t] = &mm->lru_gen.list; + } } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); @@ -2915,13 +2926,15 @@ void lru_gen_del_mm(struct mm_struct *mm) struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - /* where the current iteration continues after */ - if (mm_state->head == &mm->lru_gen.list) - mm_state->head = mm_state->head->prev; + for (int t = 0; t < ANON_AND_FILE; t++) { + /* where the current iteration continues after */ + if (mm_state->head[t] == &mm->lru_gen.list) + mm_state->head[t] = mm_state->head[t]->prev; - /* where the last iteration ended before */ - if (mm_state->tail == &mm->lru_gen.list) - mm_state->tail = mm_state->tail->next; + /* where the last iteration ended before */ + if (mm_state->tail[t] == &mm->lru_gen.list) + mm_state->tail[t] = mm_state->tail[t]->next; + } } list_del_init(&mm->lru_gen.list); @@ -2992,7 +3005,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - hist = lru_hist_from_seq(walk->seq); + hist = lru_hist_from_seq(walk->seq[walk->aging_type]); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], @@ -3001,7 +3014,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(walk->seq + 1); + hist = lru_hist_from_seq(walk->seq[walk->aging_type] + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); @@ -3013,13 +3026,14 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite bool first = false; bool last = false; struct mm_struct *mm = NULL; + int type = walk->aging_type; struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct 
lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* - * mm_state->seq is incremented after each iteration of mm_list. There + * mm_state->seq[] is incremented after each iteration of mm_list. There * are three interesting cases for this page table walker: * 1. It tries to start a new iteration with a stale max_seq: there is * nothing left to do. @@ -3030,28 +3044,28 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite */ spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); + VM_WARN_ON_ONCE(mm_state->seq[type] + 1 < walk->seq[type]); - if (walk->seq <= mm_state->seq) + if (walk->seq[type] <= mm_state->seq[type]) goto done; - if (!mm_state->head) - mm_state->head = &mm_list->fifo; + if (!mm_state->head[type]) + mm_state->head[type] = &mm_list->fifo; - if (mm_state->head == &mm_list->fifo) + if (mm_state->head[type] == &mm_list->fifo) first = true; do { - mm_state->head = mm_state->head->next; - if (mm_state->head == &mm_list->fifo) { - WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + mm_state->head[type] = mm_state->head[type]->next; + if (mm_state->head[type] == &mm_list->fifo) { + WRITE_ONCE(mm_state->seq[type], mm_state->seq[type] + 1); last = true; break; } /* force scan for those added after the last iteration */ - if (!mm_state->tail || mm_state->tail == mm_state->head) { - mm_state->tail = mm_state->head->next; + if (!mm_state->tail[type] || mm_state->tail[type] == mm_state->head[type]) { + mm_state->tail[type] = mm_state->head[type]->next; walk->force_scan = true; } } while (!(mm = get_next_mm(walk))); @@ -3062,7 +3076,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(mm_state, walk->seq + 1); + reset_bloom_filter(mm_state, walk->seq[type] + 1); if (*iter) mmdrop(*iter); @@ -3072,7 +3086,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, 
struct mm_struct **ite return last; } -static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq, int type) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -3081,12 +3095,12 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); + VM_WARN_ON_ONCE(mm_state->seq[type] + 1 < seq); - if (seq > mm_state->seq) { - mm_state->head = NULL; - mm_state->tail = NULL; - WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + if (seq > mm_state->seq[type]) { + mm_state->head[type] = NULL; + mm_state->tail[type] = NULL; + WRITE_ONCE(mm_state->seq[type], mm_state->seq[type] + 1); success = true; } @@ -3148,7 +3162,7 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) int hist, tier; struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; - unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq[type] + 1; lockdep_assert_held(&lruvec->lru_lock); @@ -3284,7 +3298,7 @@ static void reset_batch_size(struct lru_gen_mm_walk *walk) WRITE_ONCE(lrugen->nr_pages[gen][type][zone], lrugen->nr_pages[gen][type][zone] + delta); - if (lru_gen_is_active(lruvec, gen)) + if (lru_gen_is_active(lruvec, gen, type)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); } @@ -3312,7 +3326,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->swappiness; + return walk->aging_type == LRU_GEN_FILE || !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3321,8 +3335,13 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal if (mapping_unevictable(mapping)) return true; + /* shmem pages are anon for LRU purposes */ if (shmem_mapping(mapping)) - return !walk->swappiness; + return walk->aging_type == LRU_GEN_FILE || !walk->swappiness; + + /* file VMA: skip when aging anon type */ + if (walk->aging_type == LRU_GEN_ANON) + return true; if (walk->swappiness > MAX_SWAPPINESS) return true; @@ -3475,7 +3494,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq[walk->aging_type]); unsigned int nr; pmd_t pmdval; @@ -3563,7 +3582,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq[walk->aging_type]); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3692,7 +3711,7 @@ static void walk_pmd_range(pud_t *pud, 
unsigned long start, unsigned long end, walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq[walk->aging_type], pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3703,7 +3722,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(mm_state, walk->seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->seq[walk->aging_type] + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3772,7 +3791,7 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) err = -EBUSY; /* another thread might have called inc_max_seq() */ - if (walk->seq != max_seq) + if (walk->seq[walk->aging_type] != max_seq[walk->aging_type]) break; /* the caller might be holding the lock for write */ @@ -3890,7 +3909,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) /* find the oldest populated generation */ for_each_evictable_type(type, swappiness) { - while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq[type]) { gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { @@ -3913,16 +3932,6 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) if (!seq_inc_flag) return success; - /* see the comment on lru_gen_folio */ - if (swappiness && swappiness <= MAX_SWAPPINESS) { - unsigned long seq = lrugen->max_seq - MIN_NR_GENS; - - if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) - min_seq[LRU_GEN_ANON] = seq; - else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) - min_seq[LRU_GEN_FILE] = seq; - } - for_each_evictable_type(type, swappiness) { if (min_seq[type] <= lrugen->min_seq[type]) continue; @@ -3935,34 +3944,30 @@ 
static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int type) { bool success; int prev, next; - int type, zone; + int zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: - if (seq < READ_ONCE(lrugen->max_seq)) + if (seq < READ_ONCE(lrugen->max_seq[type])) return false; lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - success = seq == lrugen->max_seq; + success = seq == lrugen->max_seq[type]; if (!success) goto unlock; - for (type = 0; type < ANON_AND_FILE; type++) { - if (get_nr_gens(lruvec, type) != MAX_NR_GENS) - continue; - - if (inc_min_seq(lruvec, type, swappiness)) - continue; - - lruvec_unlock_irq(lruvec); - cond_resched(); - goto restart; + if (get_nr_gens(lruvec, type) == MAX_NR_GENS) { + if (!inc_min_seq(lruvec, type, 1)) { + lruvec_unlock_irq(lruvec); + cond_resched(); + goto restart; + } } /* @@ -3971,29 +3976,26 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do * overlap, cold/hot inversion happens. 
*/ - prev = lru_gen_from_seq(lrugen->max_seq - 1); - next = lru_gen_from_seq(lrugen->max_seq + 1); + prev = lru_gen_from_seq(lrugen->max_seq[type] - 1); + next = lru_gen_from_seq(lrugen->max_seq[type] + 1); - for (type = 0; type < ANON_AND_FILE; type++) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - enum lru_list lru = type * LRU_INACTIVE_FILE; - long delta = lrugen->nr_pages[prev][type][zone] - - lrugen->nr_pages[next][type][zone]; + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; - if (!delta) - continue; + if (!delta) + continue; - __update_lru_size(lruvec, lru, zone, delta); - __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); - } + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); } - for (type = 0; type < ANON_AND_FILE; type++) - reset_ctrl_pos(lruvec, type, false); + reset_ctrl_pos(lruvec, type, false); - WRITE_ONCE(lrugen->timestamps[next], jiffies); + WRITE_ONCE(lrugen->timestamps[type][next], jiffies); /* make sure preceding modifications appear */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + smp_store_release(&lrugen->max_seq[type], lrugen->max_seq[type] + 1); unlock: lruvec_unlock_irq(lruvec); @@ -4001,7 +4003,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - int swappiness, bool force_scan) + int type, int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -4009,13 +4011,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq[type])); if (!mm_state) - return 
inc_max_seq(lruvec, seq, swappiness); + return inc_max_seq(lruvec, seq, type); /* see the comment in iterate_mm_list() */ - if (seq <= READ_ONCE(mm_state->seq)) + if (seq <= READ_ONCE(mm_state->seq[type])) return false; /* @@ -4025,18 +4027,19 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { - success = iterate_mm_list_nowalk(lruvec, seq); + success = iterate_mm_list_nowalk(lruvec, seq, type); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { - success = iterate_mm_list_nowalk(lruvec, seq); + success = iterate_mm_list_nowalk(lruvec, seq, type); goto done; } walk->lruvec = lruvec; - walk->seq = seq; + walk->aging_type = type; + walk->seq[type] = seq; walk->swappiness = swappiness; walk->force_scan = force_scan; @@ -4047,7 +4050,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, swappiness); + success = inc_max_seq(lruvec, seq, type); WARN_ON_ONCE(!success); } @@ -4097,7 +4100,7 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) for_each_evictable_type(type, swappiness) { unsigned long seq; - for (seq = min_seq[type]; seq <= max_seq; seq++) { + for (seq = min_seq[type]; seq <= max_seq[type]; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) @@ -4112,8 +4115,8 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) { - int gen; - unsigned long birth; + int type, gen; + unsigned long birth = jiffies; int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -4124,8 +4127,14 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; 
- gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); - birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + for_each_evictable_type(type, swappiness) { + unsigned long ts; + + gen = lru_gen_from_seq(min_seq[type]); + ts = READ_ONCE(lruvec->lrugen.timestamps[type][gen]); + if (time_before(ts, birth)) + birth = ts; + } return time_is_before_jiffies(birth + min_ttl); } @@ -4197,6 +4206,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec; struct lru_gen_mm_state *mm_state; + int ftype = folio_is_file_lru(folio); unsigned long max_seq; int gen; @@ -4235,7 +4245,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) memcg = get_mem_cgroup_from_folio(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); - max_seq = READ_ONCE((lruvec)->lrugen.max_seq); + max_seq = READ_ONCE((lruvec)->lrugen.max_seq[ftype]); gen = lru_gen_from_seq(max_seq); mm_state = get_mm_state(lruvec); @@ -4443,28 +4453,32 @@ static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - int swappiness = mem_cgroup_swappiness(memcg); DEFINE_MAX_SEQ(lruvec); - bool success = false; + int type; - /* - * We are not iterating the mm_list here, updating mm_state->seq is just - * to make mm walkers work properly. - */ - if (mm_state) { - spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); - if (max_seq > mm_state->seq) { - WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + for (type = 0; type < ANON_AND_FILE; type++) { + bool success = false; + + /* + * We are not iterating the mm_list here, updating mm_state->seq + * is just to make mm walkers work properly. 
+ */ + if (mm_state) { + spin_lock(&mm_list->lock); + VM_WARN_ON_ONCE(mm_state->seq[type] + 1 < max_seq[type]); + if (max_seq[type] > mm_state->seq[type]) { + WRITE_ONCE(mm_state->seq[type], + mm_state->seq[type] + 1); + success = true; + } + spin_unlock(&mm_list->lock); + } else { success = true; } - spin_unlock(&mm_list->lock); - } else { - success = true; - } - if (success) - inc_max_seq(lruvec, max_seq, swappiness); + if (success) + inc_max_seq(lruvec, max_seq[type], type); + } } /* @@ -4524,10 +4538,10 @@ static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec parent_lrugen = &parent_lruvec->lrugen; for (i = 0; i < get_nr_gens(child_lruvec, type); i++) { - int gen = lru_gen_from_seq(child_lrugen->max_seq - i); + int gen = lru_gen_from_seq(child_lrugen->max_seq[type] - i); long nr_pages = child_lrugen->nr_pages[gen][type][zone]; - int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0; - int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0; + int child_lru_active = lru_gen_is_active(child_lruvec, gen, type) ? LRU_ACTIVE : 0; + int parent_lru_active = lru_gen_is_active(parent_lruvec, gen, type) ? 
LRU_ACTIVE : 0; /* Assuming that child pages are colder than parent pages */ list_splice_tail_init(&child_lrugen->folios[gen][type][zone], @@ -4537,7 +4551,7 @@ static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], parent_lrugen->nr_pages[gen][type][zone] + nr_pages); - if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { + if (lru_gen_is_active(child_lruvec, gen, type) != lru_gen_is_active(parent_lruvec, gen, type)) { __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); } @@ -4754,7 +4768,10 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. */ - return isolated || !remaining ? scanned : 0; + if (isolated || !remaining) + return scanned; + + return 0; } static int get_tier_idx(struct lruvec *lruvec, int type) @@ -4843,8 +4860,20 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, scanned += try_to_inc_min_seq(lruvec, swappiness); - if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) - scanned = 0; + { + bool can_evict = false; + int type_i; + + for_each_evictable_type(type_i, swappiness) { + if (lrugen->min_seq[type_i] + MIN_NR_GENS <= + lrugen->max_seq[type_i]) { + can_evict = true; + break; + } + } + if (!can_evict) + scanned = 0; + } lruvec_unlock_irq(lruvec); @@ -4906,23 +4935,26 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, return scanned; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq[], int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long size = 0; + bool need_aging = false; struct 
lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); *nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ - if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) - return true; + for_each_evictable_type(type, swappiness) { + if (min_seq[type] + MIN_NR_GENS > max_seq[type]) + return true; + } for_each_evictable_type(type, swappiness) { unsigned long seq; - for (seq = min_seq[type]; seq <= max_seq; seq++) { + for (seq = min_seq[type]; seq <= max_seq[type]; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) @@ -4932,7 +4964,11 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, *nr_to_scan = size; /* better to run aging even though eviction is still possible */ - return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; + for_each_evictable_type(type, swappiness) { + if (min_seq[type] + MIN_NR_GENS == max_seq[type]) + need_aging = true; + } + return need_aging; } /* @@ -4946,6 +4982,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; @@ -4963,7 +5000,27 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; + { + bool aged = false; + int type; + + for_each_evictable_type(type, swappiness) { + /* + * Per-type independent aging: only age the type that + * actually needs it. Aging a type that already has + * enough evictable generations would forcibly drain + * its oldest gen via inc_min_seq() and disrupt the + * eviction progress, especially when the other type + * is stuck (e.g., file cache protected by le9uo). 
+ */ + if (min_seq[type] + MIN_NR_GENS < max_seq[type]) + continue; + + if (try_to_inc_max_seq(lruvec, max_seq[type], type, swappiness, false)) + aged = true; + } + return aged ? -1 : 0; + } } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5500,7 +5557,7 @@ static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) } static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, - unsigned long max_seq, unsigned long *min_seq, + unsigned long max_seq[], unsigned long *min_seq, unsigned long seq) { int i; @@ -5515,7 +5572,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, const char *s = "xxx"; unsigned long n[3] = {}; - if (seq == max_seq) { + if (seq == max_seq[type]) { s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); @@ -5540,10 +5597,12 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, const char *s = "xxxx"; unsigned long n = 0; - if (seq == max_seq && NR_HIST_GENS == 1) { + if (seq == max(max_seq[LRU_GEN_ANON], max_seq[LRU_GEN_FILE]) && + NR_HIST_GENS == 1) { s = "TYFA"; n = READ_ONCE(mm_state->stats[hist][i]); - } else if (seq != max_seq && NR_HIST_GENS > 1) { + } else if (seq != max(max_seq[LRU_GEN_ANON], max_seq[LRU_GEN_FILE]) && + NR_HIST_GENS > 1) { s = "tyfa"; n = READ_ONCE(mm_state->stats[hist][i]); } @@ -5577,34 +5636,58 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); - if (!full) - seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); - else if (max_seq >= MAX_NR_GENS) - seq = max_seq - MAX_NR_GENS + 1; - else - seq = 0; + { + unsigned long max_seq_val = max(max_seq[LRU_GEN_ANON], max_seq[LRU_GEN_FILE]); - for (; seq <= max_seq; seq++) { - int type, zone; - int gen = lru_gen_from_seq(seq); - unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + if (!full) + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 
2); + else if (max_seq_val >= MAX_NR_GENS) + seq = max_seq_val - MAX_NR_GENS + 1; + else + seq = 0; - seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + for (; seq <= max_seq_val; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); - for (type = 0; type < ANON_AND_FILE; type++) { - unsigned long size = 0; - char mark = full && seq < min_seq[type] ? 'x' : ' '; + seq_printf(m, " %10lu", seq); - for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + /* show the older birth time among the types whose window covers seq */ + { + unsigned long birth = jiffies; + int t; - seq_printf(m, " %10lu%c", size, mark); - } + for (t = 0; t < ANON_AND_FILE; t++) { + unsigned long ts = READ_ONCE(lruvec->lrugen.timestamps[t][gen]); - seq_putc(m, '\n'); + if (seq >= min_seq[t] && seq <= max_seq[t] && time_before(ts, birth)) + birth = ts; + } + seq_printf(m, " %10u", jiffies_to_msecs(jiffies - birth)); + } + /* gen slots wrap every MAX_NR_GENS seqs: count a type only within its window */ + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = ' '; - if (full) - lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + if (full) { + if (seq < min_seq[type]) + mark = 'x'; + else if (seq > max_seq[type]) + mark = '-'; + } + if (seq >= min_seq[type] && seq <= max_seq[type]) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } } return 0; @@ -5620,21 +5703,36 @@ static const struct seq_operations lru_gen_seq_ops = { static int run_aging(struct lruvec *lruvec, unsigned long seq, int swappiness, bool force_scan) { + int type; + bool success = false; DEFINE_MAX_SEQ(lruvec); - if (seq > max_seq) - return -EINVAL; + for_each_evictable_type(type, swappiness) { + if (seq > max_seq[type]) + return -EINVAL; + } + + for_each_evictable_type(type, swappiness) { + if (seq == max_seq[type]) { + if (try_to_inc_max_seq(lruvec, max_seq[type], type, + swappiness, force_scan)) + 
success = true; + } + } - return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; + return success ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { + int type; DEFINE_MAX_SEQ(lruvec); - if (seq + MIN_NR_GENS > max_seq) - return -EINVAL; + for_each_evictable_type(type, swappiness) { + if (seq + MIN_NR_GENS > max_seq[type]) + return -EINVAL; + } sc->nr_reclaimed = 0; @@ -5826,17 +5924,20 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - lrugen->max_seq = MIN_NR_GENS + 1; + for (type = 0; type < ANON_AND_FILE; type++) { + lrugen->max_seq[type] = MIN_NR_GENS + 1; + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[type][i] = jiffies; + } lrugen->enabled = lru_gen_enabled(); - for (i = 0; i <= MIN_NR_GENS + 1; i++) - lrugen->timestamps[i] = jiffies; - for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - if (mm_state) - mm_state->seq = MIN_NR_GENS; + if (mm_state) { + mm_state->seq[LRU_GEN_ANON] = MIN_NR_GENS; + mm_state->seq[LRU_GEN_FILE] = MIN_NR_GENS; + } } #ifdef CONFIG_MEMCG @@ -5886,6 +5987,9 @@ static int __init init_lru_gen(void) BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + printk(KERN_INFO "%s %s by %s\n", + RESWAPPINESS_PROGNAME, RESWAPPINESS_VERSION, RESWAPPINESS_AUTHOR); + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); diff --git a/mm/workingset.c b/mm/workingset.c index 07e6836d05..51cfbd95dc 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -283,7 +283,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - max_seq = 
READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq[file]); max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; -- 2.34.1