immortalwrt-mt798x/target/linux/generic/backport-5.4/020-23-BACKPORT-FROMLIST-mm-multigenerational-lru-eviction.patch

663 lines
19 KiB
Diff

From ebd2497c0f0383fd24e536c5de0f73cbe939c5af Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 04:38:24 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: eviction
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It first tries to select a type based on the values of min_seq[2].
When anon and file types are both available from the same generation,
it selects the one that has a lower refault rate.
During a scan, the eviction sorts pages according to their new
generation numbers, if the aging has found them referenced. It also
moves pages from the tiers that have higher refault rates than tier 0
to the next generation. When it finds all the per-zone lists of a
selected type are empty, the eviction increments min_seq[2] indexed by
this selected type.
With the aging and the eviction in place, we can build page reclaim in
a straightforward manner:
1) In order to reduce the latency, direct reclaim only invokes the
aging when both min_seq[2] reaches max_seq-1; otherwise it invokes
the eviction.
2) In order to avoid the aging in the direct reclaim path, kswapd
does the background aging. It invokes the aging when either of
min_seq[2] reaches max_seq-1; otherwise it invokes the eviction.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
(am from https://lore.kernel.org/patchwork/patch/1432186/)
BUG=b:123039911
TEST=Built
Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987190
Reviewed-by: Yu Zhao <yuzhao@chromium.org>
Tested-by: Yu Zhao <yuzhao@chromium.org>
Commit-Queue: Sonny Rao <sonnyrao@chromium.org>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
include/linux/mmzone.h | 5 +
mm/vmscan.c | 531 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 536 insertions(+)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -811,6 +811,8 @@ struct deferred_split {
};
#endif
+struct mm_walk_args;
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
@@ -900,6 +902,9 @@ typedef struct pglist_data {
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ struct mm_walk_args *mm_walk_args;
+#endif
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1169,6 +1169,11 @@ static unsigned long shrink_page_list(st
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
+ /* in case the page was found accessed by lru_gen_scan_around() */
+ if (lru_gen_enabled() && !ignore_references &&
+ page_mapped(page) && PageReferenced(page))
+ goto keep_locked;
+
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -3818,6 +3823,482 @@ out:
}
/******************************************************************************
+ * the eviction
+ ******************************************************************************/
+
+static bool should_skip_page(struct page *page, struct scan_control *sc)
+{
+ if (!sc->may_unmap && page_mapped(page))
+ return true;
+
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
+ return true;
+
+ if (!get_page_unless_zero(page))
+ return true;
+
+ return false;
+}
+
+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_to_isolate)
+{
+ bool success;
+ int gen = page_lru_gen(page);
+ int type = page_is_file_cache(page);
+ int zone = page_zonenum(page);
+ int tier = lru_tier_from_usage(page_tier_usage(page));
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ VM_BUG_ON_PAGE(gen == -1, page);
+ VM_BUG_ON_PAGE(tier_to_isolate < 0, page);
+
+ /* a lazy-free page that has been written into? */
+ if (type && PageDirty(page) && PageAnon(page)) {
+ success = lru_gen_deletion(page, lruvec);
+ VM_BUG_ON_PAGE(!success, page);
+ SetPageSwapBacked(page);
+ add_page_to_lru_list_tail(page, lruvec);
+ return true;
+ }
+
+ /* page_update_gen() has updated the gen #? */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ /* activate this page if its tier has a higher refault rate */
+ if (tier_to_isolate < tier) {
+ int hist = hist_from_seq_or_gen(gen);
+
+ page_inc_gen(page, lruvec, false);
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1],
+ lrugen->activated[hist][type][tier - 1] + hpage_nr_pages(page));
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ return true;
+ }
+
+ /* mark this page for reclaim if it's pending writeback */
+ if (PageWriteback(page) || (type && PageDirty(page))) {
+ page_inc_gen(page, lruvec, true);
+ return true;
+ }
+
+ return false;
+}
+
+static void isolate_page(struct page *page, struct lruvec *lruvec)
+{
+ bool success;
+
+ success = lru_gen_deletion(page, lruvec);
+ VM_BUG_ON_PAGE(!success, page);
+
+ ClearPageLRU(page);
+
+ if (PageActive(page)) {
+ ClearPageActive(page);
+ /* make sure shrink_page_list() rejects this page */
+ SetPageReferenced(page);
+ return;
+ }
+
+ /* make sure shrink_page_list() doesn't try to write this page */
+ ClearPageReclaim(page);
+ /* make sure shrink_page_list() doesn't reject this page */
+ ClearPageReferenced(page);
+}
+
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, long *nr_to_scan,
+ int type, int tier, struct list_head *list)
+{
+ bool success;
+ int gen, zone;
+ enum vm_event_item item;
+ int sorted = 0;
+ int scanned = 0;
+ int isolated = 0;
+ int batch_size = 0;
+ struct lrugen *lrugen = &lruvec->evictable;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ VM_BUG_ON(!list_empty(list));
+
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+ return -ENOENT;
+
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ LIST_HEAD(moved);
+ int skipped = 0;
+ struct list_head *head = &lrugen->lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct page *page = lru_to_page(head);
+ int delta = hpage_nr_pages(page);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_is_file_cache(page) != type, page);
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ scanned += delta;
+
+ if (sort_page(page, lruvec, tier))
+ sorted += delta;
+ else if (should_skip_page(page, sc)) {
+ list_move(&page->lru, &moved);
+ skipped += delta;
+ } else {
+ isolate_page(page, lruvec);
+ list_add(&page->lru, list);
+ isolated += delta;
+ }
+
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+ ++batch_size == MAX_BATCH_SIZE)
+ break;
+ }
+
+ list_splice(&moved, head);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+ batch_size == MAX_BATCH_SIZE)
+ break;
+ }
+
+ success = try_inc_min_seq(lruvec, type);
+
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc)) {
+ __count_vm_events(item, isolated);
+ __count_vm_events(PGREFILL, sorted);
+ }
+ __count_memcg_events(memcg, item, isolated);
+ __count_memcg_events(memcg, PGREFILL, sorted);
+
+ *nr_to_scan -= scanned;
+
+ if (*nr_to_scan <= 0 || success || isolated)
+ return isolated;
+ /*
+ * We may have trouble finding eligible pages due to reclaim_idx,
+ * may_unmap and may_writepage. The following check makes sure we won't
+ * be stuck if we aren't making enough progress.
+ */
+ return batch_size == MAX_BATCH_SIZE && sorted >= SWAP_CLUSTER_MAX ? 0 : -ENOENT;
+}
+
+static int get_tier_to_isolate(struct lruvec *lruvec, int type)
+{
+ int tier;
+ struct controller_pos sp, pv;
+
+ /*
+ * Ideally we don't want to evict upper tiers that have higher refault
+ * rates. However, we need to leave a margin for the fluctuations in
+ * refault rates. So we use a larger gain factor to make sure upper
+ * tiers are indeed more active. We choose 2 because the lowest upper
+ * tier would have twice of the refault rate of the base tier, according
+ * to their numbers of accesses.
+ */
+ read_controller_pos(&sp, lruvec, type, 0, 1);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_controller_pos(&pv, lruvec, type, tier, 2);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_to_isolate)
+{
+ int type, tier;
+ struct controller_pos sp, pv;
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+ /*
+ * Compare the refault rates between the base tiers of anon and file to
+ * determine which type to evict. Also need to compare the refault rates
+ * of the upper tiers of the selected type with that of the base tier of
+ * the other type to determine which tier of the selected type to evict.
+ */
+ read_controller_pos(&sp, lruvec, 0, 0, gain[0]);
+ read_controller_pos(&pv, lruvec, 1, 0, gain[1]);
+ type = positive_ctrl_err(&sp, &pv);
+
+ read_controller_pos(&sp, lruvec, !type, 0, gain[!type]);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_controller_pos(&pv, lruvec, type, tier, gain[type]);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ *tier_to_isolate = tier - 1;
+
+ return type;
+}
+
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ long *nr_to_scan, int *type_to_scan, struct list_head *list)
+{
+ int i;
+ int type;
+ int isolated;
+ int tier = -1;
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+
+ if (get_hi_wmark(max_seq, min_seq, swappiness) == MIN_NR_GENS)
+ return 0;
+ /*
+ * Try to select a type based on generations and swappiness, and if that
+ * fails, fall back to get_type_to_scan(). When anon and file are both
+ * available from the same generation, swappiness 200 is interpreted as
+ * anon first and swappiness 1 is interpreted as file first.
+ */
+ type = !swappiness || min_seq[0] > min_seq[1] ||
+ (min_seq[0] == min_seq[1] && swappiness != 200 &&
+ (swappiness == 1 || get_type_to_scan(lruvec, swappiness, &tier)));
+
+ if (tier == -1)
+ tier = get_tier_to_isolate(lruvec, type);
+
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ isolated = scan_pages(lruvec, sc, nr_to_scan, type, tier, list);
+ if (isolated >= 0)
+ break;
+
+ type = !type;
+ tier = get_tier_to_isolate(lruvec, type);
+ }
+
+ if (isolated < 0)
+ isolated = *nr_to_scan = 0;
+
+ *type_to_scan = type;
+
+ return isolated;
+}
+
+/* Main function used by the foreground, the background and the user-triggered eviction. */
+static bool evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ long *nr_to_scan)
+{
+ int type;
+ int isolated;
+ int reclaimed;
+ LIST_HEAD(list);
+ struct page *page;
+ enum vm_event_item item;
+ struct reclaim_stat stat;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irq(&pgdat->lru_lock);
+
+ isolated = isolate_pages(lruvec, sc, swappiness, nr_to_scan, &type, &list);
+ VM_BUG_ON(list_empty(&list) == !!isolated);
+
+ if (isolated)
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, isolated);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ if (!isolated)
+ goto done;
+
+ reclaimed = shrink_page_list(&list, pgdat, sc, 0, &stat, false);
+ /*
+ * We need to prevent rejected pages from being added back to the same
+ * lists they were isolated from. Otherwise we may risk looping on them
+ * forever. We use PageActive() or !PageReferenced() && PageWorkingset()
+ * to tell lru_gen_addition() not to add them to the oldest generation.
+ */
+ list_for_each_entry(page, &list, lru) {
+ if (PageMlocked(page))
+ continue;
+
+ if (page_mapped(page) && PageReferenced(page))
+ SetPageActive(page);
+ else if (!PageActive(page))
+ SetPageWorkingset(page);
+ ClearPageReferenced(page);
+ }
+
+ spin_lock_irq(&pgdat->lru_lock);
+
+ move_pages_to_lru(lruvec, &list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, -isolated);
+
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, reclaimed);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ mem_cgroup_uncharge_list(&list);
+ free_unref_page_list(&list);
+
+ sc->nr_reclaimed += reclaimed;
+done:
+ return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
+}
+
+/******************************************************************************
+ * page reclaim
+ ******************************************************************************/
+
+static int get_swappiness(struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+ mem_cgroup_swappiness(memcg) : 0;
+
+ VM_BUG_ON(swappiness > 200U);
+
+ return swappiness;
+}
+
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+{
+ int gen, type, zone;
+ long nr_to_scan = 0;
+ struct lrugen *lrugen = &lruvec->evictable;
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ lru_add_drain();
+
+ for (type = !swappiness; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone <= sc->reclaim_idx; zone++)
+ nr_to_scan += READ_ONCE(lrugen->sizes[gen][type][zone]);
+ }
+ }
+
+ nr_to_scan = max(nr_to_scan, 0L);
+ nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
+
+ if (get_hi_wmark(max_seq, min_seq, swappiness) > MIN_NR_GENS)
+ return nr_to_scan;
+
+ /* kswapd uses lru_gen_age_node() */
+ if (current_is_kswapd())
+ return 0;
+
+ return walk_mm_list(lruvec, max_seq, sc, swappiness, NULL) ? nr_to_scan : 0;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ long scanned = 0;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ enum mem_cgroup_protection prot = mem_cgroup_protected(sc->target_mem_cgroup, memcg);
+
+ blk_start_plug(&plug);
+
+ while (true) {
+ long nr_to_scan;
+ int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned;
+ if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
+ break;
+
+ scanned += nr_to_scan;
+
+ if (!evict_pages(lruvec, sc, swappiness, &nr_to_scan))
+ break;
+
+ scanned -= nr_to_scan;
+
+ if (prot == MEMCG_PROT_MIN || (prot == MEMCG_PROT_LOW && !sc->memcg_low_reclaim))
+ break;
+
+ cond_resched();
+ }
+
+ blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ * the background aging
+ ******************************************************************************/
+
+static int lru_gen_spread = MIN_NR_GENS;
+
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+ int gen, type, zone;
+ long old_and_young[2] = {};
+ int spread = READ_ONCE(lru_gen_spread);
+ int swappiness = get_swappiness(lruvec);
+ struct lrugen *lrugen = &lruvec->evictable;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ lru_add_drain();
+
+ for (type = !swappiness; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ old_and_young[seq == max_seq] +=
+ READ_ONCE(lrugen->sizes[gen][type][zone]);
+ }
+ }
+
+ old_and_young[0] = max(old_and_young[0], 0L);
+ old_and_young[1] = max(old_and_young[1], 0L);
+
+ /* try to spread pages out across spread+1 generations */
+ if (old_and_young[0] >= old_and_young[1] * spread &&
+ get_lo_wmark(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+ return;
+
+ walk_mm_list(lruvec, max_seq, sc, swappiness, pgdat->mm_walk_args);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON(!current_is_kswapd());
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ enum mem_cgroup_protection prot = mem_cgroup_protected(sc->target_mem_cgroup,
+ memcg);
+
+ if (prot != MEMCG_PROT_MIN && (prot != MEMCG_PROT_LOW || sc->memcg_low_reclaim))
+ try_walk_mm_list(lruvec, sc);
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
+/******************************************************************************
* state change
******************************************************************************/
@@ -4020,6 +4501,21 @@ static int __meminit __maybe_unused lru_
return NOTIFY_DONE;
}
+static void lru_gen_start_kswapd(int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
+ WARN_ON_ONCE(!pgdat->mm_walk_args);
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+ kvfree(pgdat->mm_walk_args);
+}
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -4068,6 +4564,24 @@ static int __init init_lru_gen(void)
*/
arch_initcall(init_lru_gen);
+#else /* CONFIG_LRU_GEN */
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_start_kswapd(int nid)
+{
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
/*
@@ -4086,6 +4600,12 @@ static void shrink_node_memcg(struct pgl
struct blk_plug plug;
bool scan_adjusted;
+ if (lru_gen_enabled()) {
+ *lru_pages = 0;
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
+
get_scan_count(lruvec, memcg, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
@@ -4560,6 +5080,9 @@ static void snapshot_refaults(struct mem
{
struct mem_cgroup *memcg;
+ if (lru_gen_enabled())
+ return;
+
memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
do {
unsigned long refaults;
@@ -4922,6 +5445,11 @@ static void age_active_anon(struct pglis
{
struct mem_cgroup *memcg;
+ if (lru_gen_enabled()) {
+ lru_gen_age_node(pgdat, sc);
+ return;
+ }
+
if (!total_swap_pages)
return;
@@ -5615,6 +6143,8 @@ int kswapd_run(int nid)
if (pgdat->kswapd)
return 0;
+ lru_gen_start_kswapd(nid);
+
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
@@ -5637,6 +6167,7 @@ void kswapd_stop(int nid)
if (kswapd) {
kthread_stop(kswapd);
NODE_DATA(nid)->kswapd = NULL;
+ lru_gen_stop_kswapd(nid);
}
}