@@ -83,6 +83,9 @@ enum ttu_flags {
};
#ifdef CONFIG_MMU
+unsigned long discard_vrange_page_list(struct zone *zone,
+ struct list_head *page_list);
+
unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
static inline void get_anon_vma(struct anon_vma *anon_vma)
@@ -33,6 +33,13 @@ static inline int vrange_type(struct vrange *vrange)
return vrange->owner->type;
}
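+
+/*
+ * Return the mm_struct that owns an anonymous (VRANGE_MM) vrange, or
+ * NULL if the range is not mm-backed.
+ */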
+static inline struct mm_struct *vrange_get_owner_mm(struct vrange *vrange)
+{
+ if (vrange_type(vrange) != VRANGE_MM)
+ return NULL;
+ return container_of(vrange->owner, struct mm_struct, vroot);
+}
+
void vrange_init(void);
extern void vrange_root_cleanup(struct vrange_root *vroot);
extern int vrange_root_duplicate(struct vrange_root *orig,
@@ -16,6 +16,7 @@ struct vrange {
struct vrange_root *owner;
bool purged;
struct list_head lru; /* protected by lru_lock */
+ atomic_t refcount; /* released via put_vrange() */
};
#endif
@@ -683,7 +683,7 @@ static enum page_references page_check_references(struct page *page,
/*
* shrink_page_list() returns the number of reclaimed pages
*/
-static unsigned long shrink_page_list(struct list_head *page_list,
+unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
struct scan_control *sc,
enum ttu_flags ttu_flags,
@@ -985,6 +985,35 @@ keep:
return nr_reclaimed;
}
+
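+/*
+ * Reclaim a list of already-isolated anonymous pages that belong to
+ * volatile ranges.  Each page is deactivated and handed to
+ * shrink_page_list() with sc.may_discard set.  Returns the number of
+ * pages reclaimed; @zone's NR_ISOLATED_ANON counter is decreased by
+ * that amount.
+ */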
+unsigned long discard_vrange_page_list(struct zone *zone,
+ struct list_head *page_list)
+{
+ unsigned long ret;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .may_discard = 1
+ };
+
+ unsigned long dummy1, dummy2;
+ struct page *page;
+
+ list_for_each_entry(page, page_list, lru) {
+ VM_BUG_ON(!PageAnon(page));
+ ClearPageActive(page);
+ }
+
+ /* page_list may have pages from multiple zones, so pass zone as NULL */
+ ret = shrink_page_list(page_list, NULL, &sc,
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, false);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -ret);
+ return ret;
+}
+
unsigned long reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list)
{
@@ -2781,6 +2810,16 @@ loop_again:
if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
!zone_balanced(zone, testorder,
balance_gap, end_zone)) {
+
+ unsigned int nr_discard;
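+ /*
+ * For order-0 rebalancing, first try to discard volatile-range
+ * pages; if that alone is enough to balance the zone, skip the
+ * regular shrink_zone() pass.
+ */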
+ if (testorder == 0) {
+ nr_discard = discard_vrange_pages(zone,
+ SWAP_CLUSTER_MAX);
+ sc.nr_reclaimed += nr_discard;
+ if (zone_balanced(zone, testorder, 0,
+ end_zone))
+ goto zone_balanced;
+ }
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2805,7 +2844,8 @@ loop_again:
continue;
}
- if (zone_balanced(zone, testorder, 0, end_zone))
+ if (zone_balanced(zone, testorder, 0, end_zone)) {
+zone_balanced:
/*
* If a zone reaches its high watermark,
* consider it to be no longer congested. It's
@@ -2814,6 +2854,7 @@ loop_again:
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
+ }
}
/*
@@ -14,13 +14,30 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
+
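+/*
+ * Per-walk state for collecting discardable pages: the target zone,
+ * the VMA currently being walked and the list that isolated pages
+ * are gathered on.
+ */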
+struct vrange_walker_private {
+ struct zone *zone;
+ struct vm_area_struct *vma;
+ struct list_head *pagelist;
+};
static LIST_HEAD(lru_vrange);
static DEFINE_SPINLOCK(lru_lock);
static struct kmem_cache *vrange_cachep;
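+
+/*
+ * Slab constructor: objects leave the cache with ->lru already
+ * initialised as an empty list head.
+ */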
+static void vrange_ctor(void *data)
+{
+ struct vrange *vrange = data;
+ INIT_LIST_HEAD(&vrange->lru);
+}
+
+void __init vrange_init(void)
+{
+ vrange_cachep = kmem_cache_create("vrange", sizeof(struct vrange),
+ 0, SLAB_PANIC, vrange_ctor);
+}
void lru_add_vrange(struct vrange *vrange)
{
@@ -62,17 +79,13 @@ void lru_move_vrange_to_head(struct mm_struct *mm, unsigned long address)
vrange_unlock(vroot);
}
-void __init vrange_init(void)
-{
- vrange_cachep = KMEM_CACHE(vrange, SLAB_PANIC);
-}
-
static struct vrange *__vrange_alloc(void)
{
struct vrange *vrange = kmem_cache_alloc(vrange_cachep, GFP_KERNEL);
if (!vrange)
return vrange;
vrange->owner = NULL;
+ atomic_set(&vrange->refcount, 1);
INIT_LIST_HEAD(&vrange->lru);
return vrange;
}
@@ -84,6 +97,13 @@ static void __vrange_free(struct vrange *range)
kmem_cache_free(vrange_cachep, range);
}
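+/* Drop a reference, freeing the vrange when the last one goes away. */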
+static void put_vrange(struct vrange *range)
+{
+ WARN_ON(atomic_read(&range->refcount) < 0);
+ if (atomic_dec_and_test(&range->refcount))
+ __vrange_free(range);
+}
+
static void __vrange_add(struct vrange *range, struct vrange_root *vroot)
{
range->owner = vroot;
@@ -136,7 +156,7 @@ static int vrange_add(struct vrange_root *vroot,
range = container_of(node, struct vrange, node);
if (node->start < start && node->last > end) {
- __vrange_free(new_range);
+ put_vrange(new_range);
goto out;
}
@@ -145,7 +165,7 @@ static int vrange_add(struct vrange_root *vroot,
purged |= range->purged;
__vrange_remove(range);
- __vrange_free(range);
+ put_vrange(range);
node = next;
}
@@ -182,7 +202,7 @@ static int vrange_remove(struct vrange_root *vroot,
if (start <= node->start && end >= node->last) {
__vrange_remove(range);
- __vrange_free(range);
+ put_vrange(range);
} else if (node->start >= start) {
__vrange_resize(range, end, node->last);
} else if (node->last <= end) {
@@ -200,7 +220,7 @@ static int vrange_remove(struct vrange_root *vroot,
vrange_unlock(vroot);
if (!used_new)
- __vrange_free(new_range);
+ put_vrange(new_range);
return 0;
}
@@ -255,7 +275,7 @@ again:
vrange_unlock(new);
vrange_unlock(old);
- __vrange_free(alloc_range);
+ put_vrange(alloc_range);
return ret;
}
@@ -271,7 +291,7 @@ void vrange_root_cleanup(struct vrange_root *vroot)
range = vrange_entry(next);
next = rb_next(next);
__vrange_remove(range);
- __vrange_free(range);
+ put_vrange(range);
}
vrange_unlock(vroot);
}
@@ -659,6 +679,7 @@ int discard_vpage(struct page *page)
if (page_freeze_refs(page, 1)) {
unlock_page(page);
+ dec_zone_page_state(page, NR_ISOLATED_ANON);
return 1;
}
}
@@ -684,3 +705,210 @@ bool is_purged_vrange(struct mm_struct *mm, unsigned long address)
vrange_unlock(vroot);
return ret;
}
+
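+/*
+ * Per-PTE step of the volatile-range walk: pick out present, anonymous,
+ * non-compound LRU pages that belong to the target zone or a lower one,
+ * isolate them from the LRU and add them to the private page list.
+ */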
+static void vrange_pte_entry(pte_t pteval, unsigned long address,
+ unsigned ptent_size, struct mm_walk *walk)
+{
+ struct page *page;
+ struct vrange_walker_private *vwp = walk->private;
+ struct vm_area_struct *vma = vwp->vma;
+ struct list_head *pagelist = vwp->pagelist;
+ struct zone *zone = vwp->zone;
+
+ if (pte_none(pteval))
+ return;
+
+ if (!pte_present(pteval))
+ return;
+
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page))
+ return;
+
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ return;
+
+ /* TODO : Support THP and HugeTLB */
+ if (unlikely(PageCompound(page)))
+ return;
+
+ if (zone_idx(page_zone(page)) > zone_idx(zone))
+ return;
+
+ if (isolate_lru_page(page))
+ return;
+
+ list_add(&page->lru, pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON);
+}
+
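+/*
+ * pmd_entry callback for walk_page_range(): map the PTE page and run
+ * vrange_pte_entry() on every PTE in [addr, end).
+ */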
+static int vrange_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ vrange_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
+}
+
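+/*
+ * Walk [start, end) of @vma, isolate discardable anonymous pages,
+ * reclaim them with discard_vrange_page_list() and put whatever could
+ * not be reclaimed back on the LRU.  Returns the number of pages
+ * discarded.
+ */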
+unsigned int discard_vma_pages(struct zone *zone, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned int nr_to_discard)
+{
+ LIST_HEAD(pagelist);
+ unsigned int ret = 0;
+ struct vrange_walker_private vwp;
+ struct mm_walk vrange_walk = {
+ .pmd_entry = vrange_pte_range,
+ .mm = vma->vm_mm,
+ .private = &vwp,
+ };
+
+ vwp.pagelist = &pagelist;
+ vwp.vma = vma;
+ vwp.zone = zone;
+
+ walk_page_range(start, end, &vrange_walk);
+
+ if (!list_empty(&pagelist))
+ ret = discard_vrange_page_list(zone, &pagelist);
+
+ putback_lru_pages(&pagelist);
+ return ret;
+}
+
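+/*
+ * Discard pages covered by @vrange.  The owning mm's mmap_sem is taken
+ * with a trylock so we never block here; file-backed, special and
+ * mlocked VMAs are skipped.  Returns the number of pages discarded.
+ */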
+unsigned int discard_vrange(struct zone *zone, struct vrange *vrange,
+ int nr_to_discard)
+{
+ struct mm_struct *mm;
+ unsigned long start = vrange->node.start;
+ unsigned long end = vrange->node.last;
+ struct vm_area_struct *vma;
+ unsigned int nr_discarded = 0;
+
+ mm = vrange_get_owner_mm(vrange);
+
+ if (!mm)
+ goto out;
+
+ if (!down_read_trylock(&mm->mmap_sem))
+ goto out;
+
+ vma = find_vma(mm, start);
+ if (!vma || (vma->vm_start > end))
+ goto out_unlock;
+
+ for (; vma; vma = vma->vm_next) {
+ if (vma->vm_start > end)
+ break;
+
+ if (vma->vm_file ||
+ (vma->vm_flags & (VM_SPECIAL | VM_LOCKED)))
+ continue;
+
+ cond_resched();
+ nr_discarded +=
+ discard_vma_pages(zone, mm, vma,
+ max_t(unsigned long, start, vma->vm_start),
+ min_t(unsigned long, end + 1, vma->vm_end),
+ nr_to_discard);
+ }
+out_unlock:
+ up_read(&mm->mmap_sem);
+out:
+ return nr_discarded;
+}
+
+/*
+ * Take the next victim vrange from the LRU, holding a reference on the
+ * vrange and on its owning mm.
+ */
+struct vrange *get_victim_vrange(void)
+{
+ struct mm_struct *mm;
+ struct vrange *vrange = NULL;
+ struct list_head *cur, *tmp;
+
+ spin_lock(&lru_lock);
+ list_for_each_prev_safe(cur, tmp, &lru_vrange) {
+ vrange = list_entry(cur, struct vrange, lru);
+ mm = vrange_get_owner_mm(vrange);
+ /* the process is exiting, so skip it */
+ if (atomic_read(&mm->mm_users) == 0) {
+ list_del_init(&vrange->lru);
+ vrange = NULL;
+ continue;
+ }
+
+ /* the vrange is being freed, so skip it */
+ if (!atomic_inc_not_zero(&vrange->refcount)) {
+ list_del_init(&vrange->lru);
+ vrange = NULL;
+ continue;
+ }
+
+ /*
+ * Later code needs mm->mmap_sem, so take a reference on the mm.
+ * NOTE: mm_count is guaranteed to be non-zero here because finding
+ * the vrange on the LRU list means we run before exit_vrange or
+ * remove_vrange.
+ */
+ atomic_inc(&mm->mm_count);
+
+ /* Isolate vrange */
+ list_del_init(&vrange->lru);
+ break;
+ }
+
+ spin_unlock(&lru_lock);
+ return vrange;
+}
+
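+/* Release the vrange and mm references taken by get_victim_vrange(). */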
+void put_victim_range(struct vrange *vrange)
+{
+ struct mm_struct *mm = vrange_get_owner_mm(vrange);
+ put_vrange(vrange);
+ mmdrop(mm);
+}
+
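+/*
+ * Round-robin over the vrange LRU: discard pages from each victim range
+ * and put it back on the LRU, until nr_to_discard pages have been
+ * discarded or we come back around to the first victim.  The first
+ * victim is pinned with extra references so the termination check stays
+ * safe.
+ */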
+unsigned int discard_vrange_pages(struct zone *zone, int nr_to_discard)
+{
+ struct vrange *vrange, *start_vrange;
+ unsigned int nr_discarded = 0;
+
+ start_vrange = vrange = get_victim_vrange();
+ if (start_vrange) {
+ struct mm_struct *mm = vrange_get_owner_mm(start_vrange);
+ atomic_inc(&start_vrange->refcount);
+ atomic_inc(&mm->mm_count);
+ }
+
+ while (vrange) {
+ nr_discarded += discard_vrange(zone, vrange, nr_to_discard);
+ lru_add_vrange(vrange);
+ put_victim_range(vrange);
+
+ if (nr_discarded >= nr_to_discard)
+ break;
+
+ vrange = get_victim_vrange();
+ /* break once we have gone all the way around the LRU */
+ if (vrange == start_vrange) {
+ lru_add_vrange(vrange);
+ put_victim_range(vrange);
+ break;
+ }
+ }
+
+ if (start_vrange)
+ put_victim_range(start_vrange);
+
+ return nr_discarded;
+}