different from FADV_DONTNEED since the pages are not immediately
discarded; they are only discarded under pressure.
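For example, a userspace cache could mark a region of a tmpfs file
volatile while the data is idle and pin it again before reuse. The
sketch below is illustrative only: the cache_pin()/cache_unpin()
helpers are made up for this example, it assumes a 64-bit build where
the raw fadvise64 syscall takes (fd, offset, len, advice), and the
glibc posix_fadvise() wrapper may not pass the kernel's return value
back to the caller:

  #include <sys/syscall.h>
  #include <unistd.h>

  #define POSIX_FADV_VOLATILE     8   /* _can_ toss, but don't toss now */
  #define POSIX_FADV_NONVOLATILE  9   /* Remove VOLATILE flag */

  /* Illustrative helper: allow the range to be purged under pressure. */
  static long cache_unpin(int fd, off_t offset, off_t len)
  {
          return syscall(SYS_fadvise64, fd, offset, len,
                         POSIX_FADV_VOLATILE);
  }

  /*
   * Illustrative helper: re-pin the range before touching it again;
   * a return of 1 means some pages were purged while volatile and the
   * contents must be regenerated.
   */
  static long cache_pin(int fd, off_t offset, off_t len)
  {
          return syscall(SYS_fadvise64, fd, offset, len,
                         POSIX_FADV_NONVOLATILE);
  }
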
This is very much influenced by the Android Ashmem interface by
Robert Love, so credit to him and the Android developers.
In many cases the code and logic come directly from the ashmem patch.
The intent of this patch is to allow for ashmem-like behavior, but
to embed the idea a little deeper into the VM code rather than
isolating it in a specific driver.
Also many thanks to Dave Hansen who helped design and develop the
initial version of this patch, and has provided continued review and
mentoring in the VM code.
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
fs/inode.c | 1 +
include/linux/fadvise.h | 6 ++
include/linux/fs.h | 53 +++++++++++++
mm/fadvise.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++-
mm/shmem.c | 18 ++++
5 files changed, 274 insertions(+), 1 deletions(-)
@@ -278,6 +278,7 @@ void address_space_init_once(struct address_space *mapping)
spin_lock_init(&mapping->private_lock);
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+ INIT_LIST_HEAD(&mapping->unpinned_list);
}
EXPORT_SYMBOL(address_space_init_once);
@@ -18,4 +18,10 @@
#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
#endif
+#define POSIX_FADV_VOLATILE 8 /* _can_ toss, but don't toss now */
+#define POSIX_FADV_NONVOLATILE 9 /* Remove VOLATILE flag */
+#define POSIX_FADV_ISVOLATILE 10 /* Returns volatile flag for region */
+
+
+
#endif /* FADVISE_H_INCLUDED */
@@ -632,6 +632,58 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+
+
+/* unpinned_mem_range & range helpers from Robert Love's Ashmem patch */
+struct unpinned_mem_range {
+ /*
+ * List is sorted, and no two ranges
+ * on the same list should overlap.
+ */
+ struct list_head unpinned;
+ pgoff_t start_page;
+ pgoff_t end_page;
+ unsigned int purged;
+};
+
+static inline bool page_range_subsumes_range(struct unpinned_mem_range *range,
+ pgoff_t start_index, pgoff_t end_index)
+{
+
+ return (range->start_page >= start_index)
+ && (range->end_page <= end_index);
+}
+
+static inline bool page_range_subsumed_by_range(
+ struct unpinned_mem_range *range,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ return (range->start_page <= start_index)
+ && (range->end_page >= end_index);
+}
+
+static inline bool page_in_range(struct unpinned_mem_range *range,
+ pgoff_t page_index)
+{
+ return (range->start_page <= page_index) &&
+ (range->end_page >= page_index);
+}
+
+static inline bool page_range_in_range(struct unpinned_mem_range *range,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ return page_in_range(range, start_index) ||
+ page_in_range(range, end_index) ||
+ page_range_subsumes_range(range, start_index, end_index);
+}
+
+static inline bool range_before_page(struct unpinned_mem_range *range,
+ pgoff_t page_index)
+{
+ return range->end_page < page_index;
+}
+
+
struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
@@ -650,6 +702,7 @@ struct address_space {
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+ struct list_head unpinned_list; /* unpinned range list */
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
@@ -11,6 +11,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
@@ -20,6 +21,182 @@
#include <asm/unistd.h>
+
+/*
+ * Allocates an unpinned_mem_range and adds it to the address_space's
+ * unpinned list
+ */
+static int unpinned_range_alloc(struct unpinned_mem_range *prev_range,
+ unsigned int purged,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ struct unpinned_mem_range *range;
+
+ range = kzalloc(sizeof(struct unpinned_mem_range), GFP_KERNEL);
+ if (!range)
+ return -ENOMEM;
+
+ range->start_page = start_index;
+ range->end_page = end_index;
+ range->purged = purged;
+
+ list_add_tail(&range->unpinned, &prev_range->unpinned);
+
+ return 0;
+}
+
+/*
+ * Deletes an unpinned_mem_range, removing it from the address_space's
+ * unpinned list
+ */
+static void unpinned_range_del(struct unpinned_mem_range *range)
+{
+ list_del(&range->unpinned);
+ kfree(range);
+}
+
+/*
+ * Resizes an unpinned_mem_range
+ */
+static inline void unpinned_range_shrink(struct unpinned_mem_range *range,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ range->start_page = start_index;
+ range->end_page = end_index;
+}
+
+
+/*
+ * Mark a region as volatile, allowing dirty pages to be purged
+ * under memory pressure
+ */
+static long fadvise_volatile(struct address_space *mapping,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ struct unpinned_mem_range *range, *next;
+ unsigned int purged = 0;
+ int ret;
+
+printk("fadvise_volatile: start: 0x%lx end: 0x%lx\n", start_index, end_index);
+
+restart:
+ /* Iterate through the sorted range list */
+ list_for_each_entry_safe(range, next, &mapping->unpinned_list,
+ unpinned) {
+ /*
+ * If the current existing range is before the start
+ * of the new range, then we're done, since the list is
+ * sorted
+ */
+ if (range_before_page(range, start_index))
+ break;
+ /*
+ * If the new range is already covered by the existing
+ * range, then there is nothing we need to do.
+ */
+ if (page_range_subsumed_by_range(range, start_index, end_index))
+ return 0;
+ /*
+ * Coalesce if the new range overlaps the existing range,
+ * by growing the new range to cover the existing range,
+ * deleting the existing range, and start over.
+ * Starting over is necessary to make sure we also coalesce
+ * any other ranges we overlap with.
+ */
+ if (page_range_in_range(range, start_index, end_index)) {
+ start_index = min_t(size_t, range->start_page,
+ start_index);
+ end_index = max_t(size_t, range->end_page, end_index);
+ purged |= range->purged;
+ unpinned_range_del(range);
+ goto restart;
+ }
+
+ }
+ /* Allocate the new range and add it to the list */
+ ret = unpinned_range_alloc(range, purged, start_index, end_index);
+ return ret;
+}
+
+/*
+ * Marks a region as nonvolatile. Returns 1 if any pages in the
+ * region were purged while it was marked volatile.
+ */
+static long fadvise_nonvolatile(struct address_space *mapping,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ struct unpinned_mem_range *range, *next;
+ int ret = 0;
+
+
+ list_for_each_entry_safe(range, next, &mapping->unpinned_list,
+ unpinned) {
+ if (range_before_page(range, start_index))
+ break;
+
+ if (page_range_in_range(range, start_index, end_index)) {
+ ret |= range->purged;
+ /* Case #1: Easy. Just nuke the whole thing. */
+ if (page_range_subsumes_range(range, start_index,
+ end_index)) {
+ unpinned_range_del(range);
+ continue;
+ }
+
+ /* Case #2: We overlap from the start, so adjust it */
+ if (range->start_page >= start_index) {
+ unpinned_range_shrink(range, end_index + 1,
+ range->end_page);
+ continue;
+ }
+
+ /* Case #3: We overlap from the rear, so adjust it */
+ if (range->end_page <= end_index) {
+ unpinned_range_shrink(range, range->start_page,
+ start_index - 1);
+ continue;
+ }
+
+ /*
+ * Case #4: We eat a chunk out of the middle. This is a bit
+ * more complicated: we allocate a new range for the
+ * second half and adjust the first chunk's endpoint.
+ */
+ unpinned_range_alloc(range, range->purged,
+ end_index + 1, range->end_page);
+ unpinned_range_shrink(range, range->start_page,
+ start_index - 1);
+ }
+ }
+ return ret;
+
+
+}
+
+/*
+ * Returns 1 if any part of the region has been marked volatile,
+ * 0 otherwise. Does not report whether the region has been purged.
+ */
+static long fadvise_isvolatile(struct address_space *mapping,
+ pgoff_t start_index, pgoff_t end_index)
+{
+ struct unpinned_mem_range *range;
+ long ret = 0;
+
+
+ list_for_each_entry(range, &mapping->unpinned_list, unpinned) {
+ if (range_before_page(range, start_index))
+ break;
+ if (page_range_in_range(range, start_index, end_index)) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+
+
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
@@ -106,7 +283,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
-
+
ret = force_page_cache_readahead(mapping, file,
start_index,
nrpages);
@@ -127,6 +304,24 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
invalidate_mapping_pages(mapping, start_index,
end_index);
break;
+ case POSIX_FADV_VOLATILE:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+ ret = fadvise_volatile(mapping, start_index, end_index);
+ break;
+ case POSIX_FADV_NONVOLATILE:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+ ret = fadvise_nonvolatile(mapping, start_index, end_index);
+ break;
+ case POSIX_FADV_ISVOLATILE:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+ ret = fadvise_isvolatile(mapping, start_index, end_index);
+ break;
default:
ret = -EINVAL;
}
@@ -679,6 +679,24 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
index = page->index;
inode = mapping->host;
info = SHMEM_I(inode);
+
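+ /*
+ * If this page falls within a volatile (unpinned) range, skip the
+ * swap-out: mark the range as purged and let the page be reclaimed
+ * without writing it anywhere.
+ */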
+ if (!list_empty(&mapping->unpinned_list)) {
+ struct unpinned_mem_range *range, *next;
+ printk("shmem_writepage:\n");
+ list_for_each_entry_safe(range, next, &mapping->unpinned_list,
+ unpinned) {
+ printk(" range: 0x%lx - 0x%lx vs 0x%lx\n",
+ range->start_page, range->end_page,
+ (long)index );
+ if (page_in_range(range, index)) {
+ printk(" Purged page!\n");
+ range->purged = 1;
+ unlock_page(page);
+ return 0;
+ }
+ }
+ }
+
if (info->flags & VM_LOCKED)
goto redirty;
if (!total_swap_pages)