Martin Schwidefsky
2007-May-11 07:00 UTC
[patch 4/6] Guest page hinting: writable page table entries.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> The volatile state for page cache and swap cache pages requires that the host system be able to determine if a volatile page is dirty before removing it. This excludes almost all platforms from using the scheme. What is needed is a way to distinguish between pages that are purely read-only and pages that might get written to. This allows platforms with per-pte dirty bits to use the scheme and gives platforms with per-page dirty bits a small optimization. Whenever a writable pte is created, a check is added that allows the page to be moved into the correct state. This needs to be done before the writable pte is established. To avoid unnecessary state transitions and the need for a counter, a new page flag PG_writable is added. Only the creation of the first writable pte will do a page state change. Even if all the writable ptes pointing to a page are removed again, the page stays in the safe state until all read-only users of the page have unmapped it as well. Only then is the PG_writable bit reset. The state a page needs to have if a writable pte is present depends on the platform. A platform with per-pte dirty bits wants to move the page into stable state; a platform with per-page dirty bits like s390 can decide to move the page into a special state that requires the host system to check the dirty bit before discarding a page.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- fs/exec.c | 7 +++-- include/linux/page-flags.h | 6 ++++ include/linux/page-states.h | 27 +++++++++++++++++++- mm/fremap.c | 1 mm/memory.c | 5 +++ mm/mprotect.c | 2 + mm/page-states.c | 58 ++++++++++++++++++++++++++++++++++++++++++-- mm/page_alloc.c | 3 +- mm/rmap.c | 1 9 files changed, 104 insertions(+), 6 deletions(-) diff -urpN linux-2.6/fs/exec.c linux-2.6-patched/fs/exec.c --- linux-2.6/fs/exec.c 2007-05-09 09:32:21.000000000 +0200 +++ linux-2.6-patched/fs/exec.c 2007-05-11 15:52:17.000000000 +0200 @@ -50,6 +50,7 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> +#include <linux/page-states.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -309,6 +310,7 @@ void install_arg_page(struct vm_area_str { struct mm_struct *mm = vma->vm_mm; pte_t * pte; + pte_t pte_val; spinlock_t *ptl; if (unlikely(anon_vma_prepare(vma))) @@ -324,8 +326,9 @@ void install_arg_page(struct vm_area_str } inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); - set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( - page, vma->vm_page_prot)))); + pte_val = pte_mkdirty(pte_mkwrite(mk_pte(page, vma->vm_page_prot))); + page_check_writable(page, pte_val, 2); + set_pte_at(mm, address, pte, pte_val); page_add_new_anon_rmap(page, vma, address); pte_unmap_unlock(pte, ptl); diff -urpN linux-2.6/include/linux/page-flags.h linux-2.6-patched/include/linux/page-flags.h --- linux-2.6/include/linux/page-flags.h 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/include/linux/page-flags.h 2007-05-11 15:52:17.000000000 +0200 @@ -105,6 +105,7 @@ #endif #define PG_discarded 20 /* Page discarded by the hypervisor. */ +#define PG_writable 21 /* Page is mapped writable. 
*/ /* * Manipulation of page state flags @@ -283,6 +284,11 @@ static inline void __ClearPageTail(struc #define TestSetPageDiscarded(page) 0 #endif +#define PageWritable(page) test_bit(PG_writable, &(page)->flags) +#define TestSetPageWritable(page) \ + test_and_set_bit(PG_writable, &(page)->flags) +#define ClearPageWritable(page) clear_bit(PG_writable, &(page)->flags) + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); diff -urpN linux-2.6/include/linux/page-states.h linux-2.6-patched/include/linux/page-states.h --- linux-2.6/include/linux/page-states.h 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/include/linux/page-states.h 2007-05-11 15:52:17.000000000 +0200 @@ -55,6 +55,9 @@ extern void page_discard(struct page *pa extern int __page_make_stable(struct page *page); extern void __page_make_volatile(struct page *page, int offset); extern void __pagevec_make_volatile(struct pagevec *pvec); +extern void __page_check_writable(struct page *page, pte_t pte, + unsigned int offset); +extern void __page_reset_writable(struct page *page); /* * Extended guest page hinting functions defined by using the @@ -76,6 +79,12 @@ extern void __pagevec_make_volatile(stru * from the LRU list and the radix tree of its mapping. * page_discard uses page_unmap_all to remove all page table * entries for a page. + * - page_check_writable: + * Checks if the page states needs to be adapted because a new + * writable page table entry refering to the page is established. + * - page_reset_writable: + * Resets the page state after the last writable page table entry + * refering to the page has been removed. 
*/ static inline int page_make_stable(struct page *page) @@ -95,12 +104,26 @@ static inline void pagevec_make_volatile __pagevec_make_volatile(pvec); } +static inline void page_check_writable(struct page *page, pte_t pte, + unsigned int offset) +{ + if (page_host_discards() && pte_write(pte) && + !test_bit(PG_writable, &page->flags)) + __page_check_writable(page, pte, offset); +} + +static inline void page_reset_writable(struct page *page) +{ + if (page_host_discards() && test_bit(PG_writable, &page->flags)) + __page_reset_writable(page); +} + #else #define page_host_discards() (0) #define page_set_unused(_page,_order) do { } while (0) #define page_set_stable(_page,_order) do { } while (0) -#define page_set_volatile(_page) do { } while (0) +#define page_set_volatile(_page,_writable) do { } while (0) #define page_set_stable_if_present(_page) (1) #define page_discarded(_page) (0) @@ -114,6 +137,8 @@ static inline void pagevec_make_volatile #define page_make_volatile(_page, offset) do { } while (0) #define pagevec_make_volatile(_pagevec) do { } while (0) #define page_discard(_page) do { } while (0) +#define page_check_writable(_page,_pte,_off) do { } while (0) +#define page_reset_writable(_page) do { } while (0) #endif diff -urpN linux-2.6/mm/fremap.c linux-2.6-patched/mm/fremap.c --- linux-2.6/mm/fremap.c 2007-04-02 17:11:19.000000000 +0200 +++ linux-2.6-patched/mm/fremap.c 2007-05-11 15:52:17.000000000 +0200 @@ -80,6 +80,7 @@ int install_page(struct mm_struct *mm, s flush_icache_page(vma, page); pte_val = mk_pte(page, prot); + page_check_writable(page, pte_val, 2); set_pte_at(mm, addr, pte, pte_val); page_add_file_rmap(page); update_mmu_cache(vma, addr, pte_val); diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c --- linux-2.6/mm/memory.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/memory.c 2007-05-11 15:52:17.000000000 +0200 @@ -1744,6 +1744,7 @@ static int do_wp_page(struct mm_struct * flush_cache_page(vma, address, pte_pfn(orig_pte)); 
entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_check_writable(old_page, entry, 1); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); @@ -1793,6 +1794,7 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_check_writable(new_page, entry, 2); lazy_mmu_prot_update(entry); /* * Clear the pte entry and flush it first, before updating the @@ -2245,6 +2247,7 @@ static int do_swap_page(struct mm_struct } flush_icache_page(vma, page); + page_check_writable(page, pte, 2); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); @@ -2299,6 +2302,7 @@ static int do_anonymous_page(struct mm_s entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_check_writable(page, entry, 2); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) @@ -2452,6 +2456,7 @@ retry: entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_check_writable(new_page, entry, 2); set_pte_at(mm, address, page_table, entry); if (anon) { inc_mm_counter(mm, anon_rss); diff -urpN linux-2.6/mm/mprotect.c linux-2.6-patched/mm/mprotect.c --- linux-2.6/mm/mprotect.c 2007-04-02 17:11:20.000000000 +0200 +++ linux-2.6-patched/mm/mprotect.c 2007-05-11 15:52:17.000000000 +0200 @@ -21,6 +21,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/page-states.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -52,6 +53,7 @@ static void change_pte_range(struct mm_s */ if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); + page_check_writable(pte_page(ptent), ptent, 1); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); #ifdef CONFIG_MIGRATION 
diff -urpN linux-2.6/mm/page_alloc.c linux-2.6-patched/mm/page_alloc.c --- linux-2.6/mm/page_alloc.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/page_alloc.c 2007-05-11 15:52:17.000000000 +0200 @@ -608,7 +608,8 @@ static int prep_new_page(struct page *pa page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk | + 1 << PG_writable); set_page_private(page, 0); set_page_refcounted(page); diff -urpN linux-2.6/mm/page-states.c linux-2.6-patched/mm/page-states.c --- linux-2.6/mm/page-states.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/page-states.c 2007-05-11 15:52:17.000000000 +0200 @@ -74,7 +74,7 @@ void __page_make_volatile(struct page *p preempt_disable(); if (!page_test_set_state_change(page)) { if (check_bits(page) && check_counts(page, offset)) - page_set_volatile(page); + page_set_volatile(page, PageWritable(page)); page_clear_state_change(page); } preempt_enable(); @@ -100,7 +100,7 @@ void __pagevec_make_volatile(struct page page = pvec->pages[i]; if (!page_test_set_state_change(page)) { if (check_bits(page) && check_counts(page, 1)) - page_set_volatile(page); + page_set_volatile(page, PageWritable(page)); page_clear_state_change(page); } } @@ -133,6 +133,60 @@ int __page_make_stable(struct page *page EXPORT_SYMBOL(__page_make_stable); /** + * __page_check_writable() - check page state for new writable pte + * + * @page: the page the new writable pte refers to + * @pte: the new writable pte + */ +void __page_check_writable(struct page *page, pte_t pte, unsigned int offset) +{ + int count_ok = 0; + + preempt_disable(); + while (page_test_set_state_change(page)) + cpu_relax(); + + if (!TestSetPageWritable(page)) { + count_ok = check_counts(page, offset); + if (check_bits(page) && count_ok) + page_set_volatile(page, 1); + else + /* + * If two processes create a write mapping at the + * same time 
check_counts will return false or if + * the page is currently isolated from the LRU + * check_bits will return false but the page might + * be in volatile state. + * We have to take care about the dirty bit so the + * only option left is to make the page stable but + * we can try to make it volatile a bit later. + */ + page_set_stable_if_present(page); + } + page_clear_state_change(page); + if (!count_ok) + page_make_volatile(page, 1); + preempt_enable(); +} +EXPORT_SYMBOL(__page_check_writable); + +/** + * __page_reset_writable() - clear the PageWritable bit + * + * @page: the page + */ +void __page_reset_writable(struct page *page) +{ + preempt_disable(); + if (!page_test_set_state_change(page)) { + ClearPageWritable(page); + page_clear_state_change(page); + } + preempt_enable(); +} +EXPORT_SYMBOL(__page_reset_writable); + +/** * __page_discard() - remove a discarded page from the cache * * @page: the page diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 @@ -627,6 +627,7 @@ void page_remove_rmap(struct page *page, } __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + page_reset_writable(page); } } -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
Martin Schwidefsky
2007-May-11 07:00 UTC
[patch 5/6] Guest page hinting: minor fault optimization.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> One of the challenges of the guest page hinting scheme is the cost for the state transitions. If the cost gets too high the whole concept of page state information is in question. Therefore it is important to avoid the state transitions when possible. One place where the state transitions can be avoided is minor faults. Why change the page state to stable in find_get_page and back in page_add_anon_rmap/page_add_file_rmap if the discarded pages can be handled by the discard fault handler? If the page is in page/swap cache just map it even if it is already discarded. The first access to the page will cause a discard fault which needs to be able to deal with this kind of situation anyway because of other races in the memory management. The special find_get_page_nodiscard variant introduced for volatile swap cache is used which does not change the page state. The calls to find_get_page in filemap_nopage and lookup_swap_cache are replaced with find_get_page_nodiscard. By the use of this function a new race is created. If a minor fault races with the discard of a page the page may not get mapped to the page table because the discard handler removed the page from the cache which removes the page->mapping that is needed to find the page table entry. A check for the discarded bit is added to do_swap_page and do_no_page. The page table lock for the pte takes care of the synchronization. That removes the state transitions on the minor fault path. A page that has been mapped will eventually be unmapped again. On the unmap path each page that has been removed from the page table is freed with a call to page_cache_release. In general that causes an unnecessary page state transition from volatile to volatile.
To get rid of these state transitions as well a special variants of page_cache_release is added that does not attempt to make the page volatile. page_cache_release_nocheck is then used in free_page_and_swap_cache and release_pages. This makes the unmap of ptes state transitions free. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- include/linux/pagemap.h | 1 + include/linux/swap.h | 2 +- mm/filemap.c | 4 ++-- mm/fremap.c | 2 ++ mm/memory.c | 4 ++-- mm/rmap.c | 4 +--- mm/shmem.c | 7 +++++++ mm/swap_state.c | 4 ++-- 8 files changed, 18 insertions(+), 10 deletions(-) diff -urpN linux-2.6/include/linux/pagemap.h linux-2.6-patched/include/linux/pagemap.h --- linux-2.6/include/linux/pagemap.h 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/include/linux/pagemap.h 2007-05-11 15:52:17.000000000 +0200 @@ -68,6 +68,7 @@ extern struct page * find_get_page_nodis #define find_get_page_nodiscard(mapping, index) find_get_page(mapping, index) #define page_cache_release(page) put_page(page) #endif +#define page_cache_release_nocheck(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); #ifdef CONFIG_NUMA diff -urpN linux-2.6/include/linux/swap.h linux-2.6-patched/include/linux/swap.h --- linux-2.6/include/linux/swap.h 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/include/linux/swap.h 2007-05-11 15:52:17.000000000 +0200 @@ -290,7 +290,7 @@ static inline void disable_swap_token(vo /* only sparc can not include linux/pagemap.h in this file * so leave page_cache_release and release_pages undeclared... 
*/ #define free_page_and_swap_cache(page) \ - page_cache_release(page) + page_cache_release_nocheck(page) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr), 0); diff -urpN linux-2.6/mm/filemap.c linux-2.6-patched/mm/filemap.c --- linux-2.6/mm/filemap.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/filemap.c 2007-05-11 15:52:17.000000000 +0200 @@ -1466,7 +1466,7 @@ retry_all: * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page_nodiscard(mapping, pgoff); if (!page) { unsigned long ra_pages; @@ -1500,7 +1500,7 @@ retry_find: start = pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_get_page(mapping, pgoff); + page = find_get_page_nodiscard(mapping, pgoff); if (!page) goto no_cached_page; } diff -urpN linux-2.6/mm/fremap.c linux-2.6-patched/mm/fremap.c --- linux-2.6/mm/fremap.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/fremap.c 2007-05-11 15:52:17.000000000 +0200 @@ -15,6 +15,7 @@ #include <linux/rmap.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/page-states.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> @@ -83,6 +84,7 @@ int install_page(struct mm_struct *mm, s page_check_writable(page, pte_val, 2); set_pte_at(mm, addr, pte, pte_val); page_add_file_rmap(page); + page_make_volatile(page, 1); update_mmu_cache(vma, addr, pte_val); lazy_mmu_prot_update(pte_val); err = 0; diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c --- linux-2.6/mm/memory.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/memory.c 2007-05-11 15:52:17.000000000 +0200 @@ -2229,7 +2229,7 @@ static int do_swap_page(struct mm_struct * Back out if somebody else already faulted in this pte. 
*/ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*page_table, orig_pte))) + if (unlikely(!pte_same(*page_table, orig_pte) || PageDiscarded(page))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2451,7 +2451,7 @@ retry: * handle that later. */ /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { + if (pte_none(*page_table) && likely(!PageDiscarded(new_page))) { flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 @@ -557,7 +557,6 @@ void page_add_anon_rmap(struct page *pag if (atomic_inc_and_test(&page->_mapcount)) __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ - page_make_volatile(page, 1); } /* @@ -586,7 +585,6 @@ void page_add_file_rmap(struct page *pag { if (atomic_inc_and_test(&page->_mapcount)) __inc_zone_page_state(page, NR_FILE_MAPPED); - page_make_volatile(page, 1); } /** @@ -727,7 +725,7 @@ static int try_to_unmap_one(struct page } page_remove_rmap(page, vma); - page_cache_release(page); + page_cache_release_nocheck(page); out_unmap: pte_unmap_unlock(pte, ptl); diff -urpN linux-2.6/mm/shmem.c linux-2.6-patched/mm/shmem.c --- linux-2.6/mm/shmem.c 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/mm/shmem.c 2007-05-11 15:52:17.000000000 +0200 @@ -49,6 +49,7 @@ #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/backing-dev.h> +#include <linux/page-states.h> #include <asm/uaccess.h> #include <asm/div64.h> @@ -1124,6 +1125,12 @@ repeat: if (swap.val) { /* Look it up and read it in.. 
*/ swappage = lookup_swap_cache(swap); + if (swappage && unlikely(!page_make_stable(swappage))) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + page_discard(swappage); + goto repeat; + } if (!swappage) { shmem_swp_unmap(entry); /* here we actually do the io */ diff -urpN linux-2.6/mm/swap_state.c linux-2.6-patched/mm/swap_state.c --- linux-2.6/mm/swap_state.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/swap_state.c 2007-05-11 15:52:17.000000000 +0200 @@ -288,7 +288,7 @@ static inline void free_swap_cache(struc void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - page_cache_release(page); + page_cache_release_nocheck(page); } /* @@ -322,7 +322,7 @@ struct page * lookup_swap_cache(swp_entr { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page_nodiscard(&swapper_space, entry.val); if (page) INC_CACHE_INFO(find_success); -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> s390 uses the milli-coded ESSA instruction to set the page state. The page state is formed by four guest page states called block usage states and three host page states called block content states. The guest states are: - stable (S): there is essential content in the page - unused (U): there is no useful content and any access to the page will cause an addressing exception - volatile (V): there is useful content in the page. The host system is allowed to discard the content anytime, but has to deliver a discard fault with the absolute address of the page if the guest tries to access it. - potential volatile (P): the page has useful content. The host system is allowed to discard the content after it has checked the dirty bit of the page. It has to deliver a discard fault with the absolute address of the page if the guest tries to access it. The host states are: - resident: the page is present in real memory. - preserved: the page is not present in real memory but the content is preserved elsewhere by the machine, e.g. on the paging device. - zero: the page is not present in real memory. The content of the page is logically-zero. There are 12 combinations of guest and host state, currently only 8 are valid page states: Sr: a stable, resident page. Sp: a stable, preserved page. Sz: a stable, logically zero page. A page filled with zeroes will be allocated on first access. Ur: an unused but resident page. The host could make it Uz anytime but it doesn't have to. Uz: an unused, logically zero page. Vr: a volatile, resident page. The guest can access it normally. Vz: a volatile, logically zero page. This is a discarded page. The host will deliver a discard fault for any access to the page. Pr: a potential volatile, resident page. The guest can access it normally. The remaining 4 combinations can't occur: Up: an unused, preserved page. 
If the host tries to get rid of a Ur page it will remove it without writing the page content to disk and set the page to Uz. Vp: a volatile, preserved page. If the host picks a Vr page for eviction it will discard it and set the page state to Vz. Pp: a potential volatile, preserved page. There are two cases for page out: 1) if the page is dirty then the host will preserve the page and set it to Sp or 2) if the page is clean then the host will discard it and set the page state to Vz. Pz: a potential volatile, logically zero page. The host system will always use Vz instead of Pz. The state transitions (a diagram would be nicer but that is too hard to do in ascii art...): {Ur,Sr,Vr,Pr}: a resident page will change its block usage state if the guest requests it with page_set_{unused,stable,volatile}. {Uz,Sz,Vz}: a logically zero page will change its block usage state if the guest requests it with page_set_{unused,stable,volatile}. The guest can't create the Pz state; the state will be Vz instead. Ur -> Uz: the host system can remove an unused, resident page from memory. Sz -> Sr: on first access a stable, logically zero page will become resident. Sr -> Sp: the host system can swap a stable page to disk. Sp -> Sr: a guest access to an Sp page forces the host to retrieve it. Vr -> Vz: the host can discard a volatile page. Sp -> Uz: a page preserved by the host will be removed if the guest sets the block usage state to unused. Sp -> Vz: a page preserved by the host will be discarded if the guest sets the block usage state to volatile. Pr -> Sp: the host can move a page from Pr to Sp if it discovers that the page is dirty while trying to discard the page. The page content is written to the paging device. Pr -> Vz: the host can discard a Pr page. The Pz state is replaced by the Vz state.
There are some hazards the code has to deal with: 1) For potential volatile pages the transfer of the hardware dirty bit to the software dirty bit needs to make sure that the page gets into the stable state before the hardware dirty bit is cleared. Between the page_test_dirty and the page_clear_dirty call a page_make_stable is required. 2) Since the access of unused pages causes addressing exceptions we need to take care with /dev/mem. The copy_{from,to}_user functions need to be able to cope with addressing exceptions for the kernel address space. 3) The discard fault on an s390 machine delivers the absolute address of the page that caused the fault instead of the virtual address. With the virtual address we could have used the page table entry of the current process to safely get a reference to the discarded page. We can get to the struct page from the absolute page address but it is rather hard to get to a proper page reference. The page that caused the fault could already have been freed and reused for a different purpose. None of the fields in the struct page would be reliable to use. The freeing of discarded pages therefore has to be postponed until all pending discard faults for this page have been dealt with. The discard fault handler is called with interrupts disabled and tries to get a page reference with get_page_unless_zero. A discarded page is only freed after all cpus have been enabled for interrupts at least once since the detection of the discarded page. This is done using the timer interrupts and the cpu-idle notifier.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- arch/s390/Kconfig | 3 arch/s390/kernel/head64.S | 10 ++ arch/s390/kernel/time.c | 11 ++ arch/s390/kernel/traps.c | 4 arch/s390/lib/uaccess_mvcos.c | 10 +- arch/s390/lib/uaccess_std.c | 7 - arch/s390/mm/fault.c | 197 +++++++++++++++++++++++++++++++++++++++++ include/asm-s390/page-states.h | 117 ++++++++++++++++++++++++ include/asm-s390/setup.h | 1 mm/rmap.c | 9 + 10 files changed, 362 insertions(+), 7 deletions(-) diff -urpN linux-2.6/arch/s390/Kconfig linux-2.6-patched/arch/s390/Kconfig --- linux-2.6/arch/s390/Kconfig 2007-05-11 09:18:14.000000000 +0200 +++ linux-2.6-patched/arch/s390/Kconfig 2007-05-11 15:52:17.000000000 +0200 @@ -402,6 +402,9 @@ config CMM_IUCV Select this option to enable the special message interface to the cooperative memory management. +config PAGE_STATES + bool "Enable support for guest page hinting." + config VIRT_TIMER bool "Virtual CPU timer support" help diff -urpN linux-2.6/arch/s390/kernel/head64.S linux-2.6-patched/arch/s390/kernel/head64.S --- linux-2.6/arch/s390/kernel/head64.S 2007-04-28 08:51:45.000000000 +0200 +++ linux-2.6-patched/arch/s390/kernel/head64.S 2007-05-11 15:52:17.000000000 +0200 @@ -187,6 +187,16 @@ startup_continue: oi 6(%r12),2 # set MVCOS flag 1: +# +# find out if we have the ESSA instruction +# + la %r1,0f-.LPG1(%r13) # set program check address + stg %r1,__LC_PGM_NEW_PSW+8 + lghi %r1,0 + .long 0xb9ab0001 # essa get state + oi 6(%r12),0x04 # set ESSA flag +0: + lpswe .Lentry-.LPG1(13) # jump to _stext in primary-space, # virtual and never return ... 
.align 16 diff -urpN linux-2.6/arch/s390/kernel/time.c linux-2.6-patched/arch/s390/kernel/time.c --- linux-2.6/arch/s390/kernel/time.c 2007-05-09 09:32:19.000000000 +0200 +++ linux-2.6-patched/arch/s390/kernel/time.c 2007-05-11 15:52:17.000000000 +0200 @@ -30,6 +30,7 @@ #include <linux/timex.h> #include <linux/notifier.h> #include <linux/clocksource.h> +#include <linux/page-states.h> #include <asm/uaccess.h> #include <asm/delay.h> @@ -228,6 +229,9 @@ static int nohz_idle_notify(struct notif switch (action) { case CPU_IDLE: stop_hz_timer(); +#ifdef CONFIG_PAGE_STATES + page_shrink_discard_list(); +#endif break; case CPU_NOT_IDLE: start_hz_timer(); @@ -276,6 +280,9 @@ void init_cpu_timer(void) static void clock_comparator_interrupt(__u16 code) { +#ifdef CONFIG_PAGE_STATES + page_shrink_discard_list(); +#endif /* set clock comparator for next tick */ set_clock_comparator(S390_lowcore.jiffy_timer + CPU_DEVIATION); } @@ -355,6 +362,10 @@ void __init time_init(void) #ifdef CONFIG_VIRT_TIMER vtime_init(); #endif + +#ifdef CONFIG_PAGE_STATES + page_discard_init(); +#endif } /* diff -urpN linux-2.6/arch/s390/kernel/traps.c linux-2.6-patched/arch/s390/kernel/traps.c --- linux-2.6/arch/s390/kernel/traps.c 2007-05-09 09:32:19.000000000 +0200 +++ linux-2.6-patched/arch/s390/kernel/traps.c 2007-05-11 15:52:17.000000000 +0200 @@ -58,6 +58,7 @@ int sysctl_userprocess_debug = 0; extern pgm_check_handler_t do_protection_exception; extern pgm_check_handler_t do_dat_exception; extern pgm_check_handler_t do_monitor_call; +extern pgm_check_handler_t do_discard_fault; #define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; }) @@ -717,5 +718,8 @@ void __init trap_init(void) pgm_check_table[0x1C] = &space_switch_exception; pgm_check_table[0x1D] = &hfp_sqrt_exception; pgm_check_table[0x40] = &do_monitor_call; +#ifdef CONFIG_PAGE_STATES + pgm_check_table[0x1a] = &do_discard_fault; +#endif pfault_irq_init(); } diff -urpN linux-2.6/arch/s390/lib/uaccess_mvcos.c 
linux-2.6-patched/arch/s390/lib/uaccess_mvcos.c --- linux-2.6/arch/s390/lib/uaccess_mvcos.c 2007-04-02 17:11:00.000000000 +0200 +++ linux-2.6-patched/arch/s390/lib/uaccess_mvcos.c 2007-05-11 15:52:17.000000000 +0200 @@ -36,7 +36,7 @@ static size_t copy_from_user_mvcos(size_ tmp1 = -4096UL; asm volatile( "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" - " jz 7f\n" + "10:jz 7f\n" "1:"ALR" %0,%3\n" " "SLR" %1,%3\n" " "SLR" %2,%3\n" @@ -47,7 +47,7 @@ static size_t copy_from_user_mvcos(size_ " "CLR" %0,%4\n" /* copy crosses next page boundary? */ " jnh 4f\n" "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n" - " "SLR" %0,%4\n" + "11:"SLR" %0,%4\n" " "ALR" %2,%4\n" "4:"LHI" %4,-1\n" " "ALR" %4,%0\n" /* copy remaining size, subtract 1 */ @@ -62,6 +62,7 @@ static size_t copy_from_user_mvcos(size_ "7:"SLR" %0,%0\n" "8: \n" EX_TABLE(0b,2b) EX_TABLE(3b,4b) + EX_TABLE(10b,8b) EX_TABLE(11b,8b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) : "d" (reg0) : "cc", "memory"); return size; @@ -82,7 +83,7 @@ static size_t copy_to_user_mvcos(size_t tmp1 = -4096UL; asm volatile( "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 4f\n" + "6: jz 4f\n" "1:"ALR" %0,%3\n" " "SLR" %1,%3\n" " "SLR" %2,%3\n" @@ -93,11 +94,12 @@ static size_t copy_to_user_mvcos(size_t " "CLR" %0,%4\n" /* copy crosses next page boundary? 
*/ " jnh 5f\n" "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n" - " "SLR" %0,%4\n" + "7:"SLR" %0,%4\n" " j 5f\n" "4:"SLR" %0,%0\n" "5: \n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) + EX_TABLE(6b,5b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) : "d" (reg0) : "cc", "memory"); return size; diff -urpN linux-2.6/arch/s390/lib/uaccess_std.c linux-2.6-patched/arch/s390/lib/uaccess_std.c --- linux-2.6/arch/s390/lib/uaccess_std.c 2007-04-02 17:11:00.000000000 +0200 +++ linux-2.6-patched/arch/s390/lib/uaccess_std.c 2007-05-11 15:52:17.000000000 +0200 @@ -36,12 +36,12 @@ size_t copy_from_user_std(size_t size, c tmp1 = -256UL; asm volatile( "0: mvcp 0(%0,%2),0(%1),%3\n" - " jz 8f\n" + "10:jz 8f\n" "1:"ALR" %0,%3\n" " la %1,256(%1)\n" " la %2,256(%2)\n" "2: mvcp 0(%0,%2),0(%1),%3\n" - " jnz 1b\n" + "11:jnz 1b\n" " j 8f\n" "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ " "LHI" %3,-4096\n" @@ -50,7 +50,7 @@ size_t copy_from_user_std(size_t size, c " "CLR" %0,%4\n" /* copy crosses next page boundary? 
*/ " jnh 5f\n" "4: mvcp 0(%4,%2),0(%1),%3\n" - " "SLR" %0,%4\n" + "12:"SLR" %0,%4\n" " "ALR" %2,%4\n" "5:"LHI" %4,-1\n" " "ALR" %4,%0\n" /* copy remaining size, subtract 1 */ @@ -65,6 +65,7 @@ size_t copy_from_user_std(size_t size, c "8:"SLR" %0,%0\n" "9: \n" EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,5b) + EX_TABLE(10b,9b) EX_TABLE(11b,9b) EX_TABLE(12b,9b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) : : "cc", "memory"); return size; diff -urpN linux-2.6/arch/s390/mm/fault.c linux-2.6-patched/arch/s390/mm/fault.c --- linux-2.6/arch/s390/mm/fault.c 2007-05-11 09:18:14.000000000 +0200 +++ linux-2.6-patched/arch/s390/mm/fault.c 2007-05-11 15:52:17.000000000 +0200 @@ -19,6 +19,8 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/cpu.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/smp_lock.h> @@ -28,10 +30,12 @@ #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/uaccess.h> +#include <linux/page-states.h> #include <asm/system.h> #include <asm/pgtable.h> #include <asm/s390_ext.h> +#include <asm/io.h> #ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 @@ -575,4 +579,197 @@ void __init pfault_irq_init(void) unregister_early_external_interrupt(0x2603, pfault_interrupt, &ext_int_pfault); } + +#endif + +#ifdef CONFIG_PAGE_STATES + +int cmma_flag = 0; + +static int __init cmma(char *str) +{ + char *parm; + + parm = strstrip(str); + if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { + cmma_flag = MACHINE_HAS_ESSA != 0; + return 1; + } + if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) { + cmma_flag = 0; + return 1; + } + return 0; +} + +__setup("cmma=", cmma); + +static inline void fixup_user_copy(struct pt_regs *regs, + unsigned long address, unsigned short rx) +{ + const struct exception_table_entry *fixup; + unsigned long kaddr; + + kaddr = (regs->gprs[rx >> 12] + (rx & 0xfff)) & __FAIL_ADDR_MASK; + if (virt_to_phys((void *) 
kaddr) != address) + return; + + fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); + if (fixup) + regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; + else + die("discard fault", regs, SIGSEGV); +} + +/* + * Discarded pages with a page_count() of zero are placed on + * the page_discarded_list until all cpus have been at + * least once in enabled code. That closes the race of page + * free vs. discard faults. + */ +void do_discard_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address; + struct page *page; + + /* + * get the real address that caused the block validity + * exception. + */ + address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; + page = pfn_to_page(address >> PAGE_SHIFT); + + /* + * Check for the special case of a discard fault in + * copy_{from,to}_user. User copy is done using one of + * three special instructions: mvcp, mvcs or mvcos. + */ + if (!(regs->psw.mask & PSW_MASK_PSTATE)) { + switch (*(unsigned char *) regs->psw.addr) { + case 0xda: /* mvcp */ + fixup_user_copy(regs, address, + *(__u16 *)(regs->psw.addr + 2)); + break; + case 0xdb: /* mvcs */ + fixup_user_copy(regs, address, + *(__u16 *)(regs->psw.addr + 4)); + break; + case 0xc8: /* mvcos */ + if (regs->gprs[0] == 0x81) + fixup_user_copy(regs, address, + *(__u16*)(regs->psw.addr + 2)); + else if (regs->gprs[0] == 0x810000) + fixup_user_copy(regs, address, + *(__u16*)(regs->psw.addr + 4)); + break; + default: + break; + } + } + + if (likely(get_page_unless_zero(page))) { + local_irq_enable(); + page_discard(page); + } +} + +static DEFINE_PER_CPU(struct list_head, page_discard_list); +static struct list_head page_gather_list = LIST_HEAD_INIT(page_gather_list); +static struct list_head page_signoff_list = LIST_HEAD_INIT(page_signoff_list); +static cpumask_t page_signoff_cpumask = CPU_MASK_NONE; +static DEFINE_SPINLOCK(page_discard_lock); + +/* + * page_free_discarded + * + * free_hot_cold_page calls this function if it is about to free a + * page 
that has PG_discarded set. Since there might be pending + * discard faults on other cpus on s390 we have to postpone the + * freeing of the page until each cpu has "signed-off" the page. + * + * returns 1 to stop free_hot_cold_page from freeing the page. + */ +int page_free_discarded(struct page *page) +{ + local_irq_disable(); + list_add_tail(&page->lru, &__get_cpu_var(page_discard_list)); + local_irq_enable(); + return 1; +} + +/* + * page_shrink_discard_list + * + * This function is called from the timer tick for an active cpu or + * from the idle notifier. It frees discarded pages in three stages. + * In the first stage it moves the pages from the per-cpu discard + * list to a global list. From the global list the pages are moved + * to the signoff list in a second step. The third step is to free + * the pages after all cpus acknowledged the signoff. That prevents + * that a page is freed when a cpu still has a pending discard + * fault for the page. + */ +void page_shrink_discard_list(void) +{ + struct list_head *cpu_list = &__get_cpu_var(page_discard_list); + struct list_head free_list = LIST_HEAD_INIT(free_list); + struct page *page, *next; + int cpu = smp_processor_id(); + + if (list_empty(cpu_list) && !cpu_isset(cpu, page_signoff_cpumask)) + return; + spin_lock(&page_discard_lock); + if (!list_empty(cpu_list)) + list_splice_init(cpu_list, &page_gather_list); + cpu_clear(cpu, page_signoff_cpumask); + if (cpus_empty(page_signoff_cpumask)) { + list_splice_init(&page_signoff_list, &free_list); + list_splice_init(&page_gather_list, &page_signoff_list); + if (!list_empty(&page_signoff_list)) { + /* Take care of the nohz race.. 
*/ + page_signoff_cpumask = cpu_online_map; + smp_wmb(); + cpus_andnot(page_signoff_cpumask, + page_signoff_cpumask, nohz_cpu_mask); + cpu_clear(cpu, page_signoff_cpumask); + if (cpus_empty(page_signoff_cpumask)) + list_splice_init(&page_signoff_list, + &free_list); + } + } + spin_unlock(&page_discard_lock); + list_for_each_entry_safe(page, next, &free_list, lru) { + ClearPageDiscarded(page); + free_cold_page(page); + } +} + +static int page_discard_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long) hcpu; + + if (action == CPU_DEAD) { + local_irq_disable(); + list_splice_init(&per_cpu(page_discard_list, cpu), + &__get_cpu_var(page_discard_list)); + local_irq_enable(); + } + return NOTIFY_OK; +} + +static struct notifier_block page_discard_cpu_notifier = { + .notifier_call = page_discard_cpu_notify, +}; + +void __init page_discard_init(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(page_discard_list, i)); + if (register_cpu_notifier(&page_discard_cpu_notifier)) + panic("Couldn't register page discard cpu notifier"); +} + #endif diff -urpN linux-2.6/include/asm-s390/page-states.h linux-2.6-patched/include/asm-s390/page-states.h --- linux-2.6/include/asm-s390/page-states.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6-patched/include/asm-s390/page-states.h 2007-05-11 15:52:17.000000000 +0200 @@ -0,0 +1,117 @@ +#ifndef _ASM_S390_PAGE_STATES_H +#define _ASM_S390_PAGE_STATES_H + +#define ESSA_GET_STATE 0 +#define ESSA_SET_STABLE 1 +#define ESSA_SET_UNUSED 2 +#define ESSA_SET_VOLATILE 3 +#define ESSA_SET_PVOLATILE 4 +#define ESSA_SET_STABLE_MAKE_RESIDENT 5 +#define ESSA_SET_STABLE_IF_NOT_DISCARDED 6 + +#define ESSA_USTATE_MASK 0x0c +#define ESSA_USTATE_STABLE 0x00 +#define ESSA_USTATE_UNUSED 0x04 +#define ESSA_USTATE_PVOLATILE 0x08 +#define ESSA_USTATE_VOLATILE 0x0c + +#define ESSA_CSTATE_MASK 0x03 +#define ESSA_CSTATE_RESIDENT 0x00 +#define ESSA_CSTATE_PRESERVED 0x02 +#define 
ESSA_CSTATE_ZERO 0x03 + +extern int cmma_flag; +extern struct page *mem_map; + +/* + * ESSA <rc-reg>,<page-address-reg>,<command-immediate> + */ +#define page_essa(_page,_command) ({ \ + int _rc; \ + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" \ + : "=&d" (_rc) : "a" (((_page)-mem_map)<<PAGE_SHIFT), \ + "i" (_command)); \ + _rc; \ +}) + +static inline int page_host_discards(void) +{ + return cmma_flag; +} + +static inline int page_discarded(struct page *page) +{ + int state; + + if (!cmma_flag) + return 0; + state = page_essa(page, ESSA_GET_STATE); + return (state & ESSA_USTATE_MASK) == ESSA_USTATE_VOLATILE && + (state & ESSA_CSTATE_MASK) == ESSA_CSTATE_ZERO; +} + +static inline void page_set_unused(struct page *page, int order) +{ + int i; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + page_essa(page + i, ESSA_SET_UNUSED); +} + +static inline void page_set_stable(struct page *page, int order) +{ + int i; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + page_essa(page + i, ESSA_SET_STABLE); +} + +static inline void page_set_volatile(struct page *page, int writable) +{ + if (!cmma_flag) + return; + if (writable) + page_essa(page, ESSA_SET_PVOLATILE); + else + page_essa(page, ESSA_SET_VOLATILE); +} + +static inline int page_set_stable_if_present(struct page *page) +{ + int rc; + + if (!cmma_flag || PageReserved(page)) + return 1; + + rc = page_essa(page, ESSA_SET_STABLE_IF_NOT_DISCARDED); + return (rc & ESSA_USTATE_MASK) != ESSA_USTATE_VOLATILE || + (rc & ESSA_CSTATE_MASK) != ESSA_CSTATE_ZERO; +} + +/* + * Page locking is done with the architecture page bit PG_arch_1. 
+ */ +static inline int page_test_set_state_change(struct page *page) +{ + return test_and_set_bit(PG_arch_1, &page->flags); +} + +static inline void page_clear_state_change(struct page *page) +{ + clear_bit(PG_arch_1, &page->flags); +} + +static inline int page_state_change(struct page *page) +{ + return test_bit(PG_arch_1, &page->flags); +} + +int page_free_discarded(struct page *page); +void page_shrink_discard_list(void); +void page_discard_init(void); + +#endif /* _ASM_S390_PAGE_STATES_H */ diff -urpN linux-2.6/include/asm-s390/setup.h linux-2.6-patched/include/asm-s390/setup.h --- linux-2.6/include/asm-s390/setup.h 2007-04-28 08:51:49.000000000 +0200 +++ linux-2.6-patched/include/asm-s390/setup.h 2007-05-11 15:52:17.000000000 +0200 @@ -64,6 +64,7 @@ extern unsigned long machine_flags; #define MACHINE_HAS_MVPG (machine_flags & 16) #define MACHINE_HAS_IDTE (machine_flags & 128) #define MACHINE_HAS_DIAG9C (machine_flags & 256) +#define MACHINE_HAS_ESSA (machine_flags & 1024) #ifndef __s390x__ #define MACHINE_HAS_IEEE (machine_flags & 2) diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:17.000000000 +0200 @@ -620,6 +620,15 @@ void page_remove_rmap(struct page *page, * faster for those pages still in swapcache. */ if (page_test_dirty(page)) { + int stable = page_make_stable(page); + VM_BUG_ON(!stable); + /* + * We decremented the mapcount so we now have an + * extra reference for the page. That prevents + * page_make_volatile from making the page + * volatile again while the dirty bit is in + * transit. + */ page_clear_dirty(page); set_page_dirty(page); } -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
Martin Schwidefsky
2007-May-11 07:01 UTC
[patch 1/6] Guest page hinting: core + volatile page cache.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> The guest page hinting patchset introduces code that passes guest page usage information to the host system that virtualizes the memory of its guests. There are three different page states: * Unused: The page content is of no interest to the guest. The host can forget the page content and replace it with a page containing zeroes. * Stable: The page content is needed by the guest and has to be preserved by the host. * Volatile: The page content is useful to the guest but not essential. The host can discard the page but has to deliver a special kind of fault to the guest if the guest accesses a page discarded by the host. The unused state is used for free pages, it allows the host to avoid the paging of empty pages. The default state for non-free pages is stable. The host can write stable pages to a swap device but has to restore the page if the guest accesses it. The volatile page state is used for clean uptodate page cache pages. The host can choose to discard volatile pages as part of its vmscan operation instead of writing them to the host's paging device. The guest system doesn't notice that a volatile page is gone until it tries to access the page or if it tries to make the page stable again. For a guest access to a discarded page the host generates a discard fault to notify the guest. The guest has to remove the page from the cache and reload the page from its backing device. The volatile state is used for all page cache pages, even for pages which are referenced by writable ptes. The host needs to be able to check the dirty state of the pages. Since the host doesn't know where the page table entries of the guest are located, the volatile state as introduced by this patch is only usable on architectures with per-page dirty bits (s390 only). For per-pte dirty bit architectures some additional code is needed, see patch #4. 
The main question is where to put the state transitions between the volatile and the stable state. The simple solution is to make a page stable whenever a lookup is done or a page reference is derived from a page table entry. Attempts to make pages volatile are added at strategic points. The conditions that prevent a page from being made volatile: 1) The page is reserved. Some sort of special page. 2) The page is marked dirty in the struct page. The page content is more recent than the data on the backing device. The host cannot access the linux internal dirty bit so the page needs to be stable. 3) The page is in writeback. The page content is needed for i/o. 4) The page is locked. Someone has exclusive access to the page. 5) The page is anonymous. Swap cache support needs additional code. See patch #2. 6) The page has no mapping. Without a backing the page cannot be recreated. 7) The page is not uptodate. 8) The page has private information. try_to_release_page can fail, e.g. in case the private information is journaling data. The discard fault handler needs to be able to remove the page. 9) The page is already discarded. 10) The page is not on the LRU list. The page has been isolated, some processing is done. 11) The page map count is not equal to the page reference count - 1. The discard fault handler can remove the page cache reference and all mappers of a page. It cannot remove the page reference for any other user of the page. The transitions to stable are done by find_get_pages() and its variants, in follow_page if the FOLL_GET flag is set, by copy-on-write in do_wp_page, and by the early copy-on-write in do_no_page. For page cache pages this is always done with a call to page_make_stable(). To make enough pages discardable by the host an attempt to do the transition to volatile state is done at several places: 1) When a page gets unlocked (unlock_page). 2) When writeback has finished (test_clear_page_writeback). 
3) When the page reference counter is decreased (__free_pages, page_cache_release alias put_page_check and __pagevec_release_nonlru right before the put_page_testzero call). 4) When the map counter is increased (page_add_file_rmap). 5) When a page is moved from the active list to the inactive list. 6) In filemap_nopage after the wait for readpage has finished. This try is necessary because filemap_nopage held an additional reference to the page so that the page_make_volatile call in unlock_page could not do the state transition. The function for the state transitions to volatile is page_make_volatile(). The major obstacles that need to be addressed: * Concurrent page state changes: To guard against concurrent page state updates some kind of lock is needed. If page_make_volatile() has already done the 11 checks it will issue the state change primitive. If in the meantime one of the conditions has changed the user that requires that page in stable state will have to wait in the page_make_stable() function until the make volatile operation has finished. It is up to the architecture to define how this is done with the three primitives page_test_set_state_change, page_clear_state_change and page_state_change. There are some alternatives how this can be done, e.g. a global lock, or a lock per segment in the kernel page table, or the per page bit PG_arch_1 if it is still free. * Page references acquired from page tables: All page references acquired with find_get_page and friends can be used to access the page frame content. A page reference grabbed from a page table cannot be used to access the page content, the page has to be made stable first. If the make stable operation fails because the page has been discarded it has to be removed from page cache. That removes the page table entry as well. * Page discard vs. __remove_from_page_cache race A new page flag PG_discarded is added. This bit is set for discarded pages. 
It prevents multiple removes of a page from the page cache due to concurrent discard faults and/or normal page removals. It also prevents the re-add of isolated pages to the lru list in vmscan if the page has been discarded while it was not on the lru list. * Page discard vs. pte establish The discard fault handler does three things: 1) set the PG_discarded bit for the page, 2) remove the page from all page tables and 3) remove the page from the page cache. All page references of the discarded page that are still around after step 2 may not be used to establish new mappings because step 3 clears the page->mapping field that is required to find the mappers. Code that establishes new ptes to pages that might be discarded has to check the PG_discarded bit. Step 2 has to check all possible locations for a pte of a particular page and check if the pte exists or another processor might be in the process of establishing one. To do that the page table lock for the pte is used. See page_unmap_all and the modified quick check in page_check_address for the details. * copy_one_pte vs. discarded pages The code that copies the page tables may not copy ptes for discarded pages because this races with the discard fault handler. copy_one_pte cannot back out either since there is no automatic repeat of the fault that caused the pte modification. Ptes to discarded pages only show up in copy_one_pte if a fork races with a discard fault. In this case copy_one_pte has to create a pte in the new page table that looks like the one that the discard fault handler would have created in the original page table if copy_one_pte had not grabbed the page table lock first. * get_user_pages with FOLL_GET If get_user_pages is called with a non-NULL pages argument the caller has to be able to access the page content using the references returned in the pages array. This is done with a check in follow_page for the FOLL_GET bit and a call to page_make_stable. 
If get_user_pages is called with NULL as the pages argument the pages are not made stable. The caller cannot expect that the pages are available after the call because vmscan might have removed them. * buffer heads / page_private A page that is modified with sys_write will get a buffer-head to keep track of the dirty state. The existence of a buffer-head makes PagePrivate(page) return true. Pages with private information cannot be made volatile. Until the buffer-head is removed the page will stay stable. The standard logic is to call try_to_release_page which frees the buffer-head only if more than 10% of GFP_USER memory is used for buffer heads. Without high memory every page can have a buffer-head without running over the limit. The result is that every page written to with sys_write will stay stable until it is removed. To get these pages volatile again max_buffer_heads is set to zero (!) to force a call to try_to_release_page whenever a page is moved from the active to the inactive list. * page_free_discarded hook The architecture might want/need to do special things for discarded pages before they are freed. E.g. s390 has to delay the freeing of discarded pages. To allow this a hook is added to free_hot_cold_page. Another noticeable change is that the first few lines of code in try_to_unmap_one that calculate the address from the page and the vma are moved out of try_to_unmap_one to the callers. This is done to make try_to_unmap_one usable for the removal of discarded pages in page_unmap_all. 
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- fs/buffer.c | 12 ++ include/linux/mm.h | 1 include/linux/page-flags.h | 13 ++ include/linux/page-states.h | 120 ++++++++++++++++++++++++++ include/linux/pagemap.h | 6 + mm/Makefile | 1 mm/filemap.c | 78 ++++++++++++++++- mm/memory.c | 55 ++++++++++++ mm/page-states.c | 197 ++++++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 5 - mm/page_alloc.c | 14 ++- mm/rmap.c | 93 ++++++++++++++++++-- mm/swap.c | 14 +++ mm/vmscan.c | 63 ++++++++++---- 14 files changed, 638 insertions(+), 34 deletions(-) diff -urpN linux-2.6/fs/buffer.c linux-2.6-patched/fs/buffer.c --- linux-2.6/fs/buffer.c 2007-05-10 09:29:56.000000000 +0200 +++ linux-2.6-patched/fs/buffer.c 2007-05-11 15:52:15.000000000 +0200 @@ -2962,11 +2962,23 @@ void __init buffer_init(void) init_buffer_head, NULL); +#ifdef CONFIG_PAGE_STATES + /* + * If volatile page cache is enabled we want to get as many + * pages into volatile state as possible. Pages with private + * information cannot be made volatile. Set max_buffer_heads + * to zero to make shrink_active_list release the private + * information when moving page from the active to the inactive + * list. 
+ */ + max_buffer_heads = 0; +#else /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); +#endif hotcpu_notifier(buffer_cpu_notify, 0); } diff -urpN linux-2.6/include/linux/mm.h linux-2.6-patched/include/linux/mm.h --- linux-2.6/include/linux/mm.h 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/include/linux/mm.h 2007-05-11 15:52:15.000000000 +0200 @@ -302,6 +302,7 @@ static inline void init_page_count(struc } void put_page(struct page *page); +void put_page_check(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff -urpN linux-2.6/include/linux/page-flags.h linux-2.6-patched/include/linux/page-flags.h --- linux-2.6/include/linux/page-flags.h 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/include/linux/page-flags.h 2007-05-11 15:52:15.000000000 +0200 @@ -104,6 +104,8 @@ #define PG_uncached 31 /* Page has been mapped as uncached */ #endif +#define PG_discarded 20 /* Page discarded by the hypervisor. 
*/ + /* * Manipulation of page state flags */ @@ -270,6 +272,17 @@ static inline void __ClearPageTail(struc #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#ifdef CONFIG_PAGE_STATES +#define PageDiscarded(page) test_bit(PG_discarded, &(page)->flags) +#define ClearPageDiscarded(page) clear_bit(PG_discarded, &(page)->flags) +#define TestSetPageDiscarded(page) \ + test_and_set_bit(PG_discarded, &(page)->flags) +#else +#define PageDiscarded(page) 0 +#define ClearPageDiscarded(page) do { } while (0) +#define TestSetPageDiscarded(page) 0 +#endif + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); diff -urpN linux-2.6/include/linux/pagemap.h linux-2.6-patched/include/linux/pagemap.h --- linux-2.6/include/linux/pagemap.h 2007-05-09 09:32:22.000000000 +0200 +++ linux-2.6-patched/include/linux/pagemap.h 2007-05-11 15:52:15.000000000 +0200 @@ -12,6 +12,7 @@ #include <asm/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> +#include <linux/page-states.h> /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page @@ -59,7 +60,11 @@ static inline void mapping_set_gfp_mask( #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(page) get_page(page) +#ifdef CONFIG_PAGE_STATES +#define page_cache_release(page) put_page_check(page) +#else #define page_cache_release(page) put_page(page) +#endif void release_pages(struct page **pages, int nr, int cold); #ifdef CONFIG_NUMA @@ -136,6 +141,7 @@ int add_to_page_cache_lru(struct page *p unsigned long index, gfp_t gfp_mask); extern void remove_from_page_cache(struct page *page); extern void __remove_from_page_cache(struct page *page); +extern void __remove_from_page_cache_nocheck(struct page *page); /* * Return byte-offset into filesystem object for page. 
diff -urpN linux-2.6/include/linux/page-states.h linux-2.6-patched/include/linux/page-states.h --- linux-2.6/include/linux/page-states.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6-patched/include/linux/page-states.h 2007-05-11 15:52:15.000000000 +0200 @@ -0,0 +1,120 @@ +#ifndef _LINUX_PAGE_STATES_H +#define _LINUX_PAGE_STATES_H + +/* + * include/linux/page-states.h + * + * Copyright IBM Corp. 2005, 2007 + * + * Authors: Martin Schwidefsky <schwidefsky@de.ibm.com> + * Hubertus Franke <frankeh@watson.ibm.com> + * Himanshu Raj <rhim@cc.gatech.edu> + */ + +#include <linux/pagevec.h> + +#ifdef CONFIG_PAGE_STATES +/* + * Guest page hinting primitives that need to be defined in the + * architecture header file if PAGE_STATES=y: + * - page_host_discards: + * Indicates whether the host system discards guest pages or not. + * - page_set_unused: + * Indicates to the host that the page content is of no interest + * to the guest. The host can "forget" the page content and replace + * it with a page containing zeroes. + * - page_set_stable: + * Indicate to the host that the page content is needed by the guest. + * - page_set_volatile: + * Make the page discardable by the host. Instead of writing the + * page to the hosts swap device, the host can remove the page. + * A guest that accesses such a discarded page gets a special + * discard fault. + * - page_set_stable_if_present: + * The page state is set to stable if the page has not been discarded + * by the host. The check and the state change have to be done + * atomically. + * - page_discarded: + * Returns true if the page has been discarded by the host. + * - page_test_set_state_change: + * Tries to lock the page for state change. The primitive does not need + * to have page granularity, it can lock a range of pages. + * - page_clear_state_change: + * Unlocks a page for state changes. + * - page_state_change: + * Returns true if the page is locked for state change. + * - page_free_discarded: + * Free a discarded page. 
This might require to put the page on a + * discard list and a synchronization over all cpus. Returns true + * if the architecture backend wants to do special things on free. + */ +#include <asm/page-states.h> + +extern void page_unmap_all(struct page *page); +extern void page_discard(struct page *page); +extern int __page_make_stable(struct page *page); +extern void __page_make_volatile(struct page *page, int offset); +extern void __pagevec_make_volatile(struct pagevec *pvec); + +/* + * Extended guest page hinting functions defined by using the + * architecture primitives: + * - page_make_stable: + * Tries to make a page stable. This operation can fail if the + * host has discarded a page. The function returns 0 if the + * page could not be made stable. + * - page_make_volatile: + * Tries to make a page volatile. There are a number of conditions + * that prevent a page from becoming volatile. If at least one + * is true the function does nothing. See mm/page-states.c for + * details. + * - pagevec_make_volatile: + * Tries to make a vector of pages volatile. For each page in the + * vector the same conditions apply as for page_make_volatile. + * - page_discard: + * Removes a discarded page from the system. The page is removed + * from the LRU list and the radix tree of its mapping. + * page_discard uses page_unmap_all to remove all page table + * entries for a page. + */ + +static inline int page_make_stable(struct page *page) +{ + return page_host_discards() ? 
__page_make_stable(page) : 1; +} + +static inline void page_make_volatile(struct page *page, int offset) +{ + if (page_host_discards()) + __page_make_volatile(page, offset); +} + +static inline void pagevec_make_volatile(struct pagevec *pvec) +{ + if (page_host_discards()) + __pagevec_make_volatile(pvec); +} + +#else + +#define page_host_discards() (0) +#define page_set_unused(_page,_order) do { } while (0) +#define page_set_stable(_page,_order) do { } while (0) +#define page_set_volatile(_page) do { } while (0) +#define page_set_stable_if_present(_page) (1) +#define page_discarded(_page) (0) + +#define page_test_set_state_change(_page) (0) +#define page_clear_state_change(_page) do { } while (0) +#define page_state_change(_page) (0) + +#define page_free_discarded(_page) (0) + +#define page_make_stable(_page) (1) +#define page_make_volatile(_page, offset) do { } while (0) +#define pagevec_make_volatile(_pagevec) do { } while (0) +#define page_discard(_page) do { } while (0) + +#endif + +#endif /* _LINUX_PAGE_STATES_H */ diff -urpN linux-2.6/mm/filemap.c linux-2.6-patched/mm/filemap.c --- linux-2.6/mm/filemap.c 2007-05-10 09:29:57.000000000 +0200 +++ linux-2.6-patched/mm/filemap.c 2007-05-11 15:52:15.000000000 +0200 @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/cpuset.h> +#include <linux/page-states.h> #include "filemap.h" #include "internal.h" @@ -112,7 +113,7 @@ generic_file_direct_IO(int rw, struct ki * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold a write_lock on the mapping's tree_lock. 
*/ -void __remove_from_page_cache(struct page *page) +void inline __remove_from_page_cache_nocheck(struct page *page) { struct address_space *mapping = page->mapping; @@ -122,6 +123,28 @@ void __remove_from_page_cache(struct pag __dec_zone_page_state(page, NR_FILE_PAGES); } +void __remove_from_page_cache(struct page *page) +{ + /* + * Check if the discard fault handler already removed + * the page from the page cache. If not set the discard + * bit in the page flags to prevent double page free if + * a discard fault is racing with normal page free. + */ + if (TestSetPageDiscarded(page)) + return; + + __remove_from_page_cache_nocheck(page); + + /* + * Check the hardware page state and clear the discard + * bit in the page flags only if the page is not + * discarded. + */ + if (!page_discarded(page)) + ClearPageDiscarded(page); +} + void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; @@ -536,6 +559,7 @@ void fastcall unlock_page(struct page *p if (!TestClearPageLocked(page)) BUG(); smp_mb__after_clear_bit(); + page_make_volatile(page, 1); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -601,6 +625,14 @@ struct page * find_get_page(struct addre if (page) page_cache_get(page); read_unlock_irq(&mapping->tree_lock); + if (page && unlikely(!page_make_stable(page))) { + /* + * The page has been discarded by the host. Run the + * discard handler and return NULL. + */ + page_discard(page); + page = NULL; + } return page; } EXPORT_SYMBOL(find_get_page); @@ -625,7 +657,15 @@ repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); - if (TestSetPageLocked(page)) { + if (unlikely(!page_make_stable(page))) { + /* + * The page has been discarded by the host. Run the + * discard handler and return NULL. 
+ */ + read_unlock_irq(&mapping->tree_lock); + page_discard(page); + return NULL; + } else if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); read_lock_irq(&mapping->tree_lock); @@ -710,11 +750,24 @@ unsigned find_get_pages(struct address_s unsigned int i; unsigned int ret; +repeat: read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) + for (i = 0; i < ret; i++) { page_cache_get(pages[i]); + if (likely(page_make_stable(pages[i]))) + continue; + /* + * Make stable failed, we discard the page and retry the + * whole operation. + */ + read_unlock_irq(&mapping->tree_lock); + page_discard(pages[i]); + while (i--) + page_cache_release(pages[i]); + goto repeat; + } read_unlock_irq(&mapping->tree_lock); return ret; } @@ -769,11 +822,24 @@ unsigned find_get_pages_tag(struct addre unsigned int i; unsigned int ret; +repeat: read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) + for (i = 0; i < ret; i++) { page_cache_get(pages[i]); + if (likely(page_make_stable(pages[i]))) + continue; + /* + * Make stable failed, we discard the page and retry the + * whole operation. 
+ */ + read_unlock_irq(&mapping->tree_lock); + page_discard(pages[i]); + while (i--) + page_cache_release(pages[i]); + goto repeat; + } if (ret) *index = pages[ret - 1]->index + 1; read_unlock_irq(&mapping->tree_lock); @@ -1501,8 +1567,10 @@ page_not_uptodate: error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); - if (PageUptodate(page)) + if (PageUptodate(page)) { + page_make_volatile(page, 2); goto success; + } } else if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto retry_find; diff -urpN linux-2.6/mm/Makefile linux-2.6-patched/mm/Makefile --- linux-2.6/mm/Makefile 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/mm/Makefile 2007-05-11 15:52:15.000000000 +0200 @@ -31,4 +31,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_PAGE_STATES) += page-states.o diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c --- linux-2.6/mm/memory.c 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/mm/memory.c 2007-05-11 15:52:15.000000000 +0200 @@ -50,6 +50,7 @@ #include <linux/delayacct.h> #include <linux/init.h> #include <linux/writeback.h> +#include <linux/page-states.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -480,6 +481,8 @@ copy_one_pte(struct mm_struct *dst_mm, s page = vm_normal_page(vma, addr, pte); if (page) { + if (unlikely(PageDiscarded(page))) + goto out_discard_pte; get_page(page); page_dup_rmap(page); rss[!!PageAnon(page)]++; @@ -487,6 +490,21 @@ copy_one_pte(struct mm_struct *dst_mm, s out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); + return; + +out_discard_pte: + /* + * If the page referred by the pte has the PG_discarded bit set, + * copy_one_pte is racing with page_discard. The pte may not be + * copied or we can end up with a pte pointing to a page not + * in page cache anymore. 
Do what try_to_unmap_one would do + * if the copy_one_pte had taken place before page_discard. + */ + if (page->index != linear_page_index(vma, addr)) + /* If nonlinear, store the file page offset in the pte. */ + set_pte_at(dst_mm, addr, dst_pte, pgoff_to_pte(page->index)); + else + pte_clear(dst_mm, addr, dst_pte); } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -951,6 +969,19 @@ struct page *follow_page(struct vm_area_ if (flags & FOLL_GET) get_page(page); + + if (flags & FOLL_GET) { + /* + * The page is made stable if a reference is acquired. + * If the caller does not get a reference it implies that + * the caller can deal with page faults in case the page + * is swapped out. In this case the caller can deal with + * discard faults as well. + */ + if (unlikely(!page_make_stable(page))) + goto out_discard; + } + if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -974,6 +1005,11 @@ no_page_table: BUG_ON(flags & FOLL_WRITE); } return page; + +out_discard: + pte_unmap_unlock(ptep, ptl); + page_discard(page); + return NULL; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -1685,6 +1721,11 @@ static int do_wp_page(struct mm_struct * dirty_page = old_page; get_page(dirty_page); reuse = 1; + /* + * dirty_page will be set dirty, so it needs to be stable. + */ + if (unlikely(!page_make_stable(dirty_page))) + goto discard; } if (reuse) { @@ -1702,6 +1743,12 @@ static int do_wp_page(struct mm_struct * * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); + /* + * To copy the content of old_page it needs to be stable. + * page_cache_release on old_page will make it volatile again. 
+ */ + if (unlikely(!page_make_stable(old_page))) + goto discard; gotten: pte_unmap_unlock(page_table, ptl); @@ -1770,6 +1817,10 @@ oom: unwritable_page: page_cache_release(old_page); return VM_FAULT_SIGBUS; +discard: + pte_unmap_unlock(page_table, ptl); + page_discard(old_page); + return VM_FAULT_MINOR; } /* @@ -2333,6 +2384,10 @@ retry: if (unlikely(anon_vma_prepare(vma))) goto oom; + if (unlikely(!page_make_stable(new_page))) { + page_discard(new_page); + goto retry; + } page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; diff -urpN linux-2.6/mm/page_alloc.c linux-2.6-patched/mm/page_alloc.c --- linux-2.6/mm/page_alloc.c 2007-05-11 09:18:14.000000000 +0200 +++ linux-2.6-patched/mm/page_alloc.c 2007-05-11 15:52:15.000000000 +0200 @@ -41,6 +41,7 @@ #include <linux/pfn.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> +#include <linux/page-states.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -201,7 +202,8 @@ static void bad_page(struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_buddy ); + 1 << PG_buddy | + 1 << PG_discarded ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -503,6 +505,7 @@ static void __free_pages_ok(struct page reserved += free_pages_check(page + i); if (reserved) return; + page_set_unused(page, order); if (!PageHighMem(page)) debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); @@ -796,10 +799,16 @@ static void fastcall free_hot_cold_page( struct per_cpu_pages *pcp; unsigned long flags; + if (unlikely(PageDiscarded(page))) { + if (page_free_discarded(page)) + return; + } + if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) return; + page_set_unused(page, 0); if (!PageHighMem(page)) debug_check_no_locks_freed(page_address(page), PAGE_SIZE); @@ -890,6 +899,7 @@ again: put_cpu(); VM_BUG_ON(bad_range(zone, page)); + page_set_stable(page, order); if (prep_new_page(page, order, gfp_flags)) goto again; return page; 
@@ -1432,6 +1442,8 @@ void __pagevec_free(struct pagevec *pvec fastcall void __free_pages(struct page *page, unsigned int order) { + if (page_count(page) > 1) + page_make_volatile(page, 2); if (put_page_testzero(page)) { if (order == 0) free_hot_page(page); diff -urpN linux-2.6/mm/page-states.c linux-2.6-patched/mm/page-states.c --- linux-2.6/mm/page-states.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6-patched/mm/page-states.c 2007-05-11 15:52:15.000000000 +0200 @@ -0,0 +1,197 @@ +/* + * mm/page-states.c + * + * (C) Copyright IBM Corp. 2005, 2007 + * + * Guest page hinting functions. + * + * Authors: Martin Schwidefsky <schwidefsky@de.ibm.com> + * Hubertus Franke <frankeh@watson.ibm.com> + * Himanshu Raj <rhim@cc.gatech.edu> + */ + +#include <linux/mm.h> +#include <linux/mm_inline.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> +#include <linux/page-states.h> + +#include "internal.h" + +/* + * Check if there is anything in the page flags or the mapping + * that prevents the page from changing its state to volatile. + */ +static inline int check_bits(struct page *page) +{ + /* + * There are several conditions that prevent a page from becoming + * volatile. The first check is for the page bits. + */ + if (PageDirty(page) || PageReserved(page) || PageWriteback(page) || + PageLocked(page) || PagePrivate(page) || PageDiscarded(page) || + !PageUptodate(page) || !PageLRU(page) || PageAnon(page)) + return 0; + + /* + * If the page has been truncated there is no point in making + * it volatile. It will be freed soon. And if the mapping ever + * had locked pages all pages of the mapping will stay stable. + */ + return page_mapping(page) != NULL; +} + +/* + * Check the reference counter of the page against the number of + * mappings. The caller passes an offset, that is the number of + * extra, known references. 
The page cache itself is one extra + * reference. If the caller acquired an additional reference then + * the offset would be 2. If the page map counter is equal to the + * page count minus the offset then there is no other, unknown + * user of the page in the system. + */ +static inline int check_counts(struct page *page, unsigned int offset) +{ + return page_mapcount(page) + offset == page_count(page); +} + +/* + * Attempts to change the state of a page to volatile. + * If there is something preventing the state change the page stays + * int its current state. + */ +void __page_make_volatile(struct page *page, int offset) +{ + preempt_disable(); + if (!page_test_set_state_change(page)) { + if (check_bits(page) && check_counts(page, offset)) + page_set_volatile(page); + page_clear_state_change(page); + } + preempt_enable(); +} +EXPORT_SYMBOL(__page_make_volatile); + +/* + * Attempts to change the state of a vector of pages to volatile. + * If there is something preventing the state change the page stays + * int its current state. + */ +void __pagevec_make_volatile(struct pagevec *pvec) +{ + struct page *page; + int i = pagevec_count(pvec); + + while (--i >= 0) { + /* + * If we can't get the state change bit just give up. + * The worst that can happen is that the page will stay + * in the stable state although it might be volatile. + */ + page = pvec->pages[i]; + if (!page_test_set_state_change(page)) { + if (check_bits(page) && check_counts(page, 1)) + page_set_volatile(page); + page_clear_state_change(page); + } + } +} +EXPORT_SYMBOL(__pagevec_make_volatile); + +/* + * Attempts to change the state of a page to stable. The host could + * have removed a volatile page, the page_set_stable_if_present call + * can fail. + * + * returns "0" on success and "1" on failure + */ +int __page_make_stable(struct page *page) +{ + /* + * Postpone state change to stable until the state change bit is + * cleared. 
As long as the state change bit is set another cpu + * is in page_make_volatile for this page. That makes sure that + * no caller of make_stable "overtakes" a make_volatile leaving + * the page in volatile where stable is required. + * The caller of make_stable need to make sure that no caller + * of make_volatile can make the page volatile right after + * make_stable has finished. + */ + while (page_state_change(page)) + cpu_relax(); + return page_set_stable_if_present(page); +} +EXPORT_SYMBOL(__page_make_stable); + +/** + * __page_discard() - remove a discarded page from the cache + * + * @page: the page + * + * The page passed to this function needs to be locked. + */ +static void __page_discard(struct page *page) +{ + struct address_space *mapping; + struct zone *zone; + + /* Paranoia checks. */ + VM_BUG_ON(PageWriteback(page)); + VM_BUG_ON(PageDirty(page)); + VM_BUG_ON(PagePrivate(page)); + + /* Set the discarded bit early. */ + if (TestSetPageDiscarded(page)) + return; + + /* Unmap the page from all page tables. */ + page_unmap_all(page); + + /* Check if really all mappers of this page are gone. */ + VM_BUG_ON(page_mapcount(page) != 0); + + /* + * Remove the page from LRU if it is currently added. + * The users of isolate_lru_pages need to check the + * discarded bit before readding the page to the LRU. + */ + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + /* Unlink page from lru. */ + __ClearPageLRU(page); + del_page_from_lru(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + + /* We can't handle swap cache pages (yet). */ + VM_BUG_ON(PageSwapCache(page)); + + /* Remove page from page cache. 
*/ + mapping = page->mapping; + write_lock_irq(&mapping->tree_lock); + __remove_from_page_cache_nocheck(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); +} + +/** + * page_discard() - remove a discarded page from the cache + * + * @page: the page + * + * Before calling this function an additional page reference needs to + * be acquired. This reference is released by the function. + */ +void page_discard(struct page *page) +{ + lock_page(page); + __page_discard(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(page_discard); diff -urpN linux-2.6/mm/page-writeback.c linux-2.6-patched/mm/page-writeback.c --- linux-2.6/mm/page-writeback.c 2007-05-09 09:32:22.000000000 +0200 +++ linux-2.6-patched/mm/page-writeback.c 2007-05-11 15:52:15.000000000 +0200 @@ -33,6 +33,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> #include <linux/pagevec.h> +#include <linux/page-states.h> /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -951,10 +952,12 @@ int test_clear_page_writeback(struct pag write_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); - if (ret) + if (ret) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + page_make_volatile(page, 1); + } write_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-09 09:32:22.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:15.000000000 +0200 @@ -48,6 +48,7 @@ #include <linux/rcupdate.h> #include <linux/module.h> #include <linux/kallsyms.h> +#include <linux/page-states.h> #include <asm/tlbflush.h> @@ -270,13 +271,24 @@ pte_t *page_check_address(struct page *p return NULL; pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); /* Make a quick check before getting the lock */ +#ifndef CONFIG_PAGE_STATES + /* + * If the page table lock for 
this pte is taken we have to + * assume that someone might be mapping the page. To solve + * the race of a page discard vs. mapping the page we have + * to serialize the two operations by taking the lock, + * otherwise we end up with a pte for a page that has been + * removed from page cache by the discard fault handler. + */ + if (!spin_is_locked(ptl)) +#endif if (!pte_present(*pte)) { pte_unmap(pte); return NULL; } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -573,6 +585,7 @@ void page_add_file_rmap(struct page *pag { if (atomic_inc_and_test(&page->_mapcount)) __inc_zone_page_state(page, NR_FILE_MAPPED); + page_make_volatile(page, 1); } /** @@ -621,19 +634,14 @@ void page_remove_rmap(struct page *page, * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) + unsigned long address, int migration) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl); if (!pte) goto out; @@ -698,8 +706,14 @@ static int try_to_unmap_one(struct page set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else #endif + { +#ifdef CONFIG_PAGE_STATES + /* If nonlinear, store the file page offset in the pte. 
*/ + if (page->index != linear_page_index(vma, address)) + set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); +#endif dec_mm_counter(mm, file_rss); - + } page_remove_rmap(page, vma); page_cache_release(page); @@ -803,6 +817,7 @@ static int try_to_unmap_anon(struct page { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned long address; int ret = SWAP_AGAIN; anon_vma = page_lock_anon_vma(page); @@ -810,7 +825,10 @@ static int try_to_unmap_anon(struct page return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, migration); + address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, migration); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -835,6 +853,7 @@ static int try_to_unmap_file(struct page struct vm_area_struct *vma; struct prio_tree_iter iter; int ret = SWAP_AGAIN; + unsigned long address; unsigned long cursor; unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; @@ -842,7 +861,10 @@ static int try_to_unmap_file(struct page spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, migration); + address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, migration); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -943,3 +965,54 @@ int try_to_unmap(struct page *page, int return ret; } +#ifdef CONFIG_PAGE_STATES + +/** + * page_unmap_all - removes all mappings of a page + * + * @page: the page which mapping in the vma should be struck down + * + * the caller needs to hold page lock + */ +void page_unmap_all(struct page* page) +{ + struct address_space *mapping = page_mapping(page); + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + unsigned long address; + int rc; + + 
VM_BUG_ON(!PageLocked(page) || PageReserved(page) || PageAnon(page)); + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + address = vma_address(page, vma); + if (address == -EFAULT) + continue; + rc = try_to_unmap_one(page, vma, address, 0); + VM_BUG_ON(rc == SWAP_FAIL); + } + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto out; + + /* + * Remove the non-linear mappings of the page. This is + * awfully slow, but we have to find that discarded page.. + */ + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + address = vma->vm_start; + while (address < vma->vm_end) { + rc = try_to_unmap_one(page, vma, address, 0); + VM_BUG_ON(rc == SWAP_FAIL); + address += PAGE_SIZE; + } + } + +out: + spin_unlock(&mapping->i_mmap_lock); +} + +#endif diff -urpN linux-2.6/mm/swap.c linux-2.6-patched/mm/swap.c --- linux-2.6/mm/swap.c 2007-05-10 09:29:57.000000000 +0200 +++ linux-2.6-patched/mm/swap.c 2007-05-11 15:52:15.000000000 +0200 @@ -30,6 +30,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/init.h> +#include <linux/page-states.h> /* How many pages do we try to swap or page in/out together? 
*/ int page_cluster; @@ -73,6 +74,16 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +#ifdef CONFIG_PAGE_STATES +void put_page_check(struct page *page) +{ + if (page_count(page) > 1) + page_make_volatile(page, 2); + put_page(page); +} +EXPORT_SYMBOL(put_page_check); +#endif + /** * put_pages_list(): release a list of pages * @@ -337,6 +348,8 @@ void __pagevec_release_nonlru(struct pag struct page *page = pvec->pages[i]; VM_BUG_ON(PageLRU(page)); + if (page_count(page) > 1) + page_make_volatile(page, 2); if (put_page_testzero(page)) pagevec_add(&pages_to_free, page); } @@ -366,6 +379,7 @@ void __pagevec_lru_add(struct pagevec *p VM_BUG_ON(PageLRU(page)); SetPageLRU(page); add_page_to_inactive_list(zone, page); + page_make_volatile(page, 2); } if (zone) spin_unlock_irq(&zone->lru_lock); diff -urpN linux-2.6/mm/vmscan.c linux-2.6-patched/mm/vmscan.c --- linux-2.6/mm/vmscan.c 2007-05-10 09:29:57.000000000 +0200 +++ linux-2.6-patched/mm/vmscan.c 2007-05-11 15:52:15.000000000 +0200 @@ -37,6 +37,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/page-states.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -699,13 +700,20 @@ static unsigned long shrink_inactive_lis */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + /* + * Only readd the page to lru list if it has not + * been discarded. 
+ */ + if (likely(!PageDiscarded(page))) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); + } else + ClearPageActive(page); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -846,13 +854,22 @@ force_reclaim_mapped: while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); + /* + * Only readd the page to lru list if it has not + * been discarded. + */ + if (likely(!PageDiscarded(page))) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); + pgmoved++; + } else { + ClearPageActive(page); + list_del(&page->lru); + } - list_move(&page->lru, &zone->inactive_list); - pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -860,6 +877,7 @@ force_reclaim_mapped: pgmoved = 0; if (buffer_heads_over_limit) pagevec_strip(&pvec); + pagevec_make_volatile(&pvec); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } @@ -869,6 +887,7 @@ force_reclaim_mapped: if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); pagevec_strip(&pvec); + pagevec_make_volatile(&pvec); spin_lock_irq(&zone->lru_lock); } @@ -876,11 +895,21 @@ force_reclaim_mapped: while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - list_move(&page->lru, &zone->active_list); - pgmoved++; + /* + * Only readd the page to lru list if it has not + * been discarded. 
+ */ + if (likely(!PageDiscarded(page))) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + list_move(&page->lru, &zone->active_list); + pgmoved++; + } else { + ClearPageActive(page); + list_del(&page->lru); + } + if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); pgmoved = 0; -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
After way too many months here is the fifth version of the guest page hinting patches. Compared to version four a few improvements have been added: - Avoid page_host_discards() calls outside of page-states.h - The discard list is now implemented via the page_free_discarded hook and architecture specific code. - PG_state_change page flag has been replaced with architecture specific primitives. s390 now uses PG_arch_1 and avoids wasting another page flag (it still uses two additional bits). - Add calls to make pages volatile when pages are moved from the active to the inactive list and set max_buffer_heads to zero to force a try_to_release_page call to get more pages into volatile state. - remap_file_pages now works with guest page hinting, although the discard of a page contained in a non-linear mapping is slow. - Simplified the check in the mlock code. - In general the code looks a bit nicer now. I tried to implement batched state transitions to volatile but after a few failures I gave up. Basically, most pages are made volatile with the unlock_page call after the end of i/o. To postpone a make volatile attempt requires taking a page reference. Trouble is you can't release a page reference from interrupt context. This has to be done in task context, so we can't use a pvec/array to keep the references. There is no room in struct page for a list, so it turns out lazy make volatile is hard to implement. The patches apply on the current git tree. Many thanks go to Oliver Paukstadt who kept me busy with bug reports and uncountable dumps .. -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> Add code to get mlock() working with guest page hinting. The problem with mlock is that locked pages may not be removed from page cache. That means they need to be stable. page_make_volatile needs a way to check if a page has been locked. To avoid traversing vma lists - which would hurt performance a lot - a field is added in the struct address_space. This field is set in mlock_fixup if a vma gets mlocked. The bit never gets removed - once a file had an mlocked vma all future pages added to it will stay stable. The pages of an mlocked area are made present in the linux page table by a call to make_pages_present which calls get_user_pages and follow_page. The follow_page function is called for each page in the mlocked vma, if the VM_LOCKED bit in the vma flags is set the page is made stable. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- include/linux/fs.h | 10 ++++++++++ mm/memory.c | 5 +++-- mm/mlock.c | 2 ++ mm/page-states.c | 5 ++++- mm/rmap.c | 13 +++++++++++-- 5 files changed, 30 insertions(+), 5 deletions(-) diff -urpN linux-2.6/include/linux/fs.h linux-2.6-patched/include/linux/fs.h --- linux-2.6/include/linux/fs.h 2007-05-09 09:32:22.000000000 +0200 +++ linux-2.6-patched/include/linux/fs.h 2007-05-11 15:52:16.000000000 +0200 @@ -450,6 +450,9 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ +#ifdef CONFIG_PAGE_STATES + unsigned int mlocked; /* set if VM_LOCKED vmas present */ +#endif } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -457,6 +460,13 @@ struct address_space { * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. 
*/ +static inline void mapping_set_mlocked(struct address_space *mapping) +{ +#ifdef CONFIG_PAGE_STATES + mapping->mlocked = 1; +#endif +} + struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ struct inode * bd_inode; /* will die */ diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c --- linux-2.6/mm/memory.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/memory.c 2007-05-11 15:52:16.000000000 +0200 @@ -981,9 +981,10 @@ struct page *follow_page(struct vm_area_ if (flags & FOLL_GET) get_page(page); - if (flags & FOLL_GET) { + if ((flags & FOLL_GET) || (vma->vm_flags & VM_LOCKED)) { /* - * The page is made stable if a reference is acquired. + * The page is made stable if a reference is acquired or + * the vm area is locked. * If the caller does not get a reference it implies that * the caller can deal with page faults in case the page * is swapped out. In this case the caller can deal with diff -urpN linux-2.6/mm/mlock.c linux-2.6-patched/mm/mlock.c --- linux-2.6/mm/mlock.c 2007-04-02 17:11:20.000000000 +0200 +++ linux-2.6-patched/mm/mlock.c 2007-05-11 15:52:16.000000000 +0200 @@ -60,6 +60,8 @@ success: */ pages = (end - start) >> PAGE_SHIFT; if (newflags & VM_LOCKED) { + if (vma->vm_file && vma->vm_file->f_mapping) + mapping_set_mlocked(vma->vm_file->f_mapping); pages = -pages; if (!(newflags & VM_IO)) ret = make_pages_present(start, end); diff -urpN linux-2.6/mm/page-states.c linux-2.6-patched/mm/page-states.c --- linux-2.6/mm/page-states.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/page-states.c 2007-05-11 15:52:16.000000000 +0200 @@ -29,6 +29,8 @@ */ static inline int check_bits(struct page *page) { + struct address_space *mapping; + /* * There are several conditions that prevent a page from becoming * volatile. The first check is for the page bits. @@ -44,7 +46,8 @@ static inline int check_bits(struct page * it volatile. It will be freed soon. 
And if the mapping ever * had locked pages all pages of the mapping will stay stable. */ - return page_mapping(page) != NULL; + mapping = page_mapping(page); + return mapping && !mapping->mlocked; } /* diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:16.000000000 +0200 @@ -654,8 +654,17 @@ static int try_to_unmap_one(struct page */ if (!migration && ((vma->vm_flags & VM_LOCKED) || (ptep_clear_flush_young(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; + /* + * Check for discarded pages. This can happen if there have + * been discarded pages before a vma gets mlocked. The code + * in make_pages_present will force all discarded pages out + * and reload them. That happens after the VM_LOCKED bit + * has been set. + */ + if (likely(!PageDiscarded(page))) { + ret = SWAP_FAIL; + goto out_unmap; + } } /* Nuke the page table entry. */ -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.
Martin Schwidefsky
2007-May-11 07:01 UTC
[patch 2/6] Guest page hinting: volatile swap cache.
From: Martin Schwidefsky <schwidefsky@de.ibm.com> From: Hubertus Franke <frankeh@watson.ibm.com> From: Himanshu Raj <rhim@cc.gatech.edu> The volatile page state can be used for anonymous pages as well, if they have been added to the swap cache and the swap write is finished. The tricky bit is in free_swap_and_cache. The call to find_get_page dead-locks with the discard handler. If the page has been discarded find_get_page will try to remove it. To do that it needs the page table lock of all mappers but one is held by the caller of free_swap_and_cache. A special variant of find_get_page is needed that does not check the page state and returns a page reference even if the page is discarded. The second pitfall is that the page needs to be made stable before the swap slot gets freed. If the page cannot be made stable because it has been discarded the swap slot may not be freed because it is still needed to reload the discarded page from the swap device. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> --- include/linux/pagemap.h | 3 ++ include/linux/swap.h | 5 ++++ mm/filemap.c | 19 +++++++++++++++++ mm/memory.c | 13 +++++++++++- mm/page-states.c | 26 ++++++++++++++++-------- mm/rmap.c | 51 ++++++++++++++++++++++++++++++++++++++++++++---- mm/swap_state.c | 25 ++++++++++++++++++++++- mm/swapfile.c | 30 ++++++++++++++++++++++------ mm/vmscan.c | 3 ++ 9 files changed, 154 insertions(+), 21 deletions(-) diff -urpN linux-2.6/include/linux/pagemap.h linux-2.6-patched/include/linux/pagemap.h --- linux-2.6/include/linux/pagemap.h 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/include/linux/pagemap.h 2007-05-11 15:52:16.000000000 +0200 @@ -61,8 +61,11 @@ static inline void mapping_set_gfp_mask( #define page_cache_get(page) get_page(page) #ifdef CONFIG_PAGE_STATES +extern struct page * find_get_page_nodiscard(struct address_space *mapping, + unsigned long index); #define page_cache_release(page) put_page_check(page) #else +#define 
find_get_page_nodiscard(mapping, index) find_get_page(mapping, index) #define page_cache_release(page) put_page(page) #endif void release_pages(struct page **pages, int nr, int cold); diff -urpN linux-2.6/include/linux/swap.h linux-2.6-patched/include/linux/swap.h --- linux-2.6/include/linux/swap.h 2007-04-02 17:11:19.000000000 +0200 +++ linux-2.6-patched/include/linux/swap.h 2007-05-11 15:52:16.000000000 +0200 @@ -228,6 +228,7 @@ extern struct address_space swapper_spac extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, gfp_t); extern void __delete_from_swap_cache(struct page *); +extern void __delete_from_swap_cache_nocheck(struct page *); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); extern int move_from_swap_cache(struct page *, unsigned long, @@ -343,6 +344,10 @@ static inline void __delete_from_swap_ca { } +static inline void __delete_from_swap_cache_nocheck(struct page *page) +{ +} + static inline void delete_from_swap_cache(struct page *page) { } diff -urpN linux-2.6/mm/filemap.c linux-2.6-patched/mm/filemap.c --- linux-2.6/mm/filemap.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/filemap.c 2007-05-11 15:52:16.000000000 +0200 @@ -507,6 +507,25 @@ static int __sleep_on_page_lock(void *wo return 0; } +#ifdef CONFIG_PAGE_STATES + +struct page * find_get_page_nodiscard(struct address_space *mapping, + unsigned long offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page) + page_cache_get(page); + read_unlock_irq(&mapping->tree_lock); + return page; +} + +EXPORT_SYMBOL(find_get_page_nodiscard); + +#endif + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. 
By using a hash table of diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c --- linux-2.6/mm/memory.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/memory.c 2007-05-11 15:52:16.000000000 +0200 @@ -500,7 +500,18 @@ out_discard_pte: * in page cache anymore. Do what try_to_unmap_one would do * if the copy_one_pte had taken place before page_discard. */ - if (page->index != linear_page_index(vma, addr)) + if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(page) }; + swap_duplicate(entry); + if (list_empty(&dst_mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + pte = swp_entry_to_pte(entry); + set_pte_at(dst_mm, addr, dst_pte, pte); + } else if (page->index != linear_page_index(vma, addr)) /* If nonlinear, store the file page offset in the pte. */ set_pte_at(dst_mm, addr, dst_pte, pgoff_to_pte(page->index)); else diff -urpN linux-2.6/mm/page-states.c linux-2.6-patched/mm/page-states.c --- linux-2.6/mm/page-states.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/page-states.c 2007-05-11 15:52:16.000000000 +0200 @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> #include <linux/pagevec.h> #include <linux/page-states.h> +#include <linux/swap.h> #include "internal.h" @@ -34,7 +35,8 @@ static inline int check_bits(struct page */ if (PageDirty(page) || PageReserved(page) || PageWriteback(page) || PageLocked(page) || PagePrivate(page) || PageDiscarded(page) || - !PageUptodate(page) || !PageLRU(page) || PageAnon(page)) + !PageUptodate(page) || !PageLRU(page) || + (PageAnon(page) && !PageSwapCache(page))) return 0; /* @@ -168,15 +170,21 @@ static void __page_discard(struct page * } spin_unlock_irq(&zone->lru_lock); - /* We can't handle swap cache pages (yet). */ - VM_BUG_ON(PageSwapCache(page)); - - /* Remove page from page cache. */ + /* Remove page from page cache/swap cache. 
*/ mapping = page->mapping; - write_lock_irq(&mapping->tree_lock); - __remove_from_page_cache_nocheck(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + if (PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + write_lock_irq(&swapper_space.tree_lock); + __delete_from_swap_cache_nocheck(page); + write_unlock_irq(&swapper_space.tree_lock); + swap_free(entry); + page_cache_release(page); + } else { + write_lock_irq(&mapping->tree_lock); + __remove_from_page_cache_nocheck(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); + } } /** diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c --- linux-2.6/mm/rmap.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/rmap.c 2007-05-11 15:52:16.000000000 +0200 @@ -557,6 +557,7 @@ void page_add_anon_rmap(struct page *pag if (atomic_inc_and_test(&page->_mapcount)) __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ + page_make_volatile(page, 1); } /* @@ -968,13 +969,13 @@ int try_to_unmap(struct page *page, int #ifdef CONFIG_PAGE_STATES /** - * page_unmap_all - removes all mappings of a page + * page_unmap_file - removes all mappings of a file page * * @page: the page which mapping in the vma should be struck down * * the caller needs to hold page lock */ -void page_unmap_all(struct page* page) +static void page_unmap_file(struct page* page) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -983,8 +984,6 @@ void page_unmap_all(struct page* page) unsigned long address; int rc; - VM_BUG_ON(!PageLocked(page) || PageReserved(page) || PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { address = vma_address(page, vma); @@ -1015,4 +1014,48 @@ out: spin_unlock(&mapping->i_mmap_lock); } +/** + * page_unmap_anon - removes all mappings of an anonymous page + * + * @page: the page which 
mapping in the vma should be struck down + * + * the caller needs to hold page lock + */ +static void page_unmap_anon(struct page* page) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long address; + int rc; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + address = vma_address(page, vma); + if (address == -EFAULT) + continue; + rc = try_to_unmap_one(page, vma, address, 0); + VM_BUG_ON(rc == SWAP_FAIL); + } + page_unlock_anon_vma(anon_vma); +} + +/** + * page_unmap_all - removes all mappings of a page + * + * @page: the page which mapping in the vma should be struck down + * + * the caller needs to hold page lock + */ +void page_unmap_all(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageReserved(page)); + + if (PageAnon(page)) + page_unmap_anon(page); + else + page_unmap_file(page); +} + #endif diff -urpN linux-2.6/mm/swapfile.c linux-2.6-patched/mm/swapfile.c --- linux-2.6/mm/swapfile.c 2007-05-08 09:31:18.000000000 +0200 +++ linux-2.6-patched/mm/swapfile.c 2007-05-11 15:52:16.000000000 +0200 @@ -27,6 +27,7 @@ #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> +#include <linux/page-states.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -369,9 +370,11 @@ int remove_exclusive_swap_page(struct pa /* Recheck the page count with the swapcache lock held.. 
*/ write_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; + if (likely(page_make_stable(page))) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } } write_unlock_irq(&swapper_space.tree_lock); } @@ -400,7 +403,13 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { - page = find_get_page(&swapper_space, entry.val); + /* + * Use find_get_page_nodiscard to avoid the deadlock + * on the swap_lock and the page table lock if the + * page has been discarded. + */ + page = find_get_page_nodiscard(&swapper_space, + entry.val); if (page && unlikely(TestSetPageLocked(page))) { page_cache_release(page); page = NULL; @@ -417,8 +426,17 @@ void free_swap_and_cache(swp_entry_t ent /* Also recheck PageSwapCache after page is locked (above) */ if (PageSwapCache(page) && !PageWriteback(page) && (one_user || vm_swap_full())) { - delete_from_swap_cache(page); - SetPageDirty(page); + /* + * To be able to reload the page from swap, the + * swap slot must not be freed. The caller of + * free_swap_and_cache holds a page table lock + * for this page. The discarded page cannot be + * removed here. + */ + if (likely(page_make_stable(page))) { + delete_from_swap_cache(page); + SetPageDirty(page); + } } unlock_page(page); page_cache_release(page); diff -urpN linux-2.6/mm/swap_state.c linux-2.6-patched/mm/swap_state.c --- linux-2.6/mm/swap_state.c 2007-04-02 17:11:20.000000000 +0200 +++ linux-2.6-patched/mm/swap_state.c 2007-05-11 15:52:16.000000000 +0200 @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/migrate.h> +#include <linux/page-states.h> #include <asm/pgtable.h> @@ -121,7 +122,7 @@ static int add_to_swap_cache(struct page * This must be called only on pages that have * been verified to be in the swap cache. 
*/ -void __delete_from_swap_cache(struct page *page) +void inline __delete_from_swap_cache_nocheck(struct page *page) { BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); @@ -136,6 +137,28 @@ void __delete_from_swap_cache(struct pag INC_CACHE_INFO(del_total); } +void __delete_from_swap_cache(struct page *page) +{ + /* + * Check if the discard fault handler already removed + * the page from the page cache. If not, set the discard + * bit in the page flags to prevent double page free if + * a discard fault is racing with normal page free. + */ + if (TestSetPageDiscarded(page)) + return; + + __delete_from_swap_cache_nocheck(page); + + /* + * Check the hardware page state and clear the discard + * bit in the page flags only if the page is not + * discarded. + */ + if (!page_discarded(page)) + ClearPageDiscarded(page); +} + /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap diff -urpN linux-2.6/mm/vmscan.c linux-2.6-patched/mm/vmscan.c --- linux-2.6/mm/vmscan.c 2007-05-11 15:52:16.000000000 +0200 +++ linux-2.6-patched/mm/vmscan.c 2007-05-11 15:52:16.000000000 +0200 @@ -470,6 +470,9 @@ static unsigned long shrink_page_list(st sc->nr_scanned++; + if (unlikely(PageDiscarded(page))) + goto free_it; + if (!sc->may_swap && page_mapped(page)) goto keep_locked; -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.