This patch supports exchanging a page of a suspended PV guest from user space. The basic idea of offlining a page is:

1) Mark the page offline-pending.
2) If the page is owned by an HVM domain, the user has to live-migrate the guest. In the future, with stub-domain support, we can also exchange the page without migration.
3) If the page is owned by a PV domain, we try to exchange the offline-pending page for a new one and free the old page.

This patch implements item 3. The method used to exchange the offline-pending page of a PV domain is:

1) Suspend the guest.
2) If the page is currently granted out, return with the page left offline-pending.
3) Take a copy of the page contents.
4) Scan all page-table pages for references to the offending page; for every hit, make the entry non-present to drop the reference count.
5) After all page tables have been updated, the user-space tool tries to exchange the old page. If the old page has no references left (i.e. count_info & count_mask == 1), the exchange allocates a new page, updates the M2P table and returns success; otherwise it returns failure.
6) If step 5 succeeded, the user-space tool copies the saved contents into the new page, updates the P2M table, and changes all entries recorded in step 4 to point to the new page. If step 5 failed, it undoes step 4 to restore the page tables.
7) Resume the guest.

Please refer to the thread at http://www.mailinglistarchive.com/xen-devel@lists.xensource.com/msg63084.html for more information.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
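
For illustration, here is a minimal caller sketch (not part of the patch) of how a management tool might drive the sequence above. It assumes the guest has already been suspended by the toolstack, and that the xc_mark_page_offline() helper from earlier in this series takes the same (start, end, status) arguments as xc_mark_page_online(); error reporting is elided:

#include <stdint.h>
#include "xenguest.h"

/* Returns 0 once 'mfn' has been replaced under the suspended PV guest. */
static int offline_one_page(int xc_handle, int domid, xen_pfn_t mfn)
{
    uint32_t status;

    /* Step 1 of the offline flow; helper assumed from this series. */
    if (xc_mark_page_offline(xc_handle, mfn, mfn, &status))
        return -1;

    /* Steps 1-7 above: copy, unmap, exchange and remap the page. */
    return xc_exchange_page(xc_handle, domid, mfn);
}

On failure, xc_exchange_page() returns -1 after restoring the original mappings where possible, or -2 when the revert itself failed and the guest may be inconsistent.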
diff -r 79c875126e1c tools/libxc/xc_offline_page.c
--- a/tools/libxc/xc_offline_page.c	Sun May 31 04:55:34 2009 +0800
+++ b/tools/libxc/xc_offline_page.c	Sun May 31 04:59:18 2009 +0800
@@ -12,11 +12,41 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/time.h>
+#include <xs.h>
+#include <xc_core.h>
 
 #include "xc_private.h"
 #include "xc_dom.h"
 #include "xg_private.h"
 #include "xg_save_restore.h"
+
+struct domain_mem_info{
+    int domid;
+    unsigned int pt_level;
+    unsigned int guest_width;
+    uint32_t *pfn_type;
+    xen_pfn_t *p2m_table;
+    unsigned long p2m_size;
+    xen_pfn_t *m2p_table;
+    int max_mfn;
+};
+
+struct pte_backup_entry
+{
+    xen_pfn_t table_mfn;
+    int offset;
+};
+
+#define DEFAULT_BACKUP_COUNT 1024
+struct pte_backup
+{
+    struct pte_backup_entry *entries;
+    int max;
+    int cur;
+};
+
+/* Globals referenced by the macros in xg_save_restore.h */
+int guest_width, p2m_size;
 
 int xc_mark_page_online(int xc, unsigned long start,
                         unsigned long end, uint32_t *status)
@@ -98,3 +128,637 @@ int xc_query_page_offline_status(int xc,
 
     return ret;
 }
+
+/*
+ * There should be no updates to the grant table while the domain
+ * is paused.
+ */
+static int xc_is_page_granted(int xc_handle, xen_pfn_t gpfn,
+                              struct grant_entry *gnttab, int gnt_num)
+{
+    int i = 0;
+
+    if (!gnttab)
+        return 0;
+
+    for (i = 0; i < gnt_num; i++)
+        if ( ((gnttab[i].flags & GTF_type_mask) != GTF_invalid) &&
+             (gnttab[i].frame == gpfn) )
+            break;
+
+    return (i != gnt_num);
+}
+
+static xen_pfn_t pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m, int gwidth)
+{
+    return ((xen_pfn_t) ((gwidth==8)?
+                         (((uint64_t *)p2m)[(pfn)]):
+                         ((((uint32_t *)p2m)[(pfn)]) == 0xffffffffU ?
+                          (-1UL) :
+                          (((uint32_t *)p2m)[(pfn)]))));
+}
+
+static int get_pt_level(int xc_handle, uint32_t domid,
+                        unsigned int *pt_level,
+                        unsigned int *gwidth)
+{
+    DECLARE_DOMCTL;
+    xen_capabilities_info_t xen_caps = "";
+
+    if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
+        return -1;
+
+    memset(&domctl, 0, sizeof(domctl));
+    domctl.domain = domid;
+    domctl.cmd = XEN_DOMCTL_get_address_size;
+
+    if ( do_domctl(xc_handle, &domctl) != 0 )
+        return -1;
+
+    *gwidth = domctl.u.address_size.size / 8;
+
+    if (strstr(xen_caps, "xen-3.0-x86_64"))
+        /* Depends on whether it's a compat 32-on-64 guest */
+        *pt_level = ( (*gwidth == 8) ? 4 : 3 );
+    else if (strstr(xen_caps, "xen-3.0-x86_32p"))
+        *pt_level = 3;
+    else if (strstr(xen_caps, "xen-3.0-x86_32"))
+        *pt_level = 2;
+    else
+        return -1;
+
+    return 0;
+}
+
+static int close_mem_info(int xc_handle, struct domain_mem_info *minfo)
+{
+    if (minfo->pfn_type)
+        free(minfo->pfn_type);
+    munmap(minfo->m2p_table, M2P_SIZE(minfo->max_mfn));
+    munmap(minfo->p2m_table, P2M_FLL_ENTRIES * PAGE_SIZE);
+    minfo->p2m_table = minfo->m2p_table = NULL;
+
+    return 0;
+}
+
+static int init_mem_info(int xc_handle, int domid,
+                         struct domain_mem_info *minfo,
+                         xc_dominfo_t *info)
+{
+    uint64_aligned_t shared_info_frame;
+    shared_info_any_t *live_shinfo = NULL;
+    int i, rc;
+
+    /* Must only be initialized once */
+    if (minfo->pfn_type || minfo->m2p_table || minfo->p2m_table)
+        return -EINVAL;
+
+    if ( get_pt_level(xc_handle, domid, &minfo->pt_level,
+                      &minfo->guest_width) )
+    {
+        ERROR("Unable to get PT level info.");
+        return -EFAULT;
+    }
+    guest_width = minfo->guest_width;
+
+    shared_info_frame = info->shared_info_frame;
+
+    live_shinfo = xc_map_foreign_range(xc_handle, domid,
+                     PAGE_SIZE, PROT_READ, shared_info_frame);
+    if ( !live_shinfo )
+    {
+        ERROR("Couldn't map live_shinfo");
+        return -EFAULT;
+    }
+
+    if ( (rc = xc_core_arch_map_p2m_writable(xc_handle, minfo->guest_width,
+              info, live_shinfo, &minfo->p2m_table, &minfo->p2m_size)) )
+    {
+        ERROR("Couldn't map p2m table %x\n", rc);
+        goto failed;
+    }
+    munmap(live_shinfo, PAGE_SIZE);
+    live_shinfo = NULL;
+
+    p2m_size = minfo->p2m_size;
+
+    minfo->max_mfn = xc_memory_op(xc_handle, XENMEM_maximum_ram_page, NULL);
+    if ( !(minfo->m2p_table =
+            xc_map_m2p(xc_handle, minfo->max_mfn, PROT_READ, NULL)) )
+    {
+        ERROR("Failed to map live M2P table");
+        goto failed;
+    }
+
+    /* Get pfn type */
+    minfo->pfn_type = malloc(sizeof(uint32_t) * minfo->p2m_size);
+    if (!minfo->pfn_type)
+    {
+        ERROR("Failed to malloc pfn_type\n");
+        goto failed;
+    }
+    memset(minfo->pfn_type, 0, sizeof(uint32_t) * minfo->p2m_size);
+
+    /* Seed the array with mfns; the batch call rewrites them as types. */
+    for (i = 0; i < minfo->p2m_size; i++)
+        minfo->pfn_type[i] = pfn_to_mfn(i, minfo->p2m_table,
+                                        minfo->guest_width);
+
+    if ( lock_pages(minfo->pfn_type, minfo->p2m_size * sizeof(uint32_t)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto failed;
+    }
+
+    for (i = 0; i < minfo->p2m_size; i += 1024)
+    {
+        int count = ((p2m_size - i) > 1024) ? 1024 : (p2m_size - i);
+        if ( (rc = xc_get_pfn_type_batch(xc_handle, domid, count,
+                                         minfo->pfn_type + i)) )
+        {
+            ERROR("Failed to get pfn_type %x\n", rc);
+            goto unlock;
+        }
+    }
+    return 0;
+
+unlock:
+    unlock_pages(minfo->pfn_type, minfo->p2m_size * sizeof(uint32_t));
+failed:
+    if (minfo->pfn_type)
+    {
+        free(minfo->pfn_type);
+        minfo->pfn_type = NULL;
+    }
+    if (live_shinfo)
+        munmap(live_shinfo, PAGE_SIZE);
+    munmap(minfo->m2p_table, M2P_SIZE(minfo->max_mfn));
+    munmap(minfo->p2m_table, P2M_FLL_ENTRIES * PAGE_SIZE);
+    minfo->p2m_table = minfo->m2p_table = NULL;
+
+    return -1;
+}
+
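+/*
+ * Remember the location (page-table mfn, entry offset) of every PTE we
+ * clear, so that the clearing can be reverted if the exchange fails.
+ * The array starts with DEFAULT_BACKUP_COUNT entries and doubles on
+ * demand.
+ */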
+static int backup_ptes(xen_pfn_t table_mfn, int offset,
+                       struct pte_backup *backup)
+{
+    if (!backup)
+        return -EINVAL;
+
+    if (backup->max == backup->cur)
+    {
+        struct pte_backup_entry *new_entries;
+
+        new_entries = realloc(backup->entries,
+                    backup->max * 2 * sizeof(struct pte_backup_entry));
+        if (new_entries == NULL)
+            return -1;
+        backup->entries = new_entries;
+        backup->max *= 2;
+    }
+
+    backup->entries[backup->cur].table_mfn = table_mfn;
+    backup->entries[backup->cur++].offset = offset;
+
+    return 0;
+}
+
+/*
+ * return:
+ * 1 when an MMU update is required
+ * 0 when no change is needed
+ * <0 on error
+ */
+typedef int (*pte_func)(uint64_t pte, uint64_t *new_pte,
+                        unsigned long table_mfn, int table_offset,
+                        struct pte_backup *backup,
+                        unsigned long data);
+
+static int __clear_pte(uint64_t pte, uint64_t *new_pte,
+                       unsigned long table_mfn, int table_offset,
+                       struct pte_backup *backup,
+                       unsigned long mfn)
+{
+    /* If no new_pte pointer, same as no changes needed */
+    if (!new_pte || !backup)
+        return -EINVAL;
+
+    if ( !(pte & _PAGE_PRESENT))
+        return 0;
+
+    /* XXX Check for the PSE bit here */
+    /* Hit one entry */
+    if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
+    {
+        *new_pte = pte & ~_PAGE_PRESENT;
+        if (!backup_ptes(table_mfn, table_offset, backup))
+            return 1;
+    }
+
+    return 0;
+}
+
+static int __update_pte(uint64_t pte, uint64_t *new_pte,
+                        unsigned long table_mfn, int table_offset,
+                        struct pte_backup *backup,
+                        unsigned long new_mfn)
+{
+    int index;
+
+    if (!new_pte)
+        return 0;
+
+    for (index = 0; index < backup->cur; index++)
+        if ( (backup->entries[index].table_mfn == table_mfn) &&
+             (backup->entries[index].offset == table_offset) )
+            break;
+
+    if (index != backup->cur)
+    {
+        if (pte & _PAGE_PRESENT)
+            ERROR("Page present while in backup ptes\n");
+        pte &= ~MFN_MASK_X86;
+        pte |= (new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
+        *new_pte = pte;
+        return 1;
+    }
+
+    return 0;
+}
+
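+/*
+ * Walk every page-table page of the guest and apply 'func' to each of
+ * its entries.  Updates are queued on 'mmu' and flushed as one batch,
+ * so the hypercall cost is amortised over all modified PTEs.
+ */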
+static int change_pte(int xc_handle, int domid,
+                      struct domain_mem_info *minfo,
+                      struct pte_backup *backup,
+                      struct xc_mmu *mmu,
+                      pte_func func,
+                      unsigned long data)
+{
+    int pte_num, rc;
+    uint64_t i;
+    void *content = NULL;
+
+    pte_num = PAGE_SIZE / ((minfo->pt_level == 2) ? 4 : 8);
+
+    for (i = 0; i < minfo->p2m_size; i++)
+    {
+        xen_pfn_t table_mfn = pfn_to_mfn(i, minfo->p2m_table,
+                                         minfo->guest_width);
+        uint64_t pte, new_pte;
+        int j;
+
+        if ( table_mfn == INVALID_P2M_ENTRY )
+            continue;
+
+        if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+            content = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                           PROT_READ, table_mfn);
+            if (!content)
+                goto failed;
+
+            for (j = 0; j < pte_num; j++)
+            {
+                if ( minfo->pt_level == 2 )
+                    pte = ((const uint32_t*)content)[j];
+                else
+                    pte = ((const uint64_t*)content)[j];
+
+                rc = func(pte, &new_pte, table_mfn, j, backup, data);
+
+                switch (rc)
+                {
+                case 1:
+                    if ( xc_add_mmu_update(xc_handle, mmu,
+                          table_mfn << PAGE_SHIFT |
+                          j * ( (minfo->pt_level == 2) ?
+                                sizeof(uint32_t) : sizeof(uint64_t)) |
+                          MMU_PT_UPDATE_PRESERVE_AD,
+                          new_pte) )
+                        goto failed;
+                    break;
+
+                case 0:
+                    break;
+
+                default:
+                    goto failed;
+                }
+            }
+
+            munmap(content, PAGE_SIZE);
+            content = NULL;
+        }
+    }
+
+    if ( xc_flush_mmu_updates(xc_handle, mmu) )
+        goto failed;
+
+    return 0;
+failed:
+    /* XXX Shall we take any action if we failed to swap? */
+    if (content)
+        munmap(content, PAGE_SIZE);
+
+    return -1;
+}
+
+static int update_pte(int xc_handle, int domid,
+                      struct domain_mem_info *minfo,
+                      struct pte_backup *backup,
+                      struct xc_mmu *mmu,
+                      unsigned long new_mfn)
+{
+    return change_pte(xc_handle, domid, minfo, backup, mmu,
+                      __update_pte, new_mfn);
+}
+
+static int clear_pte(int xc_handle, int domid,
+                     struct domain_mem_info *minfo,
+                     struct pte_backup *backup,
+                     struct xc_mmu *mmu,
+                     xen_pfn_t mfn)
+{
+    return change_pte(xc_handle, domid, minfo, backup, mmu,
+                      __clear_pte, mfn);
+}
+
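+/*
+ * Ask the hypervisor to swap 'mfn' for a freshly allocated frame via
+ * XENMEM_exchange.  This can only succeed once the old frame's
+ * reference count has dropped to the allocation reference alone, i.e.
+ * after every PTE pointing at it has been cleared.
+ */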
+static int exchange_page(int xc_handle, xen_pfn_t mfn,
+                         xen_pfn_t *new_mfn, int domid)
+{
+    int rc;
+    xen_pfn_t out_mfn;
+
+    struct xen_memory_exchange exchange = {
+        .in = {
+            .nr_extents = 1,
+            .extent_order = 0,
+            .domid = domid
+        },
+        .out = {
+            .nr_extents = 1,
+            .extent_order = 0,
+            .domid = domid
+        }
+    };
+    set_xen_guest_handle(exchange.in.extent_start, &mfn);
+    set_xen_guest_handle(exchange.out.extent_start, &out_mfn);
+
+    rc = xc_memory_op(xc_handle, XENMEM_exchange, &exchange);
+
+    if (!rc)
+        *new_mfn = out_mfn;
+
+    return rc;
+}
+
+/*
+ * Check whether a page can be exchanged successfully
+ */
+static int is_page_exchangable(int xc_handle, int domid, xen_pfn_t mfn,
+                               xc_dominfo_t *info)
+{
+    uint32_t status;
+    int rc;
+
+    /* domain checking */
+    if ( !domid || (domid > DOMID_FIRST_RESERVED) )
+    {
+        DPRINTF("Dom0's pages can't be exchanged live\n");
+        return 0;
+    }
+
+    if (info->hvm)
+    {
+        DPRINTF("Currently we can only live change a PV guest's page\n");
+        return 0;
+    }
+
+    /* Check whether the page is offline pending */
+    rc = xc_query_page_offline_status(xc_handle, mfn, mfn, &status);
+
+    if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) )
+    {
+        ERROR("Page %lx is not offline pending %x\n",
+              mfn, status);
+        return 0;
+    }
+
+    return 1;
+}
+
+/* The domain should be suspended when called here */
+int xc_exchange_page(int xc_handle, int domid, xen_pfn_t mfn)
+{
+    xc_dominfo_t info;
+    struct domain_mem_info minfo;
+    struct xc_mmu *mmu = NULL;
+    struct pte_backup old_ptes = {NULL, 0, 0};
+    struct grant_entry *gnttab = NULL;
+    struct mmuext_op mops;
+    int gnt_num, unpinned = 0;
+    void *old_p, *backup = NULL;
+    int rc, result = -1;
+    uint32_t status;
+    xen_pfn_t new_mfn, gpfn;
+
+    if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return -EFAULT;
+    }
+
+    if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend)
+    {
+        ERROR("Can't exchange page unless the domain is suspended\n");
+        return -EINVAL;
+    }
+
+    if (!is_page_exchangable(xc_handle, domid, mfn, &info))
+    {
+        ERROR("Could not exchange page\n");
+        return -EINVAL;
+    }
+
+    /* Get the domain's memory information */
+    memset(&minfo, 0, sizeof(minfo));
+    if (init_mem_info(xc_handle, domid, &minfo, &info))
+    {
+        ERROR("Could not initialize memory info\n");
+        return -EFAULT;
+    }
+    gpfn = minfo.m2p_table[mfn];
+
+    /* Don't exchange CR3 for a PAE guest in a PAE host environment */
+    if (minfo.guest_width > sizeof(long))
+    {
+        if ( (minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
+             XEN_DOMCTL_PFINFO_L3TAB )
+            goto failed;
+    }
+
+    gnttab = xc_gnttab_map_table(xc_handle, domid, &gnt_num);
+    if (!gnttab)
+    {
+        ERROR("Failed to map grant table\n");
+        goto failed;
+    }
+
+    if (xc_is_page_granted(xc_handle, mfn, gnttab, gnt_num))
+    {
+        ERROR("Page %lx is granted now\n", mfn);
+        goto failed;
+    }
+
+    /* Allocate the required data structures */
+    backup = malloc(PAGE_SIZE);
+    if (!backup)
+    {
+        ERROR("Failed to allocate backup pages pointer\n");
+        goto failed;
+    }
+
+    old_ptes.max = DEFAULT_BACKUP_COUNT;
+    old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
+                              DEFAULT_BACKUP_COUNT);
+
+    if (!old_ptes.entries)
+    {
+        ERROR("Failed to allocate backup\n");
+        goto failed;
+    }
+    old_ptes.cur = 0;
+
+    /* Unpin the page if it is pinned */
+    if (minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LPINTAB)
+    {
+        mops.cmd = MMUEXT_UNPIN_TABLE;
+        mops.arg1.mfn = mfn;
+
+        if ( xc_mmuext_op(xc_handle, &mops, 1, domid) < 0 )
+        {
+            ERROR("Failed to unpin page %lx", mfn);
+            goto failed;
+        }
+        /* Default to repinning the old mfn; overwritten on success. */
+        mops.arg1.mfn = mfn;
+        unpinned = 1;
+    }
+
+    /* Back up the page contents */
+    old_p = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                 PROT_READ, mfn);
+    if (!old_p)
+    {
+        ERROR("Failed to map foreign page %lx\n", mfn);
+        goto failed;
+    }
+
+    memcpy(backup, old_p, PAGE_SIZE);
+    munmap(old_p, PAGE_SIZE);
+
+    mmu = xc_alloc_mmu_updates(xc_handle, domid);
+    if ( mmu == NULL )
+    {
+        ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
+        goto failed;
+    }
+
+    /* First make all PTEs non-present to remove the references */
+    rc = clear_pte(xc_handle, domid, &minfo, &old_ptes, mmu, mfn);
+
+    if (rc)
+    {
+        ERROR("clear pte failed\n");
+        goto failed;
+    }
+
+    rc = exchange_page(xc_handle, mfn, &new_mfn, domid);
+
+    if (rc)
+    {
+        ERROR("Exchanging the page failed\n");
+        /* An exchange failure means references to the page remain */
+        rc = update_pte(xc_handle, domid, &minfo, &old_ptes, mmu, mfn);
+        if (rc)
+            result = -2;
+        goto failed;
+    }
+
+    rc = update_pte(xc_handle, domid, &minfo, &old_ptes, mmu, new_mfn);
+
+    if (rc)
+    {
+        ERROR("update pte failed; the guest may be broken now\n");
+        /* No recovery action yet for a swap failure */
+        result = -2;
+        goto failed;
+    }
+
+    /* Check whether the page has been offlined already */
+    rc = xc_query_page_offline_status(xc_handle, mfn, mfn,
+                                      &status);
+
+    if (rc)
+    {
+        ERROR("Failed to query offline status\n");
+    }
+    else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
+    {
+        ERROR("Page is still online or pending\n");
+        goto failed;
+    }
+    else
+    {
+        void *new_p;
+        IPRINTF("Now page %lx is offlined\n", mfn);
+        /* Update the p2m table */
+        minfo.p2m_table[gpfn] = new_mfn;
+
+        new_p = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+                                     PROT_READ|PROT_WRITE, new_mfn);
+        memcpy(new_p, backup, PAGE_SIZE);
+        munmap(new_p, PAGE_SIZE);
+        mops.arg1.mfn = new_mfn;
+        result = 0;
+    }
+
+failed:
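+    /*
+     * If the page was unpinned above, pin it again at the level it had
+     * before, so the guest's view of its page tables is unchanged.  On
+     * success mops.arg1.mfn already points at the new frame; otherwise
+     * it still points at the old one.
+     */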
+    if (unpinned && (minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LPINTAB))
+    {
+        switch ( minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+        case XEN_DOMCTL_PFINFO_L1TAB:
+            mops.cmd = MMUEXT_PIN_L1_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L2TAB:
+            mops.cmd = MMUEXT_PIN_L2_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L3TAB:
+            mops.cmd = MMUEXT_PIN_L3_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L4TAB:
+            mops.cmd = MMUEXT_PIN_L4_TABLE;
+            break;
+
+        default:
+            ERROR("Unpinned a non-page-table page\n");
+            break;
+        }
+
+        if ( xc_mmuext_op(xc_handle, &mops, 1, domid) < 0 )
+        {
+            ERROR("Failed to pin the mfn again\n");
+            result = -2;
+        }
+    }
+
+    if (mmu)
+        free(mmu);
+
+    if (old_ptes.entries)
+        free(old_ptes.entries);
+
+    if (backup)
+        free(backup);
+
+    if (gnttab)
+        munmap(gnttab, gnt_num / (PAGE_SIZE / sizeof(struct grant_entry))
+                       * PAGE_SIZE);
+
+    close_mem_info(xc_handle, &minfo);
+
+    return result;
+}
diff -r 79c875126e1c tools/libxc/xc_suspend.c
--- a/tools/libxc/xc_suspend.c	Sun May 31 04:55:34 2009 +0800
+++ b/tools/libxc/xc_suspend.c	Sun May 31 04:57:30 2009 +0800
@@ -110,7 +110,7 @@ int xc_suspend_evtchn_init(int xc, int x
     return suspend_evtchn;
 
 cleanup:
-    if (suspend_evtchn > 0)
+    if (suspend_evtchn != -1)
         xc_suspend_evtchn_release(xce, suspend_evtchn);
 
     return -1;
diff -r 79c875126e1c tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h	Sun May 31 04:55:34 2009 +0800
+++ b/tools/libxc/xenguest.h	Sun May 31 04:57:30 2009 +0800
@@ -163,6 +163,8 @@ int xc_query_page_offline_status(int xc,
 int xc_query_page_offline_status(int xc, unsigned long start,
                                  unsigned long end, uint32_t *status);
 
+int xc_exchange_page(int xc_handle, int domid, xen_pfn_t mfn);
+
 /**
  * This function map m2p table