Zhai, Edwin
2007-Jan-11 14:11 UTC
[Xen-devel] [PATCH 6/8] HVM save restore: guest memory handling
[PATCH 6/8] HVM save restore: guest memory handling

Add support for saving/restoring HVM guest memory.

Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>

diff -r bb1c450b2739 tools/libxc/xc_hvm_restore.c
--- a/tools/libxc/xc_hvm_restore.c      Thu Jan 11 21:03:11 2007 +0800
+++ b/tools/libxc/xc_hvm_restore.c      Thu Jan 11 21:05:45 2007 +0800
@@ -31,6 +31,40 @@
 #include <xen/hvm/ioreq.h>
 #include <xen/hvm/params.h>
 #include <xen/hvm/e820.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+/* Read 'count' bytes, retrying on EINTR and short reads; 1 on success, 0 on error/EOF. */
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0)
+            break;
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
 
 int xc_hvm_restore(int xc_handle, int io_fd,
                    uint32_t dom, unsigned long nr_pfns,
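A note on the helper above: read_exact() returns 1 on success and 0 on error or premature EOF, so every caller below tests failure as !read_exact(...). As an illustration only (not part of the patch; read_record and expected_len are made-up names), the length-prefixed records used later in the restore path are consumed like this:

    /* Illustration only -- not part of the patch. Reads one
     * length-prefixed record the way xc_hvm_restore() reads the hvm
     * context below: a uint32_t size first, then the payload itself. */
    static int read_record(int io_fd, void *rec, uint32_t expected_len)
    {
        uint32_t len;

        if (!read_exact(io_fd, &len, sizeof(len)))
            return -1;              /* I/O error or premature EOF */
        if (len != expected_len)
            return -1;              /* size mismatch: incompatible image */
        return read_exact(io_fd, rec, len) ? 0 : -1;
    }
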
@@ -38,5 +72,289 @@ int xc_hvm_restore(int xc_handle, int io
                    unsigned int console_evtchn, unsigned long *console_mfn,
                    unsigned int pae, unsigned int apic)
 {
-    return 0;
+    DECLARE_DOMCTL;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, n, i;
+    uint32_t rec_len, nr_vcpus;
+    hvm_domain_context_t hvm_ctxt;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_nr;
+
+    unsigned long mfn, pfn;
+    unsigned int prev_pc, this_pc;
+    int verify = 0;
+
+    /* Types of the pfns in the current region */
+    unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
+    /* hvm guest mem size (Mb) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore: dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n",
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, *console_mfn, pae, apic);
+
+    max_pfn = nr_pfns;
+
+    if (!get_platform_info(xc_handle, dom,
+                           &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    p2m = malloc(max_pfn * sizeof(xen_pfn_t));
+
+    if (p2m == NULL) {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
+
+    if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++ )
+        p2m[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ )
+        p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
+    rc = xc_domain_memory_populate_physmap(
+        xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
+        0, 0, &p2m[0x00]);
+    if ( (rc == 0) && (max_pfn > 0xc0) )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
+    if ( rc != 0 )
+    {
+        PERROR("Could not allocate memory for HVM guest.\n");
+        goto out;
+    }
+
+
+    /**********XXXXXXXXXXXXXXXX******************/
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        p2m[i] = i;
+
+    prev_pc = 0;
+
+    n = 0;
+    while (1) {
+
+        int j;
+
+        this_pc = (n * 100) / max_pfn;
+        if ( (this_pc - prev_pc) >= 5 )
+        {
+            PPRINTF("\b\b\b\b%3d%%", this_pc);
+            prev_pc = this_pc;
+        }
+
+        if (!read_exact(io_fd, &j, sizeof(int))) {
+            ERROR("HVM restore: error when reading batch size");
+            goto out;
+        }
+
+        PPRINTF("batch %d\n", j);
+
+        if (j == -1) {
+            verify = 1;
+            DPRINTF("Entering page verify mode\n");
+            continue;
+        }
+
+        if (j == 0)
+            break;  /* our work here is done */
+
+        if (j > MAX_BATCH_SIZE) {
+            ERROR("Max batch size exceeded. Giving up.");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
+            ERROR("Error when reading region pfn types");
+            goto out;
+        }
+
+        region_base = xc_map_foreign_batch(
+            xc_handle, dom, PROT_WRITE, region_pfn_type, j);
+
+        for ( i = 0; i < j; i++ )
+        {
+            void *page;
+
+            pfn = region_pfn_type[i];
+            if ( pfn >= max_pfn )
+            {
+                ERROR("pfn out of range");
+                goto out;
+            }
+
+            if ( pfn >= 0xa0 && pfn < 0xc0) {
+                ERROR("hvm restore: pfn in vga hole");
+                goto out;
+            }
+
+
+            mfn = p2m[pfn];
+
+            /* In verify mode, we use a copy; otherwise we work in place */
+            page = verify ?
+                (void *)buf : (region_base + i*PAGE_SIZE);
+
+            if (!read_exact(io_fd, page, PAGE_SIZE)) {
+                ERROR("Error when reading page (%x)", i);
+                goto out;
+            }
+
+            if (verify) {
+
+                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+
+                if (res) {
+
+                    int v;
+
+                    DPRINTF("************** pfn=%lx mfn=%lx gotcs=%08lx "
+                            "actualcs=%08lx\n", pfn, p2m[pfn],
+                            csum_page(region_base + i*PAGE_SIZE),
+                            csum_page(buf));
+
+                    for (v = 0; v < 4; v++) {
+
+                        unsigned long *p = (unsigned long *)
+                            (region_base + i*PAGE_SIZE);
+                        if (buf[v] != p[v])
+                            DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
+                    }
+                }
+            }
+
+        } /* end of 'batch' for loop */
+
+        munmap(region_base, j*PAGE_SIZE);
+        n += j;  /* crude stats */
+
+    } /* while 1 */
+
+/*  xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic); */
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
+
+    if ( v_end > HVM_BELOW_4G_RAM_END )
+        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
+    else
+        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+
+    /* calculate the store_mfn; a wrong value makes introduceDomain hang */
+    *store_mfn = (v_end >> PAGE_SHIFT) - 2;
+    DPRINTF("hvm restore: calculated new store_mfn=0x%lx, v_end=0x%llx.\n", *store_mfn, v_end);
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERROR("error reading hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERROR("hvm context size mismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("error reading hvm context!\n");
+        goto out;
+    }
+
+    if ((rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
+        ERROR("error setting hvm context!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error reading nr_vcpus!\n");
+        goto out;
+    }
+    DPRINTF("hvm restore: got nr_vcpus=%d.\n", nr_vcpus);
+
+    for (i = 0; i < nr_vcpus; i++) {
+        if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+            ERROR("error reading vcpu context size!\n");
+            goto out;
+        }
+        if (rec_len != sizeof(ctxt)) {
+            ERROR("vcpu context size mismatch!\n");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+            ERROR("error reading vcpu context.\n");
+            goto out;
+        }
+
+        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
+            ERROR("Could not set vcpu context, rc=%d", rc);
+            goto out;
+        }
+    }
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
 }
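Taken together with the restore side above, the save side below implies the following on-the-wire image layout (an informal summary inferred from the matching read_exact()/write_exact() pairs in this patch, not a formal format definition):

    /* HVM save-image layout, as written by xc_hvm_save() and expected
     * by xc_hvm_restore():
     *
     *   unsigned long max_pfn;             total number of guest pfns
     *   repeated {
     *     int batch;                       > 0: a batch of pages follows
     *                                       -1: receiver enters verify mode
     *                                        0: end of memory pages
     *     unsigned long pfns[batch];       pfns in this batch
     *     char pages[batch][PAGE_SIZE];    page contents
     *   }
     *   uint32_t rec_len;                  == sizeof(hvm_domain_context_t)
     *   hvm_domain_context_t hvm_ctxt;     pic/pit/shpage state
     *   uint32_t nr_vcpus;
     *   repeated nr_vcpus times {
     *     uint32_t rec_len;                == sizeof(vcpu_guest_context_t)
     *     vcpu_guest_context_t ctxt;
     *   }
     */
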
diff -r bb1c450b2739 tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Thu Jan 11 21:03:11 2007 +0800
+++ b/tools/libxc/xc_hvm_save.c Thu Jan 11 21:05:10 2007 +0800
@@ -32,9 +32,696 @@
 #include "xg_private.h"
 #include "xg_save_restore.h"
 
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_hvm_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */
+#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit(int nr, volatile void *addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit(int nr, volatile void *addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline int permute(int i, int nr, int order_nr)
+{
+    /* Need a simple permutation function so that we scan pages in a
+       pseudo-random order, enabling us to get a better estimate of
+       the domain's page dirtying rate as we go (there are often
+       contiguous ranges of pfns that have similar behaviour, and we
+       want to mix them up). */
+
+    /* e.g. nr->order: 15->4 16->4 17->5 */
+    /* 512MB domain, 128k pages, order 17 */
+
+    /*
+      QPONMLKJIHGFEDCBA
+             QPONMLKJIH
+      GFEDCBA
+     */
+
+    /*
+      QPONMLKJIHGFEDCBA
+                  EDCBA
+             QPONM
+      LKJIHGF
+     */
+
+    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+    return i;
+}
+
+static uint64_t tv_to_us(struct timeval *new)
+{
+    return (new->tv_sec * 1000000) + new->tv_usec;
+}
+
+static uint64_t llgettimeofday(void)
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return tv_to_us(&now);
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return ((new->tv_sec - old->tv_sec)*1000000) +
+        (new->tv_usec - old->tv_usec);
+}
+
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if (write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long d0_cpu_last;
+    static long long d1_cpu_last;
+
+    struct timeval wall_now;
+    long long wall_delta;
+    long long d0_cpu_now, d0_cpu_delta;
+    long long d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    wall_delta = tv_delta(&wall_now, &wall_last)/1000;
+
+    if (wall_delta == 0) wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if (print)
+        DPRINTF(
+            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+            "dirtied %dMb/s %" PRId32 " pages\n",
+            wall_delta,
+            (int)((d0_cpu_delta*100)/wall_delta),
+            (int)((d1_cpu_delta*100)/wall_delta),
+            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+            stats->dirty_count);
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
+                          unsigned long *arr, int runs)
+{
+    long long start, now;
+    xc_shadow_op_stats_t stats;
+    int j;
+
+    start = llgettimeofday();
+
+    for (j = 0; j < runs; j++) {
+        int i;
+
+        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                          arr, max_pfn, NULL, 0, NULL);
+        DPRINTF("#Flush\n");
+        for ( i = 0; i < 40; i++ ) {
+            usleep(50000);
+            now = llgettimeofday();
+            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                              NULL, 0, NULL, 0, &stats);
+
+            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+                    ((now-start)+500)/1000,
+                    stats.fault_count, stats.dirty_count);
+        }
+    }
+
+    return -1;
+}
+
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt)
+{
+    int i = 0;
+
+    if (!(*suspend)(dom)) {
+        ERROR("Suspend request failed");
+        return -1;
+    }
+
+ retry:
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
+        ERROR("Could not get domain info");
+        return -1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
+        ERROR("Could not get vcpu context");
+
+
+    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
+        return 0; // success
+
+    if (info->paused) {
+        // try unpausing domain, wait, and retest
+        xc_domain_unpause( xc_handle, dom );
+
+        ERROR("Domain was paused. Wait and re-test.");
+        usleep(10000); // 10ms
+
+        goto retry;
+    }
+
+
+    if ( ++i < 100 ) {
+        ERROR("Retry suspend domain.");
+        usleep(10000); // 10ms
+        goto retry;
+    }
+
+    ERROR("Unable to suspend domain.");
+
+    return -1;
+}
+
 int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                 uint32_t max_factor, uint32_t flags, int (*suspend)(int))
 {
-
-    return 0;
-}
+    xc_dominfo_t info;
+
+    int rc = 1, i, last_iter, iter = 0;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+    int sent_last_iter, skip_this_iter;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
+
+    /* A copy of the hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, rec_size, nr_vcpus;
+    unsigned long *page_array = NULL;
+
+    /* power of 2 order of max_pfn */
+    int order_nr;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty; */
+    unsigned long *to_send = NULL, *to_skip = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long total_sent = 0;
+
+    DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, live=%d, debug=%d.\n",
+            dom, max_iters, max_factor, flags,
+            live, debug);
+
+    /* If no explicit control parameters given, use defaults */
+    if (!max_iters)
+        max_iters = DEF_MAX_ITERS;
+    if (!max_factor)
+        max_factor = DEF_MAX_FACTOR;
+
+    initialize_mbit_rate();
+
+    if (!get_platform_info(xc_handle, dom,
+                           &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("HVM: Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("HVM: Could not get domain info");
+        return 1;
+    }
+    nr_vcpus = info.nr_online_vcpus;
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERROR("HVM: Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERROR("HVM: Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERROR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+    /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
+              (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if (!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                             PROT_READ, shared_info_frame))) {
+        ERROR("HVM: Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info: max_memkb=0x%lx, max_mfn=0x%lx, nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages);
+
+    /* nr_pfns: total pages excluding vga acc mem
+     * max_pfn: nr_pfns + 0x20 vga hole(0xa0~0xc0)
+     * getdomaininfo.tot_pages: all the allocated pages for this domain
+     */
+    if (live) {
+        ERROR("hvm domain doesn't support live migration now.\n");
+        goto out;
+
+        if (xc_shadow_control(xc_handle, dom,
+                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                              NULL, 0, NULL, 0, NULL) < 0) {
+            ERROR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        /* excludes vga acc mem */
+        nr_pfns = info.nr_pages - 0x800;
+
+        last_iter = 0;
+        DPRINTF("hvm domain live migration debug start: logdirty enable.\n");
+    } else {
+        /* This is a non-live suspend. Issue the call back to get the
+           domain suspended */
+
+        last_iter = 1;
+
+        /* suspend hvm domain */
+        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+            ERROR("HVM domain appears not to have suspended");
+            goto out;
+        }
+        nr_pfns = info.nr_pages;
+        DPRINTF("after suspend hvm domain nr_pages=0x%x.\n", nr_pfns);
+    }
+
+    DPRINTF("after 1st handle hvm domain nr_pfns=0x%x, nr_pages=0x%lx, max_memkb=0x%lx, live=%d.\n",
+            nr_pfns,
+            info.nr_pages,
+            info.max_memkb,
+            live);
+
+    nr_pfns = info.nr_pages;
+
+    /* XXX: calculate the VGA hole */
+    max_pfn = nr_pfns + 0x20;
+
+    skip_this_iter = 0; /* XXX */
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = max_pfn;
+
+    /* calculate the power of 2 order of max_pfn, e.g.
+       15->4 16->4 17->5 */
+    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
+        continue;
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE);
+    to_skip = malloc(BITMAP_SIZE);
+
+    if (!to_send || !to_skip) {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if (lock_pages(to_send, BITMAP_SIZE)) {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    /* (to fix is local only) */
+    if (lock_pages(to_skip, BITMAP_SIZE)) {
+        ERROR("Unable to lock to_skip");
+        return 1;
+    }
+
+    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc(sizeof(unsigned long) * max_pfn)) == NULL) {
+        ERROR("HVM: malloc failed!\n");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        page_array[i] = i;
+
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+
+    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
+        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
+        ERROR("Unable to lock");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
+        ERROR("write: max_pfn");
+        goto out;
+    }
+
+    while (1) {
+
+        unsigned int prev_pc, sent_this_iter, N, batch;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving HVM domain memory pages: iter %d   0%%", iter);
+
+        while ( N < max_pfn ) {
+
+            unsigned int this_pc = (N * 100) / max_pfn;
+
+            if ((this_pc - prev_pc) >= 5) {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* slightly wasteful to peek the whole array every time,
+               but this is fast enough for the moment. */
+            if (!last_iter && xc_shadow_control(
+                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
+                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
+                ERROR("Error peeking HVM shadow bitmap");
+                goto out;
+            }
+
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
+
+                int n = permute(N, max_pfn, order_nr);
+
+                if (debug) {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d \n",
+                            iter, (unsigned long)n, page_array[n],
+                            test_bit(n, to_send));
+                }
+
+                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
+                    skip_this_iter++; /* stats keeping */
+
+                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                      (test_bit(n, to_send) && last_iter)))
+                    continue;
+
+                if (n >= 0xa0 && n < 0xc0) {
+/*                  DPRINTF("get a vga hole pfn= %x.\n", n); */
+                    continue;
+                }
+                /*
+                ** we get here if:
+                ** 1. page is marked to_send & hasn't already been re-dirtied
+                ** 2. (ignore to_skip in last iteration)
+                */
+
+                pfn_batch[batch] = n;
+                pfn_type[batch]  = page_array[n];
+
+                batch++;
+            }
+
+            if (batch == 0)
+                goto skip; /* vanishingly unlikely... */
+
+            /* map_foreign uses pfns now! */
+            if ((region_base = xc_map_foreign_batch(
+                     xc_handle, dom, PROT_READ, pfn_batch, batch)) == 0) {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            /* write num of pfns */
+            if (!write_exact(io_fd, &batch, sizeof(unsigned int))) {
+                ERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            /* write all the pfns */
+            if (!write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch)) {
+                ERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            if (ratewrite(io_fd, region_base, PAGE_SIZE * batch) != PAGE_SIZE * batch) {
+                ERROR("Error when writing to state file (4)");
+                goto out;
+            }
+
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        if (last_iter) {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/max_pfn );
+        }
+
+        if (last_iter && debug) {
+            int minusone = -1;
+            memset(to_send, 0xff, BITMAP_SIZE);
+            debug = 0;
+            DPRINTF("Entering debug resend-all mode\n");
+
+            /* send "-1" to put receiver into debug mode */
+            if (!write_exact(io_fd, &minusone, sizeof(int))) {
+                ERROR("Error when writing to state file (6)");
+                goto out;
+            }
+
+            continue;
+        }
+
+        if (last_iter) break;
+
+        if (live) {
+
+
+            if (
+                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+                (iter >= max_iters) ||
+                (sent_this_iter+skip_this_iter < 50) ||
+                (total_sent > max_pfn*max_factor) ) {
+
+                DPRINTF("Start last iteration for HVM domain\n");
+                last_iter = 1;
+
+                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
+                                      &ctxt)) {
+                    ERROR("Domain appears not to have suspended");
+                    goto out;
+                }
+
+                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
+                        info.shared_info_frame,
+                        (unsigned long)ctxt.user_regs.eip,
+                        (unsigned long)ctxt.user_regs.edx);
+            }
+
+            if (xc_shadow_control(xc_handle, dom,
+                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
+                                  max_pfn, NULL, 0, &stats) != max_pfn) {
+                ERROR("Error flushing shadow PT");
+                goto out;
+            }
+
+            sent_last_iter = sent_this_iter;
+
+            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+
+        }
+
+
+    } /* end of while 1 */
+
+
+    DPRINTF("All HVM memory is saved\n");
+
+    /* Zero terminate */
+    i = 0;
+    if (!write_exact(io_fd, &i, sizeof(int))) {
+        ERROR("Error when writing to state file (6)");
+        goto out;
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+    if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)) {
+        ERROR("HVM: Could not get hvm context");
+        goto out;
+    }
+
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERROR("error writing hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERROR("write HVM info failed!\n");
+    }
+
+    /* save vcpu/vmcs context */
+    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error writing nr_vcpus");
+        goto out;
+    }
+
+    /* XXX: need an online map to exclude offline vcpus */
+    for (i = 0; i < nr_vcpus; i++) {
+
+        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
+            ERROR("HVM: Could not get vcpu context");
+            goto out;
+        }
+
+        rec_size = sizeof(ctxt);
+        DPRINTF("writing vcpu context %d of %d.\n", i, nr_vcpus);
+        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+            ERROR("error writing vcpu ctxt size");
+            goto out;
+        }
+
+        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
+            ERROR("write vmcs failed!\n");
+            goto out;
+        }
+    }
+
+    /* Success! */
+    rc = 0;
+
+ out:
+
+    if (live) {
+        if (xc_shadow_control(xc_handle, dom,
+                              XEN_DOMCTL_SHADOW_OP_OFF,
+                              NULL, 0, NULL, 0, NULL) < 0) {
+            DPRINTF("Warning - couldn't disable shadow mode");
+        }
+    }
+
+    free(page_array);
+
+    free(pfn_type);
+    free(pfn_batch);
+    free(to_send);
+    free(to_skip);
+
+    return !!rc;
+}
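
For reference, a rough sketch of how a caller might drive the save side (illustration only: my_suspend and save_to_file are made-up names, and a real toolstack reaches xc_hvm_save() through xend/xc_save rather than calling it directly):

    #include <fcntl.h>
    #include <unistd.h>

    /* Hypothetical suspend callback: a real one would ask xend to
     * suspend the domain and return nonzero once it has. */
    static int my_suspend(int domid)
    {
        return 1;
    }

    /* Hypothetical driver: non-live save of HVM domain 'dom' to 'path'.
     * Passing 0 for max_iters/max_factor selects DEF_MAX_ITERS and
     * DEF_MAX_FACTOR; flags == 0 means non-live, which is the only mode
     * this patch supports. */
    static int save_to_file(int xc_handle, uint32_t dom, const char *path)
    {
        int rc, fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
            return -1;
        rc = xc_hvm_save(xc_handle, fd, dom, 0, 0, 0, my_suspend);
        close(fd);
        return rc;
    }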