Zhai, Edwin wrote:
> ian/keir,
> this is the new hvm save/restore patch on top of r12898.
> although not tested intensively across all platform combinations, general
> linux/windows guest works fine W/O break anything.
>
So what are the plans for merging save/restore?
If possible, it would be nice to see some changes first. For instance,
it seems a real shame to not use the existing QEMU format. There's
really no technical reason why a saved Xen VM shouldn't be loadable in
QEMU (and vice versa).
Perhaps we should also consider using a higher-level kernel interface
too? 2.6.20 is already going to contain a kernel interface for
transferring guest state (for KVM). It would be nice if we didn't keep
introducing Xen-specific things to our qemu fork so that we could
eventually get this stuff upstream.
Regards,
Anthony Liguori
> because recent changes in unstable make rebase hard, we submit this patch now
> with hope that check in first and fix bug over time if possible.
>
> thanks,
>
>
> --
> best rgds,
> edwin
>
> ------------------------------------------------------------------------
>
> Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>
> Signed-off-by: Nakajima Jun <jun.nakajima@intel.com>
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/hw/cirrus_vga.c
> --- a/tools/ioemu/hw/cirrus_vga.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/ioemu/hw/cirrus_vga.c Wed Dec 13 22:52:02 2006 +0800
> @@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
> cirrus_mmio_writel,
> };
>
> +void cirrus_stop_acc(CirrusVGAState *s)
> +{
> + if (s->map_addr){
> + int error;
> + s->map_addr = 0;
> + error = unset_vram_mapping(s->cirrus_lfb_addr,
> + s->cirrus_lfb_end);
> + fprintf(stderr,
"cirrus_stop_acc:unset_vram_mapping.\n");
> +
> + munmap(s->vram_ptr, VGA_RAM_SIZE);
> + }
> +}
> +
> +void cirrus_restart_acc(CirrusVGAState *s)
> +{
> + if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
> + void *vram_pointer, *old_vram;
> + fprintf(stderr, "cirrus_vga_load:re-enable vga
acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n",
> + s->cirrus_lfb_addr, s->cirrus_lfb_end);
> + vram_pointer = set_vram_mapping(s->cirrus_lfb_addr
,s->cirrus_lfb_end);
> + if (!vram_pointer){
> + fprintf(stderr, "cirrus_vga_load:NULL
vram_pointer\n");
> + } else {
> + old_vram = vga_update_vram((VGAState *)s, vram_pointer,
> + VGA_RAM_SIZE);
> + qemu_free(old_vram);
> + s->map_addr = s->cirrus_lfb_addr;
> + s->map_end = s->cirrus_lfb_end;
> + }
> + }
> +}
> +
> /* load/save state */
>
> static void cirrus_vga_save(QEMUFile *f, void *opaque)
> {
> CirrusVGAState *s = opaque;
> + uint8_t vga_acc;
>
> qemu_put_be32s(f, &s->latch);
> qemu_put_8s(f, &s->sr_index);
> @@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f,
> qemu_put_be32s(f, &s->hw_cursor_y);
> /* XXX: we do not save the bitblt state - we assume we do not save
> the state when the blitter is active */
> +
> + vga_acc = (!!s->map_addr);
> + qemu_put_8s(f, &vga_acc);
> + qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
> + qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
> + qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
> + if (vga_acc)
> + cirrus_stop_acc(s);
> }
>
> static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
> {
> CirrusVGAState *s = opaque;
> + uint8_t vga_acc = 0;
>
> if (version_id != 1)
> return -EINVAL;
> @@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f,
>
> qemu_get_be32s(f, &s->hw_cursor_x);
> qemu_get_be32s(f, &s->hw_cursor_y);
> +
> + qemu_get_8s(f, &vga_acc);
> + qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
> + qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
> + qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
> + if (vga_acc){
> + cirrus_restart_acc(s);
> + }
>
> /* force refresh */
> s->graphic_mode = -1;
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/helper2.c
> --- a/tools/ioemu/target-i386-dm/helper2.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/ioemu/target-i386-dm/helper2.c Wed Dec 13 22:52:02 2006 +0800
> @@ -525,6 +525,7 @@ int main_loop(void)
> {
> extern int vm_running;
> extern int shutdown_requested;
> + extern int suspend_requested;
> CPUState *env = cpu_single_env;
> int evtchn_fd = xc_evtchn_fd(xce_handle);
>
> @@ -542,12 +543,24 @@ int main_loop(void)
> qemu_system_reset();
> reset_requested = 0;
> }
> + if (suspend_requested) {
> + fprintf(logfile, "device model received suspend
signal!\n");
> + break;
> + }
> }
>
> /* Wait up to 10 msec. */
> main_loop_wait(10);
> }
> - destroy_hvm_domain();
> + if (!suspend_requested)
> + destroy_hvm_domain();
> + else {
> + char qemu_file[20];
> + sprintf(qemu_file, "/tmp/xen.qemu-dm.%d", domid);
> + if (qemu_savevm(qemu_file) < 0)
> + fprintf(stderr, "qemu save fail.\n");
> + }
> +
> return 0;
> }
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9
tools/ioemu/target-i386-dm/piix_pci-dm.c
> --- a/tools/ioemu/target-i386-dm/piix_pci-dm.c Fri Sep 15 17:05:38 2006
+0800
> +++ b/tools/ioemu/target-i386-dm/piix_pci-dm.c Wed Dec 13 22:52:02 2006
+0800
> @@ -83,6 +83,11 @@ PCIBus *i440fx_init(void)
> /* PIIX3 PCI to ISA bridge */
>
> static PCIDevice *piix3_dev;
> +static int pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
> +{
> + /* This is the barber''s pole mapping used by Xen. */
> + return (irq_num + (pci_dev->devfn >> 3)) & 3;
> +}
>
> static void piix3_write_config(PCIDevice *d,
> uint32_t address, uint32_t val, int len)
> @@ -150,3 +155,227 @@ int piix3_init(PCIBus *bus)
> }
>
> void pci_bios_init(void) {}
> +
> +/***********************************************************/
> +/* XXX: the following should be moved to the PC BIOS */
> +
> +static __attribute__((unused)) uint32_t isa_inb(uint32_t addr)
> +{
> + return cpu_inb(NULL, addr);
> +}
> +
> +static void isa_outb(uint32_t val, uint32_t addr)
> +{
> + cpu_outb(NULL, addr, val);
> +}
> +
> +static __attribute__((unused)) uint32_t isa_inw(uint32_t addr)
> +{
> + return cpu_inw(NULL, addr);
> +}
> +
> +static __attribute__((unused)) void isa_outw(uint32_t val, uint32_t addr)
> +{
> + cpu_outw(NULL, addr, val);
> +}
> +
> +static __attribute__((unused)) uint32_t isa_inl(uint32_t addr)
> +{
> + return cpu_inl(NULL, addr);
> +}
> +
> +static __attribute__((unused)) void isa_outl(uint32_t val, uint32_t addr)
> +{
> + cpu_outl(NULL, addr, val);
> +}
> +
> +static uint32_t pci_bios_io_addr;
> +static uint32_t pci_bios_mem_addr;
> +/* host irqs corresponding to PCI irqs A-D */
> +static uint8_t pci_irqs[4] = { 5, 6, 10, 11 };
> +
> +static void pci_config_writel(PCIDevice *d, uint32_t addr, uint32_t val)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + pci_data_write(s, addr, val, 4);
> +}
> +
> +static void pci_config_writew(PCIDevice *d, uint32_t addr, uint32_t val)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + pci_data_write(s, addr, val, 2);
> +}
> +
> +static void pci_config_writeb(PCIDevice *d, uint32_t addr, uint32_t val)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + pci_data_write(s, addr, val, 1);
> +}
> +
> +static __attribute__((unused)) uint32_t pci_config_readl(PCIDevice *d,
uint32_t addr)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + return pci_data_read(s, addr, 4);
> +}
> +
> +static uint32_t pci_config_readw(PCIDevice *d, uint32_t addr)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + return pci_data_read(s, addr, 2);
> +}
> +
> +static uint32_t pci_config_readb(PCIDevice *d, uint32_t addr)
> +{
> + PCIBus *s = d->bus;
> + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
> + return pci_data_read(s, addr, 1);
> +}
> +
> +static void pci_set_io_region_addr(PCIDevice *d, int region_num, uint32_t
addr)
> +{
> + PCIIORegion *r;
> + uint16_t cmd;
> + uint32_t ofs;
> +
> + if ( region_num == PCI_ROM_SLOT ) {
> + ofs = 0x30;
> + }else{
> + ofs = 0x10 + region_num * 4;
> + }
> +
> + pci_config_writel(d, ofs, addr);
> + r = &d->io_regions[region_num];
> +
> + /* enable memory mappings */
> + cmd = pci_config_readw(d, PCI_COMMAND);
> + if ( region_num == PCI_ROM_SLOT )
> + cmd |= 2;
> + else if (r->type & PCI_ADDRESS_SPACE_IO)
> + cmd |= 1;
> + else
> + cmd |= 2;
> + pci_config_writew(d, PCI_COMMAND, cmd);
> +}
> +
> +static void pci_bios_init_device(PCIDevice *d)
> +{
> + int class;
> + PCIIORegion *r;
> + uint32_t *paddr;
> + int i, pin, pic_irq, vendor_id, device_id;
> +
> + class = pci_config_readw(d, PCI_CLASS_DEVICE);
> + vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
> + device_id = pci_config_readw(d, PCI_DEVICE_ID);
> + switch(class) {
> + case 0x0101:
> + if (vendor_id == 0x8086 && device_id == 0x7010) {
> + /* PIIX3 IDE */
> + pci_config_writew(d, 0x40, 0x8000); // enable IDE0
> + pci_config_writew(d, 0x42, 0x8000); // enable IDE1
> + goto default_map;
> + } else {
> + /* IDE: we map it as in ISA mode */
> + pci_set_io_region_addr(d, 0, 0x1f0);
> + pci_set_io_region_addr(d, 1, 0x3f4);
> + pci_set_io_region_addr(d, 2, 0x170);
> + pci_set_io_region_addr(d, 3, 0x374);
> + }
> + break;
> + case 0x0680:
> + if (vendor_id == 0x8086 && device_id == 0x7113) {
> + /*
> + * PIIX4 ACPI PM.
> + * Special device with special PCI config space. No ordinary
BARs.
> + */
> + pci_config_writew(d, 0x20, 0x0000); // No smb bus IO enable
> + pci_config_writew(d, 0x22, 0x0000);
> + pci_config_writew(d, 0x3c, 0x0009); // Hardcoded IRQ9
> + pci_config_writew(d, 0x3d, 0x0001);
> + }
> + break;
> + case 0x0300:
> + if (vendor_id != 0x1234)
> + goto default_map;
> + /* VGA: map frame buffer to default Bochs VBE address */
> + pci_set_io_region_addr(d, 0, 0xE0000000);
> + break;
> + case 0x0800:
> + /* PIC */
> + vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
> + device_id = pci_config_readw(d, PCI_DEVICE_ID);
> + if (vendor_id == 0x1014) {
> + /* IBM */
> + if (device_id == 0x0046 || device_id == 0xFFFF) {
> + /* MPIC & MPIC2 */
> + pci_set_io_region_addr(d, 0, 0x80800000 + 0x00040000);
> + }
> + }
> + break;
> + case 0xff00:
> + if (vendor_id == 0x0106b &&
> + (device_id == 0x0017 || device_id == 0x0022)) {
> + /* macio bridge */
> + pci_set_io_region_addr(d, 0, 0x80800000);
> + }
> + break;
> + default:
> + default_map:
> + /* default memory mappings */
> + for(i = 0; i < PCI_NUM_REGIONS; i++) {
> + r = &d->io_regions[i];
> + if (r->size) {
> + if (r->type & PCI_ADDRESS_SPACE_IO)
> + paddr = &pci_bios_io_addr;
> + else
> + paddr = &pci_bios_mem_addr;
> + *paddr = (*paddr + r->size - 1) & ~(r->size -
1);
> + pci_set_io_region_addr(d, i, *paddr);
> + *paddr += r->size;
> + }
> + }
> + break;
> + }
> +
> + /* map the interrupt */
> + pin = pci_config_readb(d, PCI_INTERRUPT_PIN);
> + if (pin != 0) {
> + pin = pci_slot_get_pirq(d, pin - 1);
> + pic_irq = pci_irqs[pin];
> + pci_config_writeb(d, PCI_INTERRUPT_LINE, pic_irq);
> + }
> +}
> +
> +/*
> + * This function initializes the PCI devices as a normal PCI BIOS
> + * would do. It is provided just in case the BIOS has no support for
> + * PCI.
> + */
> +void pci_setup(void)
> +{
> + int i, irq;
> + uint8_t elcr[2];
> +
> + pci_bios_io_addr = 0xc000;
> + pci_bios_mem_addr = HVM_BELOW_4G_MMIO_START;
> +
> + /* activate IRQ mappings */
> + elcr[0] = 0x00;
> + elcr[1] = 0x00;
> + for(i = 0; i < 4; i++) {
> + irq = pci_irqs[i];
> + /* set to trigger level */
> + elcr[irq >> 3] |= (1 << (irq & 7));
> + /* activate irq remapping in PIIX */
> + pci_config_writeb(piix3_dev, 0x60 + i, irq);
> + }
> + isa_outb(elcr[0], 0x4d0);
> + isa_outb(elcr[1], 0x4d1);
> +
> + pci_for_each_device(pci_bios_init_device);
> +}
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/vl.c
> --- a/tools/ioemu/vl.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/ioemu/vl.c Wed Dec 13 22:52:02 2006 +0800
> @@ -4441,6 +4441,11 @@ int qemu_loadvm(const char *filename)
> qemu_fseek(f, cur_pos + record_len, SEEK_SET);
> }
> fclose(f);
> +
> + /* del tmp file */
> + if (unlink(filename) == -1)
> + fprintf(stderr, "delete tmp qemu state file failed.\n");
> +
> ret = 0;
> the_end:
> if (saved_vm_running)
> @@ -5027,6 +5032,7 @@ static QEMUResetEntry *first_reset_entry
> static QEMUResetEntry *first_reset_entry;
> int reset_requested;
> int shutdown_requested;
> +int suspend_requested;
> static int powerdown_requested;
>
> void qemu_register_reset(QEMUResetHandler *func, void *opaque)
> @@ -5806,6 +5812,14 @@ int set_mm_mapping(int xc_handle, uint32
> }
>
> return 0;
> +}
> +
> +void suspend(int sig)
> +{
> + fprintf(logfile, "suspend sig handler called with
requested=%d!\n", suspend_requested);
> + if (sig != SIGUSR1)
> + fprintf(logfile, "suspend signal dismatch, get
sig=%d!\n", sig);
> + suspend_requested = 1;
> }
>
> #if defined(__i386__) || defined(__x86_64__)
> @@ -6709,8 +6723,12 @@ int main(int argc, char **argv)
> }
> } else
> #endif
> - if (loadvm)
> + if (loadvm) {
> + /*XXX: ugly, since pci_bios_init are moved to hvmloader*/
> + extern void pci_setup(void);
> + pci_setup();
> qemu_loadvm(loadvm);
> + }
>
> {
> /* XXX: simplify init */
> @@ -6719,6 +6737,26 @@ int main(int argc, char **argv)
> vm_start();
> }
> }
> +
> + /* register signal for the suspend request when save */
> + {
> + struct sigaction act;
> + sigset_t set;
> + act.sa_handler = suspend;
> + act.sa_flags = SA_RESTART;
> + sigemptyset(&act.sa_mask);
> +
> + sigaction(SIGUSR1, &act, NULL);
> +
> + /* control panel mask some signals when spawn qemu, need unmask
here*/
> + sigemptyset(&set);
> + sigaddset(&set, SIGUSR1);
> + sigaddset(&set, SIGTERM);
> + if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1)
> + fprintf(stderr, "unblock signal fail, possible issue for
HVM save!\n");
> +
> + }
> +
> main_loop();
> quit_timers();
> return 0;
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/Makefile
> --- a/tools/libxc/Makefile Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/Makefile Wed Dec 13 22:52:02 2006 +0800
> @@ -27,7 +27,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui
> GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c
> GUEST_SRCS-$(CONFIG_IA64) += xc_linux_build.c
> GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
> -GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
> +GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c
>
> -include $(XEN_TARGET_ARCH)/Makefile
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_domain.c
> --- a/tools/libxc/xc_domain.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xc_domain.c Wed Dec 13 22:52:02 2006 +0800
> @@ -233,6 +233,50 @@ int xc_domain_getinfolist(int xc_handle,
> unlock_pages(info, max_domains*sizeof(xc_domaininfo_t));
>
> return ret;
> +}
> +
> +/* get info from hvm guest for save */
> +int xc_domain_hvm_getcontext(int xc_handle,
> + uint32_t domid,
> + hvm_domain_context_t *hvm_ctxt)
> +{
> + int rc;
> + DECLARE_DOMCTL;
> +
> + domctl.cmd = XEN_DOMCTL_gethvmcontext;
> + domctl.domain = (domid_t)domid;
> + set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
> +
> + if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
> + return rc;
> +
> + rc = do_domctl(xc_handle, &domctl);
> +
> + safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
> +
> + return rc;
> +}
> +
> +/* set info to hvm guest for restore */
> +int xc_domain_hvm_setcontext(int xc_handle,
> + uint32_t domid,
> + hvm_domain_context_t *hvm_ctxt)
> +{
> + int rc;
> + DECLARE_DOMCTL;
> +
> + domctl.cmd = XEN_DOMCTL_sethvmcontext;
> + domctl.domain = domid;
> + set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
> +
> + if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
> + return rc;
> +
> + rc = do_domctl(xc_handle, &domctl);
> +
> + safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
> +
> + return rc;
> }
>
> int xc_vcpu_getcontext(int xc_handle,
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_build.c
> --- a/tools/libxc/xc_hvm_build.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xc_hvm_build.c Wed Dec 13 22:52:02 2006 +0800
> @@ -86,7 +86,7 @@ static void build_e820map(void *e820_pag
>
> /* 0x0-0x9F000: Ordinary RAM. */
> e820entry[nr_map].addr = 0x0;
> - e820entry[nr_map].size = 0x9F000;
> + e820entry[nr_map].size = 0x90000;
> e820entry[nr_map].type = E820_RAM;
> nr_map++;
>
> @@ -96,7 +96,7 @@ static void build_e820map(void *e820_pag
> * TODO: SMBIOS tables should be moved higher (>=0xE0000).
> * They are unusually low in our memory map: could cause
problems?
> */
> - e820entry[nr_map].addr = 0x9F000;
> + e820entry[nr_map].addr = 0x90000;
> e820entry[nr_map].size = 0x1000;
> e820entry[nr_map].type = E820_RESERVED;
> nr_map++;
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_linux_save.c
> --- a/tools/libxc/xc_linux_save.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xc_linux_save.c Wed Dec 13 22:52:02 2006 +0800
> @@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu
> #endif
>
>
> -static inline ssize_t write_exact(int fd, void *buf, size_t count)
> -{
> - if(write(fd, buf, count) != count)
> - return 0;
> - return 1;
> -}
> -
> -
> -
> static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
> xc_shadow_op_stats_t *stats, int print)
> {
> @@ -356,7 +347,7 @@ static int analysis_phase(int xc_handle,
> }
>
>
> -static int suspend_and_state(int (*suspend)(int), int xc_handle, int
io_fd,
> +int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
> int dom, xc_dominfo_t *info,
> vcpu_guest_context_t *ctxt)
> {
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenctrl.h
> --- a/tools/libxc/xenctrl.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xenctrl.h Wed Dec 13 22:52:02 2006 +0800
> @@ -313,6 +313,30 @@ int xc_domain_getinfolist(int xc_handle,
> xc_domaininfo_t *info);
>
> /**
> + * This function returns information about the context of a hvm domain
> + * @parm xc_handle a handle to an open hypervisor interface
> + * @parm domid the domain to get information from
> + * @parm hvm_ctxt a pointer to a structure to store the execution context
of the
> + * hvm domain
> + * @return 0 on success, -1 on failure
> + */
> +int xc_domain_hvm_getcontext(int xc_handle,
> + uint32_t domid,
> + hvm_domain_context_t *hvm_ctxt);
> +
> +/**
> + * This function will set the context for hvm domain
> + *
> + * @parm xc_handle a handle to an open hypervisor interface
> + * @parm domid the domain to set the hvm domain context for
> + * @parm hvm_ctxt pointer to the the hvm context with the values to set
> + * @return 0 on success, -1 on failure
> + */
> +int xc_domain_hvm_setcontext(int xc_handle,
> + uint32_t domid,
> + hvm_domain_context_t *hvm_ctxt);
> +
> +/**
> * This function returns information about the execution context of a
> * particular vcpu of a domain.
> *
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenguest.h
> --- a/tools/libxc/xenguest.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xenguest.h Wed Dec 13 22:52:02 2006 +0800
> @@ -11,6 +11,7 @@
>
> #define XCFLAGS_LIVE 1
> #define XCFLAGS_DEBUG 2
> +#define XCFLAGS_HVM 4
>
>
> /**
> @@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_
> uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
> int (*suspend)(int domid));
>
> +/**
> + * This function will save a hvm domain running unmodified guest.
> + * @return 0 on success, -1 on failure
> + */
> +int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t
max_iters,
> + uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
> + int (*suspend)(int domid));
>
> /**
> * This function will restore a saved domain running Linux.
> @@ -41,6 +49,18 @@ int xc_linux_restore(int xc_handle, int
> unsigned long nr_pfns, unsigned int store_evtchn,
> unsigned long *store_mfn, unsigned int
console_evtchn,
> unsigned long *console_mfn);
> +
> +/**
> + * This function will restore a saved hvm domain running unmodified guest.
> + *
> + * @parm store_mfn pass mem size & returned with the mfn of the store
page
> + * @return 0 on success, -1 on failure
> + */
> +int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
> + unsigned long nr_pfns, unsigned int store_evtchn,
> + unsigned long *store_mfn, unsigned int
console_evtchn,
> + unsigned long *console_mfn,
> + unsigned int pae, unsigned int apic);
>
> /**
> * This function will create a domain for a paravirtualized Linux
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xg_save_restore.h
> --- a/tools/libxc/xg_save_restore.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/libxc/xg_save_restore.h Wed Dec 13 22:52:02 2006 +0800
> @@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand
> return 1;
> }
>
> +static inline ssize_t write_exact(int fd, void *buf, size_t count)
> +{
> + if(write(fd, buf, count) != count)
> + return 0;
> + return 1;
> +}
> +
> +extern int suspend_and_state(int (*suspend)(int), int xc_handle, int
io_fd,
> + int dom, xc_dominfo_t *info,
> + vcpu_guest_context_t *ctxt);
>
> /*
> ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M)
tables.
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/lowlevel/xc/xc.c
> --- a/tools/python/xen/lowlevel/xc/xc.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/python/xen/lowlevel/xc/xc.c Wed Dec 13 22:52:02 2006 +0800
> @@ -158,6 +158,20 @@ static PyObject *pyxc_domain_destroy(XcO
> static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args)
> {
> return dom_op(self, args, xc_domain_destroy);
> +}
> +
> +static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args)
> +{
> + uint32_t dom, reason;
> +
> + if (!PyArg_ParseTuple(args, "ii", &dom, &reason))
> + return NULL;
> +
> + if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0)
> + return pyxc_error_to_exception();
> +
> + Py_INCREF(zero);
> + return zero;
> }
>
>
> @@ -969,6 +983,14 @@ static PyMethodDef pyxc_methods[] = {
> METH_VARARGS, "\n"
> "Destroy a domain.\n"
> " dom [int]: Identifier of domain to be destroyed.\n\n"
> + "Returns: [int] 0 on success; -1 on error.\n" },
> +
> + { "domain_shutdown",
> + (PyCFunction)pyxc_domain_shutdown,
> + METH_VARARGS, "\n"
> + "Shutdown a domain.\n"
> + " dom [int, 0]: Domain identifier to use.\n"
> + " reason [int, 0]: Reason for shutdown.\n"
> "Returns: [int] 0 on success; -1 on error.\n" },
>
> { "vcpu_setaffinity",
> diff -r 7c0030214af1 -r 3c0bd8907fd9
tools/python/xen/xend/XendCheckpoint.py
> --- a/tools/python/xen/xend/XendCheckpoint.py Fri Sep 15 17:05:38 2006
+0800
> +++ b/tools/python/xen/xend/XendCheckpoint.py Wed Dec 13 22:52:02 2006
+0800
> @@ -22,11 +22,14 @@ from xen.xend.XendConstants import *
> from xen.xend.XendConstants import *
>
> SIGNATURE = "LinuxGuestRecord"
> +QEMU_SIGNATURE = "QemuDeviceModelRecord"
> +dm_batch = 512
> XC_SAVE = "xc_save"
> XC_RESTORE = "xc_restore"
>
>
> sizeof_int = calcsize("i")
> +sizeof_unsigned_int = calcsize("I")
> sizeof_unsigned_long = calcsize("L")
>
>
> @@ -69,6 +72,11 @@ def save(fd, dominfo, network, live, dst
> "could not write guest state file: config
len")
> write_exact(fd, config, "could not write guest state file:
config")
>
> + image_cfg = dominfo.info.get(''image'', {})
> + hvm = image_cfg.has_key(''hvm'')
> +
> + if hvm:
> + log.info("save hvm domain")
> # xc_save takes three customization parameters: maxit, max_f, and
> # flags the last controls whether or not save is
''live'', while the
> # first two further customize behaviour when
''live'' save is
> @@ -76,7 +84,7 @@ def save(fd, dominfo, network, live, dst
> # libxenguest; see the comments and/or code in xc_linux_save() for
> # more information.
> cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
> - str(dominfo.getDomid()), "0", "0",
str(int(live)) ]
> + str(dominfo.getDomid()), "0", "0",
str(int(live) | (int(hvm) << 2)) ]
> log.debug("[xc_save]: %s", string.join(cmd))
>
> def saveInputHandler(line, tochild):
> @@ -90,11 +98,28 @@ def save(fd, dominfo, network, live, dst
> log.info("Domain %d suspended.",
dominfo.getDomid())
> dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3,
> domain_name)
> + #send signal to device model for save
> + if hvm:
> + log.info("release_devices for hvm domain")
> + dominfo._releaseDevices(True)
> tochild.write("done\n")
> tochild.flush()
> log.debug(''Written done'')
>
> forkHelper(cmd, fd, saveInputHandler, False)
> +
> + # put qemu device model state
> + if hvm:
> + write_exact(fd, QEMU_SIGNATURE, "could not write qemu
signature")
> + qemu_fd = os.open("/tmp/xen.qemu-dm.%d" %
dominfo.getDomid(), os.O_RDONLY)
> + while True:
> + buf = os.read(qemu_fd, dm_batch)
> + if len(buf):
> + write_exact(fd, buf, "could not write device
model state")
> + else:
> + break
> + os.close(qemu_fd)
> + os.remove("/tmp/xen.qemu-dm.%d" %
dominfo.getDomid())
>
> dominfo.destroyDomain()
> try:
> @@ -147,19 +172,38 @@ def restore(xd, fd, dominfo = None, paus
> assert store_port
> assert console_port
>
> + #if hvm, pass mem size to calculate the store_mfn
> + hvm = 0
> + apic = 0
> + pae = 0
> + image_cfg = dominfo.info.get(''image'', {})
> + hvm = image_cfg.has_key(''hvm'')
> + if hvm:
> + #the ''memory'' in config has been removed
> + hvm = dominfo.info[''memory_static_min'']
> + apic =
dominfo.info[''image''][''hvm''].get(''apic'',
0)
> + pae =
dominfo.info[''image''][''hvm''].get(''pae'',
0)
> + log.info("restore hvm domain %d, mem=%d, apic=%d,
pae=%d", dominfo.domid, hvm, apic, pae)
> +
> try:
> - l = read_exact(fd, sizeof_unsigned_long,
> - "not a valid guest state file: pfn count
read")
> - nr_pfns = unpack("L", l)[0] # native sizeof long
> + if hvm:
> + l = read_exact(fd, sizeof_unsigned_int,
> + "not a valid hvm guest state file: pfn count
read")
> + nr_pfns = unpack("I", l)[0] # native sizeof int
> + else:
> + l = read_exact(fd, sizeof_unsigned_long,
> + "not a valid guest state file: pfn count
read")
> + nr_pfns = unpack("L", l)[0] # native sizeof long
> if nr_pfns > 16*1024*1024: # XXX
> raise XendError(
> "not a valid guest state file: pfn count out of
range")
>
> balloon.free(xc.pages_to_kib(nr_pfns))
> + log.info("HVM restore:balloon free 0x%x pages.",
nr_pfns)
>
> cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
> fd, dominfo.getDomid(), nr_pfns,
> - store_port, console_port])
> + store_port, console_port, hvm, pae, apic])
> log.debug("[xc_restore]: %s", string.join(cmd))
>
> handler = RestoreInputHandler()
> @@ -169,10 +213,29 @@ def restore(xd, fd, dominfo = None, paus
> if handler.store_mfn is None or handler.console_mfn is None:
> raise XendError(''Could not read store/console
MFN'')
>
> - os.read(fd, 1) # Wait for source to close connection
> dominfo.waitForDevices() # Wait for backends to set up
> if not paused:
> dominfo.unpause()
> +
> + # get qemu state and create a tmp file for dm restore
> + if hvm:
> + qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
> + "not a valid device model state: signature
read")
> + if qemu_signature != QEMU_SIGNATURE:
> + raise XendError("not a valid device model state:
found ''%s''" %
> + qemu_signature)
> + qemu_fd = os.open("/tmp/xen.qemu-dm.%d" %
dominfo.getDomid(),
> + os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
> + while True:
> + buf = os.read(fd, dm_batch)
> + if len(buf):
> + write_exact(qemu_fd, buf, "could not write dm
state to tmp file")
> + else:
> + break
> + os.close(qemu_fd)
> +
> + os.read(fd, 1) # Wait for source to close connection
> +
>
> dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9
tools/python/xen/xend/XendDomainInfo.py
> --- a/tools/python/xen/xend/XendDomainInfo.py Fri Sep 15 17:05:38 2006
+0800
> +++ b/tools/python/xen/xend/XendDomainInfo.py Wed Dec 13 22:52:02 2006
+0800
> @@ -488,6 +488,16 @@ class XendDomainInfo:
> self._removeVm(''xend/previous_restart_time'')
> self.storeDom("control/shutdown", reason)
>
> + ## shutdown hypercall for hvm domain desides xenstore write
> + image_cfg = self.info.get(''image'', {})
> + hvm = image_cfg.has_key(''hvm'')
> + if hvm:
> + for code in DOMAIN_SHUTDOWN_REASONS.keys():
> + if DOMAIN_SHUTDOWN_REASONS[code] == reason:
> + break
> + xc.domain_shutdown(self.domid, code)
> +
> +
> def pause(self):
> """Pause domain
>
> @@ -1203,8 +1213,11 @@ class XendDomainInfo:
> if self.image:
> self.image.createDeviceModel()
>
> - def _releaseDevices(self):
> + def _releaseDevices(self, suspend = False):
> """Release all domain''s devices. Nothrow
guarantee."""
> + if suspend and self.image:
> + self.image.destroy(suspend)
> + return
>
> while True:
> t = xstransact("%s/device" % self.dompath)
> @@ -1473,6 +1486,16 @@ class XendDomainInfo:
> self.console_mfn = console_mfn
>
> self._introduceDomain()
> + image_cfg = self.info.get(''image'', {})
> + hvm = image_cfg.has_key(''hvm'')
> + if hvm:
> + self.image = image.create(self,
> + self.info,
> + self.info[''image''],
> + self.info[''devices''])
> + if self.image:
> + self.image.createDeviceModel(True)
> + self.image.register_shutdown_watch()
> self._storeDomDetails()
> self._registerWatches()
> self.refreshShutdown()
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/image.py
> --- a/tools/python/xen/xend/image.py Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/python/xen/xend/image.py Wed Dec 13 22:52:02 2006 +0800
> @@ -157,7 +157,7 @@ class ImageHandler:
> """Build the domain. Define in
subclass."""
> raise NotImplementedError()
>
> - def createDeviceModel(self):
> + def createDeviceModel(self, restore = False):
> """Create device model for the domain (define in
subclass if needed)."""
> pass
>
> @@ -405,7 +405,7 @@ class HVMImageHandler(ImageHandler):
>
> return ret
>
> - def createDeviceModel(self):
> + def createDeviceModel(self, restore = False):
> if self.pid:
> return
> # Execute device model.
> @@ -414,6 +414,8 @@ class HVMImageHandler(ImageHandler):
> args = args + ([ "-d", "%d" %
self.vm.getDomid(),
> "-m", "%s" %
(self.getRequiredInitialReservation() / 1024)])
> args = args + self.dmargs
> + if restore:
> + args = args + ([ "-loadvm",
"/tmp/xen.qemu-dm.%d" % self.vm.getDomid() ])
> env = dict(os.environ)
> if self.display:
> env[''DISPLAY''] = self.display
> @@ -432,12 +434,16 @@ class HVMImageHandler(ImageHandler):
> self.register_reboot_feature_watch()
> self.pid =
self.vm.gatherDom((''image/device-model-pid'', int))
>
> - def destroy(self):
> + def destroy(self, suspend = False):
> self.unregister_shutdown_watch()
> self.unregister_reboot_feature_watch();
> if self.pid:
> try:
> - os.kill(self.pid, signal.SIGKILL)
> + sig = signal.SIGKILL
> + if suspend:
> + log.info("use sigusr1 to signal qemu %d",
self.pid)
> + sig = signal.SIGUSR1
> + os.kill(self.pid, sig)
> except OSError, exn:
> log.exception(exn)
> try:
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_restore.c
> --- a/tools/xcutils/xc_restore.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/xcutils/xc_restore.c Wed Dec 13 22:52:02 2006 +0800
> @@ -19,12 +19,13 @@ main(int argc, char **argv)
> main(int argc, char **argv)
> {
> unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn,
console_evtchn;
> + unsigned int hvm, pae, apic;
> int ret;
> unsigned long store_mfn, console_mfn;
>
> - if (argc != 6)
> + if (argc != 9)
> errx(1,
> - "usage: %s iofd domid nr_pfns store_evtchn
console_evtchn",
> + "usage: %s iofd domid nr_pfns store_evtchn console_evtchn hvm
pae apic",
> argv[0]);
>
> xc_fd = xc_interface_open();
> @@ -36,9 +37,19 @@ main(int argc, char **argv)
> nr_pfns = atoi(argv[3]);
> store_evtchn = atoi(argv[4]);
> console_evtchn = atoi(argv[5]);
> + hvm = atoi(argv[6]);
> + pae = atoi(argv[7]);
> + apic = atoi(argv[8]);
>
> - ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
> - &store_mfn, console_evtchn, &console_mfn);
> + if (hvm) {
> + /* pass the memsize to xc_hvm_restore to find the store_mfn */
> + store_mfn = hvm;
> + ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
> + &store_mfn, console_evtchn, &console_mfn, pae,
apic);
> + } else
> + ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
> + &store_mfn, console_evtchn, &console_mfn);
> +
> if (ret == 0) {
> printf("store-mfn %li\n", store_mfn);
> printf("console-mfn %li\n", console_mfn);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_save.c
> --- a/tools/xcutils/xc_save.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/tools/xcutils/xc_save.c Wed Dec 13 22:52:02 2006 +0800
> @@ -51,7 +51,10 @@ main(int argc, char **argv)
> max_f = atoi(argv[4]);
> flags = atoi(argv[5]);
>
> - ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags,
&suspend);
> + if (flags & XCFLAGS_HVM)
> + ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags,
&suspend);
> + else
> + ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags,
&suspend);
>
> xc_interface_close(xc_fd);
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domain.c
> --- a/xen/arch/x86/domain.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/domain.c Wed Dec 13 22:52:02 2006 +0800
> @@ -330,6 +330,7 @@ int arch_set_info_guest(
> else
> {
> hvm_load_cpu_guest_regs(v,
&v->arch.guest_context.user_regs);
> + hvm_load_cpu_context(v,
&v->arch.guest_context.hvmcpu_ctxt);
> }
>
> if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domctl.c
> --- a/xen/arch/x86/domctl.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/domctl.c Wed Dec 13 22:52:02 2006 +0800
> @@ -297,6 +297,7 @@ void arch_getdomaininfo_ctxt(
> if ( is_hvm_vcpu(v) )
> {
> hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg);
> + hvm_save_cpu_context(v, &c->hvmcpu_ctxt);
> }
> else
> {
> @@ -314,6 +315,22 @@ void arch_getdomaininfo_ctxt(
> c->ctrlreg[3] =
xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table));
>
> c->vm_assist = v->domain->vm_assist;
> +}
> +
> +int arch_gethvm_ctxt(
> + struct vcpu *v, struct hvm_domain_context *c)
> +{
> + if ( !is_hvm_vcpu(v) )
> + return -1;
> +
> + return hvm_save(v, c);
> +
> +}
> +
> +int arch_sethvm_ctxt(
> + struct vcpu *v, struct hvm_domain_context *c)
> +{
> + return hvm_load(v, c);
> }
>
> /*
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/hvm.c
> --- a/xen/arch/x86/hvm/hvm.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/hvm.c Wed Dec 13 22:52:02 2006 +0800
> @@ -182,9 +182,18 @@ int hvm_domain_initialise(struct domain
>
> void hvm_domain_destroy(struct domain *d)
> {
> + HVMStateEntry *se, *dse;
> kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
> rtc_deinit(d);
> pmtimer_deinit(d);
> +
> + se = d->arch.hvm_domain.first_se;
> + while (se) {
> + dse = se;
> + se = se->next;
> + xfree(dse);
> + }
> +
>
> if ( d->arch.hvm_domain.shared_page_va )
> unmap_domain_page_global(
> @@ -225,6 +234,9 @@ int hvm_vcpu_initialise(struct vcpu *v)
> pit_init(v, cpu_khz);
> rtc_init(v, RTC_PORT(0), RTC_IRQ);
> pmtimer_init(v, ACPI_PM_TMR_BLK_ADDRESS);
> +
> + /* init hvm sharepage */
> + shpage_init(v->domain, get_sp(v->domain));
>
> /* Init guest TSC to start from zero. */
> hvm_set_guest_time(v, 0);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/i8254.c
> --- a/xen/arch/x86/hvm/i8254.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/i8254.c Wed Dec 13 22:52:02 2006 +0800
> @@ -203,11 +203,11 @@ static inline void pit_load_count(PITCha
> switch (s->mode) {
> case 2:
> /* create periodic time */
> - s->pt = create_periodic_time (period, 0, 0, pit_time_fired,
s);
> + s->pt = create_periodic_time (current->domain, period,
0, 0, pit_time_fired, s);
> break;
> case 1:
> /* create one shot time */
> - s->pt = create_periodic_time (period, 0, 1, pit_time_fired,
s);
> + s->pt = create_periodic_time (current->domain, period,
0, 1, pit_time_fired, s);
> #ifdef DEBUG_PIT
> printk("HVM_PIT: create one shot time.\n");
> #endif
> @@ -345,6 +345,152 @@ static uint32_t pit_ioport_read(void *op
> return ret;
> }
>
> +#ifdef HVM_DEBUG_SUSPEND
> +static void pit_info(PITState *pit)
> +{
> + PITChannelState *s;
> + int i;
> +
> + for(i = 0; i < 3; i++) {
> + printk("*****pit channel %d's state:*****\n",
i);
> + s = &pit->channels[i];
> + printk("pit 0x%x.\n", s->count);
> + printk("pit 0x%x.\n", s->latched_count);
> + printk("pit 0x%x.\n", s->count_latched);
> + printk("pit 0x%x.\n", s->status_latched);
> + printk("pit 0x%x.\n", s->status);
> + printk("pit 0x%x.\n", s->read_state);
> + printk("pit 0x%x.\n", s->write_state);
> + printk("pit 0x%x.\n", s->write_latch);
> + printk("pit 0x%x.\n", s->rw_mode);
> + printk("pit 0x%x.\n", s->mode);
> + printk("pit 0x%x.\n", s->bcd);
> + printk("pit 0x%x.\n", s->gate);
> + printk("pit %"PRId64"\n",
s->count_load_time);
> +
> + if (s->pt) {
> + struct periodic_time *pt = s->pt;
> + printk("pit channel %d has a periodic timer:\n", i);
> + printk("pt %d.\n", pt->enabled);
> + printk("pt %d.\n", pt->one_shot);
> + printk("pt %d.\n", pt->irq);
> + printk("pt %d.\n", pt->first_injected);
> +
> + printk("pt %d.\n", pt->pending_intr_nr);
> + printk("pt %d.\n", pt->period);
> + printk("pt %"PRId64"\n",
pt->period_cycles);
> + printk("pt %"PRId64"\n",
pt->last_plt_gtime);
> + }
> + }
> +
> +}
> +#else
> +static void pit_info(PITState *pit)
> +{
> +}
> +#endif
> +
> +static void pit_save(hvm_domain_context_t *h, void *opaque)
> +{
> + struct domain *d = opaque;
> + PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
> + PITChannelState *s;
> + struct periodic_time *pt;
> + int i, pti = -1;
> +
> + pit_info(pit);
> +
> + for(i = 0; i < 3; i++) {
> + s = &pit->channels[i];
> + hvm_put_32u(h, s->count);
> + hvm_put_16u(h, s->latched_count);
> + hvm_put_8u(h, s->count_latched);
> + hvm_put_8u(h, s->status_latched);
> + hvm_put_8u(h, s->status);
> + hvm_put_8u(h, s->read_state);
> + hvm_put_8u(h, s->write_state);
> + hvm_put_8u(h, s->write_latch);
> + hvm_put_8u(h, s->rw_mode);
> + hvm_put_8u(h, s->mode);
> + hvm_put_8u(h, s->bcd);
> + hvm_put_8u(h, s->gate);
> + hvm_put_64u(h, s->count_load_time);
> +
> + if (s->pt && pti == -1)
> + pti = i;
> + }
> +
> + /* save guest time */
> + pt = pit->channels[pti].pt;
> + hvm_put_8u(h, pti);
> + hvm_put_8u(h, pt->first_injected);
> + hvm_put_32u(h, pt->pending_intr_nr);
> + hvm_put_64u(h, pt->last_plt_gtime);
> +
> +}
> +
> +static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id)
> +{
> + struct domain *d = opaque;
> + PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
> + PITChannelState *s;
> + int i, pti;
> + u32 period;
> +
> + if (version_id != 1)
> + return -EINVAL;
> +
> + for(i = 0; i < 3; i++) {
> + s = &pit->channels[i];
> + s->count = hvm_get_32u(h);
> + s->latched_count = hvm_get_16u(h);
> + s->count_latched = hvm_get_8u(h);
> + s->status_latched = hvm_get_8u(h);
> + s->status = hvm_get_8u(h);
> + s->read_state = hvm_get_8u(h);
> + s->write_state = hvm_get_8u(h);
> + s->write_latch = hvm_get_8u(h);
> + s->rw_mode = hvm_get_8u(h);
> + s->mode = hvm_get_8u(h);
> + s->bcd = hvm_get_8u(h);
> + s->gate = hvm_get_8u(h);
> + s->count_load_time = hvm_get_64u(h);
> + }
> +
> + pti = hvm_get_8u(h);
> + if ( pti < 0 || pti > 2) {
> + printk("pit load get a wrong channel %d when HVM
resume.\n", pti);
> + return -EINVAL;
> + }
> +
> + s = &pit->channels[pti];
> + period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ);
> +
> + printk("recreate periodic timer %d in mode %d, freq=%d.\n",
pti, s->mode, period);
> + switch (s->mode) {
> + case 2:
> + /* create periodic time */
> + s->pt = create_periodic_time (d, period, 0, 0,
pit_time_fired, s);
> + s->pt->first_injected = hvm_get_8u(h);
> + s->pt->pending_intr_nr = hvm_get_32u(h);
> + s->pt->last_plt_gtime = hvm_get_64u(h);
> + break;
> + case 1:
> + /* create one shot time */
> + s->pt = create_periodic_time (d, period, 0, 1,
pit_time_fired, s);
> + break;
> + default:
> + printk("pit mode %"PRId8" should not use
periodic timer!\n", s->mode);
> + return -EINVAL;
> + }
> +
> + /*XXX: need set_guest_time here or do this when post_inject? */
> +
> + pit_info(pit);
> +
> + return 0;
> +}
> +
> static void pit_reset(void *opaque)
> {
> PITState *pit = opaque;
> @@ -373,6 +519,8 @@ void pit_init(struct vcpu *v, unsigned l
> s->vcpu = v;
> s++; s->vcpu = v;
> s++; s->vcpu = v;
> +
> + hvm_register_savevm(v->domain, "xen_hvm_i8254", PIT_BASE,
1, pit_save, pit_load, v->domain);
>
> register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io);
> /* register the speaker port */
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/intercept.c
> --- a/xen/arch/x86/hvm/intercept.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/intercept.c Wed Dec 13 22:52:02 2006 +0800
> @@ -29,6 +29,8 @@
> #include <asm/current.h>
> #include <io_ports.h>
> #include <xen/event.h>
> +#include <xen/compile.h>
> +#include <public/version.h>
>
>
> extern struct hvm_mmio_handler vlapic_mmio_handler;
> @@ -314,13 +316,14 @@ void pickup_deactive_ticks(struct period
> * period: fire frequency in ns.
> */
> struct periodic_time * create_periodic_time(
> + struct domain *d,
> u32 period,
> char irq,
> char one_shot,
> time_cb *cb,
> void *data)
> {
> - struct periodic_time *pt =
&(current->domain->arch.hvm_domain.pl_time.periodic_tm);
> + struct periodic_time *pt =
&(d->arch.hvm_domain.pl_time.periodic_tm);
> if ( pt->enabled ) {
> stop_timer (&pt->timer);
> pt->enabled = 0;
> @@ -353,6 +356,278 @@ void destroy_periodic_time(struct period
> stop_timer(&pt->timer);
> pt->enabled = 0;
> }
> +}
> +
> +/* save/restore support */
> +#define HVM_FILE_MAGIC 0x54381286
> +#define HVM_FILE_VERSION 0x00000001
> +
> +int hvm_register_savevm(struct domain *d,
> + const char *idstr,
> + int instance_id,
> + int version_id,
> + SaveStateHandler *save_state,
> + LoadStateHandler *load_state,
> + void *opaque)
> +{
> + HVMStateEntry *se, **pse;
> +
> + if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){
> + printk("allocat hvmstate entry fail.\n");
> + return -1;
> + }
> +
> + strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN);
> +
> + se->instance_id = instance_id;
> + se->version_id = version_id;
> + se->save_state = save_state;
> + se->load_state = load_state;
> + se->opaque = opaque;
> + se->next = NULL;
> +
> + /* add at the end of list */
> + pse = &d->arch.hvm_domain.first_se;
> + while (*pse != NULL)
> + pse = &(*pse)->next;
> + *pse = se;
> + return 0;
> +}
> +
> +int hvm_save(struct vcpu *v, hvm_domain_context_t *h)
> +{
> + uint32_t len, len_pos, cur_pos;
> + uint32_t eax, ebx, ecx, edx;
> + HVMStateEntry *se;
> + char *chgset;
> +
> + if (!is_hvm_vcpu(v)) {
> + printk("hvm_save only for hvm guest!\n");
> + return -1;
> + }
> +
> + memset(h, 0, sizeof(hvm_domain_context_t));
> + hvm_put_32u(h, HVM_FILE_MAGIC);
> + hvm_put_32u(h, HVM_FILE_VERSION);
> +
> + /* save xen changeset */
> + chgset = strrchr(XEN_CHANGESET, ' ') + 1;
> +
> + len = strlen(chgset);
> + hvm_put_8u(h, len);
> + hvm_put_buffer(h, chgset, len);
> +
> + /* save cpuid */
> + cpuid(1, &eax, &ebx, &ecx, &edx);
> + hvm_put_32u(h, eax);
> +
> + for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se =
se->next) {
> + /* ID string */
> + len = strnlen(se->idstr, HVM_SE_IDSTR_LEN);
> + hvm_put_8u(h, len);
> + hvm_put_buffer(h, se->idstr, len);
> +
> + hvm_put_32u(h, se->instance_id);
> + hvm_put_32u(h, se->version_id);
> +
> + /* record size */
> + len_pos = hvm_ctxt_tell(h);
> + hvm_put_32u(h, 0);
> +
> + se->save_state(h, se->opaque);
> +
> + cur_pos = hvm_ctxt_tell(h);
> + len = cur_pos - len_pos - 4;
> + hvm_ctxt_seek(h, len_pos);
> + hvm_put_32u(h, len);
> + hvm_ctxt_seek(h, cur_pos);
> +
> + }
> +
> + h->size = hvm_ctxt_tell(h);
> + hvm_ctxt_seek(h, 0);
> +
> + if (h->size >= HVM_CTXT_SIZE) {
> + printk("hvm_domain_context overflow when hvm_save! need
%"PRId32" bytes for use.\n", h->size);
> + return -1;
> + }
> +
> + return 0;
> +
> +}
> +
> +static HVMStateEntry *find_se(struct domain *d, const char *idstr, int
instance_id)
> +{
> + HVMStateEntry *se;
> +
> + for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next)
{
> + if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) &&
> + instance_id == se->instance_id){
> + return se;
> + }
> + }
> + return NULL;
> +}
> +
> +int hvm_load(struct vcpu *v, hvm_domain_context_t *h)
> +{
> + uint32_t len, rec_len, rec_pos, magic, instance_id, version_id;
> + uint32_t eax, ebx, ecx, edx;
> + HVMStateEntry *se;
> + char idstr[HVM_SE_IDSTR_LEN];
> + xen_changeset_info_t chgset;
> + char *cur_chgset;
> + int ret;
> +
> + if (!is_hvm_vcpu(v)) {
> + printk("hvm_load only for hvm guest!\n");
> + return -1;
> + }
> +
> + if (h->size >= HVM_CTXT_SIZE) {
> + printk("hvm_load fail! seems hvm_domain_context overflow when
hvm_save! need %"PRId32" bytes.\n", h->size);
> + return -1;
> + }
> +
> + hvm_ctxt_seek(h, 0);
> +
> + magic = hvm_get_32u(h);
> + if (magic != HVM_FILE_MAGIC) {
> + printk("HVM restore magic dismatch!\n");
> + return -1;
> + }
> +
> + magic = hvm_get_32u(h);
> + if (magic != HVM_FILE_VERSION) {
> + printk("HVM restore version dismatch!\n");
> + return -1;
> + }
> +
> + /* check xen change set */
> + cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1;
> +
> + len = hvm_get_8u(h);
> + if (len > 20) { /*typical length is 18 -- "revision
number:changeset id" */
> + printk("wrong change set length %d when hvm restore!\n",
len);
> + return -1;
> + }
> +
> + hvm_get_buffer(h, chgset, len);
> + chgset[len] = '\0';
> + if (strncmp(cur_chgset, chgset, len + 1))
> + printk("warnings: try to restore hvm guest(%s) on a different
changeset %s.\n",
> + chgset, cur_chgset);
> +
> + /* check cpuid */
> + cpuid(1, &eax, &ebx, &ecx, &edx);
> + ebx = hvm_get_32u(h);
> + /* TODO: need to define how big a difference is acceptable */
> + if (ebx != eax)
> + printk("warnings: try to restore hvm
guest(0x%"PRIx32") "
> + "on a different type
processor(0x%"PRIx32").\n",
> + ebx,
> + eax);
> +
> + while(1) {
> + if (hvm_ctxt_end(h)) {
> + break;
> + }
> +
> + /* ID string */
> + len = hvm_get_8u(h);
> + if (len > HVM_SE_IDSTR_LEN) {
> + printk("wrong HVM save entry idstr len %d!", len);
> + return -1;
> + }
> +
> + hvm_get_buffer(h, idstr, len);
> + idstr[len] = '\0';
> +
> + instance_id = hvm_get_32u(h);
> + version_id = hvm_get_32u(h);
> +
> + rec_len = hvm_get_32u(h);
> + rec_pos = hvm_ctxt_tell(h);
> +
> + se = find_se(v->domain, idstr, instance_id);
> + if (se == NULL) {
> + printk("warnings: hvm load can't find device
%s's instance %d!\n",
> + idstr, instance_id);
> + } else {
> + ret = se->load_state(h, se->opaque, version_id);
> + if (ret < 0)
> + printk("warnings: loading state fail for device %s
instance %d!\n",
> + idstr, instance_id);
> + }
> +
> +
> + /* make sure to jump end of record */
> + if ( hvm_ctxt_tell(h) - rec_pos != rec_len) {
> + printk("wrong hvm record size, maybe some dismatch
between save&restore handler!\n");
> + }
> + hvm_ctxt_seek(h, rec_pos + rec_len);
> + }
> +
> + return 0;
> +}
> +
> +#ifdef HVM_DEBUG_SUSPEND
> +static void shpage_info(shared_iopage_t *sh)
> +{
> +
> + vcpu_iodata_t *p = &sh->vcpu_iodata[0];
> + ioreq_t *req = &p->vp_ioreq;
> + printk("*****sharepage_info******!\n");
> + printk("vp_eport=%d\n", p->vp_eport);
> + printk("io packet: "
> + "state:%x, pvalid: %x, dir:%x, port:
%"PRIx64", "
> + "data: %"PRIx64", count:
%"PRIx64", size: %"PRIx64"\n",
> + req->state, req->data_is_ptr, req->dir,
req->addr,
> + req->data, req->count, req->size);
> +}
> +#else
> +static void shpage_info(shared_iopage_t *sh)
> +{
> +}
> +#endif
> +
> +static void shpage_save(hvm_domain_context_t *h, void *opaque)
> +{
> + /* XXX:no action required for shpage save/restore, since it's
in guest memory
> + * keep it for debug purpose only */
> +
> +#if 0
> + struct shared_iopage *s = opaque;
> + /* XXX:smp */
> + struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
> +
> + shpage_info(s);
> +
> + hvm_put_buffer(h, (char*)req, sizeof(struct ioreq));
> +#endif
> +}
> +
> +static int shpage_load(hvm_domain_context_t *h, void *opaque, int
version_id)
> +{
> + struct shared_iopage *s = opaque;
> +#if 0
> + /* XXX:smp */
> + struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
> +
> + if (version_id != 1)
> + return -EINVAL;
> +
> + hvm_get_buffer(h, (char*)req, sizeof(struct ioreq));
> +
> +
> +#endif
> + shpage_info(s);
> + return 0;
> +}
> +
> +void shpage_init(struct domain *d, shared_iopage_t *sp)
> +{
> + hvm_register_savevm(d, "xen_hvm_shpage", 0x10, 1,
shpage_save, shpage_load, sp);
> }
>
> /*
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vioapic.c
> --- a/xen/arch/x86/hvm/vioapic.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/vioapic.c Wed Dec 13 22:52:02 2006 +0800
> @@ -466,10 +466,138 @@ void vioapic_update_EOI(struct domain *d
> spin_unlock(&hvm_irq->lock);
> }
>
> +#ifdef HVM_DEBUG_SUSPEND
> +static void ioapic_info(struct vioapic *s)
> +{
> + int i;
> + printk("*****ioapic state:*****\n");
> + printk("ioapic 0x%x.\n", s->ioregsel);
> + printk("ioapic 0x%x.\n", s->id);
> + printk("ioapic 0x%lx.\n", s->base_address);
> + for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
> + printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i,
s->redirtbl[i].bits);
> + }
> +
> +}
> +static void hvmirq_info(struct hvm_irq *hvm_irq)
> +{
> + int i;
> + printk("*****hvmirq state:*****\n");
> + for (i = 0; i < BITS_TO_LONGS(32*4); i++)
> + printk("hvmirq pci_intx[%d]:0x%lx.\n", i,
hvm_irq->pci_intx[i]);
> +
> + for (i = 0; i < BITS_TO_LONGS(16); i++)
> + printk("hvmirq isa_irq[%d]:0x%lx.\n", i,
hvm_irq->isa_irq[i]);
> +
> + for (i = 0; i < BITS_TO_LONGS(1); i++)
> + printk("hvmirq callback_irq_wire[%d]:0x%lx.\n", i,
hvm_irq->callback_irq_wire[i]);
> +
> + printk("hvmirq callback_gsi:0x%x.\n",
hvm_irq->callback_gsi);
> +
> + for (i = 0; i < 4; i++)
> + printk("hvmirq
pci_link_route[%d]:0x%"PRIx8".\n", i,
hvm_irq->pci_link_route[i]);
> +
> + for (i = 0; i < 4; i++)
> + printk("hvmirq
pci_link_assert_count[%d]:0x%"PRIx8".\n", i,
hvm_irq->pci_link_assert_count[i]);
> +
> + for (i = 0; i < 4; i++)
> + printk("hvmirq
gsi_assert_count[%d]:0x%"PRIx8".\n", i,
hvm_irq->gsi_assert_count[i]);
> +
> + printk("hvmirq
round_robin_prev_vcpu:0x%"PRIx8".\n",
hvm_irq->round_robin_prev_vcpu);
> +}
> +#else
> +static void ioapic_info(struct vioapic *s)
> +{
> +}
> +static void hvmirq_info(struct hvm_irq *hvm_irq)
> +{
> +}
> +#endif
> +
> +static void ioapic_save(hvm_domain_context_t *h, void *opaque)
> +{
> + int i;
> + struct domain *d = opaque;
> + struct vioapic *s = domain_vioapic(d);
> + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
> +
> + ioapic_info(s);
> + hvmirq_info(hvm_irq);
> +
> + /* save ioapic state */
> + hvm_put_32u(h, s->ioregsel);
> + hvm_put_32u(h, s->id);
> + hvm_put_64u(h, s->base_address);
> + for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
> + hvm_put_64u(h, s->redirtbl[i].bits);
> + }
> +
> + /* save hvm irq state */
> + hvm_put_buffer(h, (char*)hvm_irq->pci_intx, 16);
> + hvm_put_buffer(h, (char*)hvm_irq->isa_irq, 2);
> + hvm_put_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
> + hvm_put_32u(h, hvm_irq->callback_gsi);
> +
> + for (i = 0; i < 4; i++)
> + hvm_put_8u(h, hvm_irq->pci_link_route[i]);
> +
> + for (i = 0; i < 4; i++)
> + hvm_put_8u(h, hvm_irq->pci_link_assert_count[i]);
> +
> + for (i = 0; i < VIOAPIC_NUM_PINS; i++)
> + hvm_put_8u(h, hvm_irq->gsi_assert_count[i]);
> +
> + hvm_put_8u(h, hvm_irq->round_robin_prev_vcpu);
> +
> +}
> +
> +static int ioapic_load(hvm_domain_context_t *h, void *opaque, int
version_id)
> +{
> + int i;
> + struct domain *d = opaque;
> + struct vioapic *s = domain_vioapic(d);
> + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
> +
> + if (version_id != 1)
> + return -EINVAL;
> +
> + /* restore ioapic state */
> + s->ioregsel = hvm_get_32u(h);
> + s->id = hvm_get_32u(h);
> + s->base_address = hvm_get_64u(h);
> + for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
> + s->redirtbl[i].bits = hvm_get_64u(h);
> + }
> +
> + /* restore irq state */
> + hvm_get_buffer(h, (char*)hvm_irq->pci_intx, 16);
> + hvm_get_buffer(h, (char*)hvm_irq->isa_irq, 2);
> + hvm_get_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
> + hvm_irq->callback_gsi = hvm_get_32u(h);
> +
> + for (i = 0; i < 4; i++)
> + hvm_irq->pci_link_route[i] = hvm_get_8u(h);
> +
> + for (i = 0; i < 4; i++)
> + hvm_irq->pci_link_assert_count[i] = hvm_get_8u(h);
> +
> + for (i = 0; i < VIOAPIC_NUM_PINS; i++)
> + hvm_irq->gsi_assert_count[i] = hvm_get_8u(h);
> +
> + hvm_irq->round_robin_prev_vcpu = hvm_get_8u(h);
> +
> + ioapic_info(s);
> + hvmirq_info(hvm_irq);
> +
> + return 0;
> +}
> +
> void vioapic_init(struct domain *d)
> {
> struct vioapic *vioapic = domain_vioapic(d);
> int i;
> +
> + hvm_register_savevm(d, "xen_hvm_ioapic", 0, 1, ioapic_save,
ioapic_load, d);
>
> memset(vioapic, 0, sizeof(*vioapic));
> for ( i = 0; i < VIOAPIC_NUM_PINS; i++ )
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vlapic.c
> --- a/xen/arch/x86/hvm/vlapic.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/vlapic.c Wed Dec 13 22:52:02 2006 +0800
> @@ -921,6 +921,82 @@ static int vlapic_reset(struct vlapic *v
> return 1;
> }
>
> +#ifdef HVM_DEBUG_SUSPEND
> +static void lapic_info(struct vlapic *s)
> +{
> + printk("*****lapic state:*****\n");
> + printk("lapic 0x%"PRIx64".\n",
s->apic_base_msr);
> + printk("lapic 0x%x.\n", s->disabled);
> + printk("lapic 0x%x.\n", s->timer_divisor);
> + printk("lapic 0x%x.\n", s->timer_pending_count);
> +}
> +#else
> +static void lapic_info(struct vlapic *s)
> +{
> +}
> +#endif
> +
> +static void lapic_save(hvm_domain_context_t *h, void *opaque)
> +{
> + struct vlapic *s = opaque;
> +
> + lapic_info(s);
> +
> + hvm_put_64u(h, s->apic_base_msr);
> + hvm_put_32u(h, s->disabled);
> + hvm_put_32u(h, s->timer_divisor);
> +
> + /*XXX: need this?*/
> + hvm_put_32u(h, s->timer_pending_count);
> +
> + hvm_put_buffer(h, (char*)s->regs, 0x3f0);
> +
> +}
> +
> +static int lapic_load(hvm_domain_context_t *h, void *opaque, int
version_id)
> +{
> + struct vlapic *s = opaque;
> + uint32_t tmict;
> +
> + if (version_id != 1)
> + return -EINVAL;
> +
> + s->apic_base_msr = hvm_get_64u(h);
> + s->disabled = hvm_get_32u(h);
> + s->timer_divisor = hvm_get_32u(h);
> +
> + /*XXX: need this?*/
> + s->timer_pending_count = hvm_get_32u(h);
> +
> + hvm_get_buffer(h, (char*)s->regs, 0x3f0);
> +
> + /* rearm the actimer if needed */
> + tmict = vlapic_get_reg(s, APIC_TMICT);
> + if (tmict > 0) {
> + s_time_t now = NOW(), offset;
> + stop_timer(&s->vlapic_timer);
> + vlapic_set_reg(s, APIC_TMCCT, tmict);
> + s->timer_last_update = now;
> +
> + offset = APIC_BUS_CYCLE_NS * s->timer_divisor * tmict;
> +
> + set_timer(&s->vlapic_timer, now + offset);
> +
> + printk("lapic_load to rearm the actimer:"
> + "bus cycle is %"PRId64"ns, now
0x%016"PRIx64", "
> + "timer initial count 0x%x, offset
0x%016"PRIx64", "
> + "expire @ 0x%016"PRIx64".",
> + APIC_BUS_CYCLE_NS, now,
> + vlapic_get_reg(s, APIC_TMICT),
> + offset, now + offset);
> + }
> +
> +
> + lapic_info(s);
> +
> + return 0;
> +}
> +
> int vlapic_init(struct vcpu *v)
> {
> struct vlapic *vlapic = vcpu_vlapic(v);
> @@ -939,6 +1015,7 @@ int vlapic_init(struct vcpu *v)
> vlapic->regs =
map_domain_page_global(page_to_mfn(vlapic->regs_page));
> memset(vlapic->regs, 0, PAGE_SIZE);
>
> + hvm_register_savevm(v->domain, "xen_hvm_lapic",
v->vcpu_id, 1, lapic_save, lapic_load, vlapic);
> vlapic_reset(vlapic);
>
> vlapic->apic_base_msr = MSR_IA32_APICBASE_ENABLE |
APIC_DEFAULT_PHYS_BASE;
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Dec 13 22:52:02 2006 +0800
> @@ -426,6 +426,319 @@ static void vmx_store_cpu_guest_regs(
> vmx_vmcs_exit(v);
> }
>
> +static int __get_instruction_length(void);
> +int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c)
> +{
> + unsigned long inst_len;
> +
> + inst_len = __get_instruction_length();
> + c->eip = __vmread(GUEST_RIP);
> +
> +#ifdef HVM_DEBUG_SUSPEND
> + printk("vmx_vmcs_save: inst_len=0x%lx,
eip=0x%"PRIx64".\n",
> + inst_len, c->eip);
> +#endif
> +
> + c->esp = __vmread(GUEST_RSP);
> + c->eflags = __vmread(GUEST_RFLAGS);
> +
> + c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
> + c->cr3 = v->arch.hvm_vmx.cpu_cr3;
> + c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
> +
> +#ifdef HVM_DEBUG_SUSPEND
> + printk("vmx_vmcs_save: cr3=0x%"PRIx64",
cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
> + c->cr3,
> + c->cr0,
> + c->cr4);
> +#endif
> +
> + c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
> + c->idtr_base = __vmread(GUEST_IDTR_BASE);
> +
> + c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
> + c->gdtr_base = __vmread(GUEST_GDTR_BASE);
> +
> + c->cs_sel = __vmread(GUEST_CS_SELECTOR);
> + c->cs_limit = __vmread(GUEST_CS_LIMIT);
> + c->cs_base = __vmread(GUEST_CS_BASE);
> + c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
> +
> + c->ds_sel = __vmread(GUEST_DS_SELECTOR);
> + c->ds_limit = __vmread(GUEST_DS_LIMIT);
> + c->ds_base = __vmread(GUEST_DS_BASE);
> + c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
> +
> + c->es_sel = __vmread(GUEST_ES_SELECTOR);
> + c->es_limit = __vmread(GUEST_ES_LIMIT);
> + c->es_base = __vmread(GUEST_ES_BASE);
> + c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
> +
> + c->ss_sel = __vmread(GUEST_SS_SELECTOR);
> + c->ss_limit = __vmread(GUEST_SS_LIMIT);
> + c->ss_base = __vmread(GUEST_SS_BASE);
> + c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
> +
> + c->fs_sel = __vmread(GUEST_FS_SELECTOR);
> + c->fs_limit = __vmread(GUEST_FS_LIMIT);
> + c->fs_base = __vmread(GUEST_FS_BASE);
> + c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
> +
> + c->gs_sel = __vmread(GUEST_GS_SELECTOR);
> + c->gs_limit = __vmread(GUEST_GS_LIMIT);
> + c->gs_base = __vmread(GUEST_GS_BASE);
> + c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
> +
> + c->tr_sel = __vmread(GUEST_TR_SELECTOR);
> + c->tr_limit = __vmread(GUEST_TR_LIMIT);
> + c->tr_base = __vmread(GUEST_TR_BASE);
> + c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
> +
> + c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
> + c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
> + c->ldtr_base = __vmread(GUEST_LDTR_BASE);
> + c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
> +
> + c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
> + c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
> + c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
> +
> + return 1;
> +}
> +
> +int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c)
> +{
> + unsigned long mfn, old_cr4, old_base_mfn;
> + int error = 0;
> +
> + __vmwrite(GUEST_RIP, c->eip);
> + __vmwrite(GUEST_RSP, c->esp);
> + __vmwrite(GUEST_RFLAGS, c->eflags);
> +
> + v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
> + __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
> +
> + old_cr4 = __vmread(CR4_READ_SHADOW);
> + __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
> +
> + v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
> + __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
> +
> +#ifdef HVM_DEBUG_SUSPEND
> + printk("vmx_vmcs_restore: cr3=0x%"PRIx64",
cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
> + c->cr3,
> + c->cr0,
> + c->cr4);
> +#endif
> +
> + if (!vmx_paging_enabled(v)) {
> + HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys
table");
> + __vmwrite(GUEST_CR3,
pagetable_get_paddr(v->domain->arch.phys_table));
> + goto skip_cr3;
> + }
> +
> + if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
> + /*
> + * This is simple TLB flush, implying the guest has
> + * removed some translation or changed page attributes.
> + * We simply invalidate the shadow.
> + */
> + mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
> + if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
> + printk("Invalid CR3 value=%"PRIx64"",
c->cr3);
> + domain_crash(v->domain);
> + return 0;
> + }
> + } else {
> + /*
> + * If different, make a shadow. Check if the PDBR is valid
> + * first.
> + */
> + HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 =
%"PRIx64"", c->cr3);
> + if ((c->cr3 >> PAGE_SHIFT) >
v->domain->max_pages) {
> + printk("Invalid CR3 value=%"PRIx64"",
c->cr3);
> + domain_crash(v->domain);
> + return 0;
> + }
> +
> + /* current!=vcpu as not called by arch_vmx_do_launch */
> + mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
> + if(!get_page(mfn_to_page(mfn), v->domain)) {
> + struct page_info *page = mfn_to_page(mfn);
> + printk("get_page for mfn failed. CR3
value=%"PRIx64", count_info=0x%"PRIx32", type_info=0x%lx,
owner=%d.\n", c->cr3,
> + page->count_info,
> + page->u.inuse.type_info,
> + page->u.inuse._domain);
> + return 0;
> + }
> + old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
> + v->arch.guest_table = pagetable_from_pfn(mfn);
> + if (old_base_mfn)
> + put_page(mfn_to_page(old_base_mfn));
> + /*
> + * arch.shadow_table should now hold the next CR3 for shadow
> + */
> + v->arch.hvm_vmx.cpu_cr3 = c->cr3;
> + }
> +
> + skip_cr3:
> +#if defined(__x86_64__)
> + if (vmx_long_mode_enabled(v)) {
> + unsigned long vm_entry_value;
> + vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
> + vm_entry_value |= VM_ENTRY_IA32E_MODE;
> + __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
> + }
> +#endif
> +
> + shadow_update_paging_modes(v);
> + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
> +
> + __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
> + __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
> +
> + __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
> + __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
> +
> + __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
> + __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
> + __vmwrite(GUEST_CS_BASE, c->cs_base);
> + __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
> +
> + __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
> + __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
> + __vmwrite(GUEST_DS_BASE, c->ds_base);
> + __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
> +
> + __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
> + __vmwrite(GUEST_ES_LIMIT, c->es_limit);
> + __vmwrite(GUEST_ES_BASE, c->es_base);
> + __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
> +
> + __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
> + __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
> + __vmwrite(GUEST_SS_BASE, c->ss_base);
> + __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
> +
> + __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
> + __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
> + __vmwrite(GUEST_FS_BASE, c->fs_base);
> + __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
> +
> + __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
> + __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
> + __vmwrite(GUEST_GS_BASE, c->gs_base);
> + __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
> +
> + __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
> + __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
> + __vmwrite(GUEST_TR_BASE, c->tr_base);
> + __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
> +
> + __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
> + __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
> + __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
> + __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
> +
> + __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
> + __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
> + __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
> +
> + return !error;
> +}
> +
> +#ifdef HVM_DEBUG_SUSPEND
> +static void dump_msr_state(struct vmx_msr_state *m)
> +{
> + int i = 0;
> + printk("**** msr state ****\n");
> + printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:",
m->shadow_gs, m->flags);
> + for (i = 0; i < VMX_MSR_COUNT; i++)
> + printk("0x%lx,", m->msrs[i]);
> + printk("\n");
> +}
> +#else
> +static void dump_msr_state(struct vmx_msr_state *m)
> +{
> +}
> +#endif
> +
> +void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + struct vmcs_data *data = &ctxt->data;
> + struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
> + unsigned long guest_flags = guest_state->flags;
> + int i = 0;
> +
> + data->shadow_gs = guest_state->shadow_gs;
> + data->vmxassist_enabled = v->arch.hvm_vmx.vmxassist_enabled;
> + /* save msrs */
> + data->flags = guest_flags;
> + for (i = 0; i < VMX_MSR_COUNT; i++)
> + data->msr_items[i] = guest_state->msrs[i];
> +
> + dump_msr_state(guest_state);
> +}
> +
> +void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + int i = 0;
> + struct vmcs_data *data = &ctxt->data;
> + struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
> +
> + /* restore msrs */
> + guest_state->flags = data->flags;
> + for (i = 0; i < VMX_MSR_COUNT; i++)
> + guest_state->msrs[i] = data->msr_items[i];
> +
> + guest_state->shadow_gs = data->shadow_gs;
> +
> + /*XXX:no need to restore msrs, current!=vcpu as not called by
arch_vmx_do_launch */
> +/* vmx_restore_guest_msrs(v);*/
> +
> + v->arch.hvm_vmx.vmxassist_enabled = data->vmxassist_enabled;
> +
> + dump_msr_state(guest_state);
> +}
> +
> +void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + struct vmcs_data *data = &ctxt->data;
> +
> + /* set valid flag to recover whole vmcs when restore */
> + ctxt->valid = 1;
> +
> + vmx_save_cpu_state(v, ctxt);
> +
> + vmx_vmcs_enter(v);
> +
> + vmx_vmcs_save(v, data);
> +
> + vmx_vmcs_exit(v);
> +
> +}
> +
> +void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + if (!ctxt->valid)
> + return;
> +
> + vmx_load_cpu_state(v, ctxt);
> +
> + vmx_vmcs_enter(v);
> +
> + if (!vmx_vmcs_restore(v, &ctxt->data)) {
> + printk("vmx_vmcs restore failed!\n");
> + domain_crash(v->domain);
> + }
> +
> + /* only load vmcs once */
> + ctxt->valid = 0;
> +
> + vmx_vmcs_exit(v);
> +
> +}
> +
> /*
> * The VMX spec (section 4.3.1.2, Checks on Guest Segment
> * Registers) says that virtual-8086 mode guests' segment
> @@ -737,6 +1050,9 @@ static void vmx_setup_hvm_funcs(void)
>
> hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
> hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
> +
> + hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
> + hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
>
> hvm_funcs.paging_enabled = vmx_paging_enabled;
> hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vpic.c
> --- a/xen/arch/x86/hvm/vpic.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/hvm/vpic.c Wed Dec 13 22:52:02 2006 +0800
> @@ -378,6 +378,87 @@ static int vpic_intercept_elcr_io(ioreq_
> return 1;
> }
>
> +#ifdef HVM_DEBUG_SUSPEND
> +static void vpic_info(struct vpic *s)
> +{
> + printk("*****pic state:*****\n");
> + printk("pic 0x%x.\n", s->irr);
> + printk("pic 0x%x.\n", s->imr);
> + printk("pic 0x%x.\n", s->isr);
> + printk("pic 0x%x.\n", s->irq_base);
> + printk("pic 0x%x.\n", s->init_state);
> + printk("pic 0x%x.\n", s->priority_add);
> + printk("pic 0x%x.\n", s->readsel_isr);
> + printk("pic 0x%x.\n", s->poll);
> + printk("pic 0x%x.\n", s->auto_eoi);
> + printk("pic 0x%x.\n", s->rotate_on_auto_eoi);
> + printk("pic 0x%x.\n", s->special_fully_nested_mode);
> + printk("pic 0x%x.\n", s->special_mask_mode);
> + printk("pic 0x%x.\n", s->elcr);
> + printk("pic 0x%x.\n", s->int_output);
> + printk("pic 0x%x.\n", s->is_master);
> +}
> +#else
> +static void vpic_info(struct vpic *s)
> +{
> +}
> +#endif
> +
> +static void vpic_save(hvm_domain_context_t *h, void *opaque)
> +{
> + struct vpic *s = opaque;
> +
> + vpic_info(s);
> +
> + hvm_put_8u(h, s->irr);
> + hvm_put_8u(h, s->imr);
> + hvm_put_8u(h, s->isr);
> + hvm_put_8u(h, s->irq_base);
> + hvm_put_8u(h, s->init_state);
> + hvm_put_8u(h, s->priority_add);
> + hvm_put_8u(h, s->readsel_isr);
> +
> + hvm_put_8u(h, s->poll);
> + hvm_put_8u(h, s->auto_eoi);
> +
> + hvm_put_8u(h, s->rotate_on_auto_eoi);
> + hvm_put_8u(h, s->special_fully_nested_mode);
> + hvm_put_8u(h, s->special_mask_mode);
> +
> + hvm_put_8u(h, s->elcr);
> + hvm_put_8u(h, s->int_output);
> +}
> +
> +static int vpic_load(hvm_domain_context_t *h, void *opaque, int
version_id)
> +{
> + struct vpic *s = opaque;
> +
> + if (version_id != 1)
> + return -EINVAL;
> +
> + s->irr = hvm_get_8u(h);
> + s->imr = hvm_get_8u(h);
> + s->isr = hvm_get_8u(h);
> + s->irq_base = hvm_get_8u(h);
> + s->init_state = hvm_get_8u(h);
> + s->priority_add = hvm_get_8u(h);
> + s->readsel_isr = hvm_get_8u(h);
> +
> + s->poll = hvm_get_8u(h);
> + s->auto_eoi = hvm_get_8u(h);
> +
> + s->rotate_on_auto_eoi = hvm_get_8u(h);
> + s->special_fully_nested_mode = hvm_get_8u(h);
> + s->special_mask_mode = hvm_get_8u(h);
> +
> + s->elcr = hvm_get_8u(h);
> + s->int_output = hvm_get_8u(h);
> +
> + vpic_info(s);
> +
> + return 0;
> +}
> +
> void vpic_init(struct domain *d)
> {
> struct vpic *vpic;
> @@ -387,12 +468,14 @@ void vpic_init(struct domain *d)
> memset(vpic, 0, sizeof(*vpic));
> vpic->is_master = 1;
> vpic->elcr = 1 << 2;
> + hvm_register_savevm(d, "xen_hvm_i8259", 0x20, 1, vpic_save,
vpic_load, vpic);
> register_portio_handler(d, 0x20, 2, vpic_intercept_pic_io);
> register_portio_handler(d, 0x4d0, 1, vpic_intercept_elcr_io);
>
> /* Slave PIC. */
> vpic++;
> memset(vpic, 0, sizeof(*vpic));
> + hvm_register_savevm(d, "xen_hvm_i8259", 0xa0, 1, vpic_save,
vpic_load, vpic);
> register_portio_handler(d, 0xa0, 2, vpic_intercept_pic_io);
> register_portio_handler(d, 0x4d1, 1, vpic_intercept_elcr_io);
> }
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/common.c
> --- a/xen/arch/x86/mm/shadow/common.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/mm/shadow/common.c Wed Dec 13 22:52:02 2006 +0800
> @@ -2145,7 +2145,7 @@ int shadow_remove_all_mappings(struct vc
> /* Don't complain if we're in HVM and there's one extra mapping:
> * The qemu helper process has an untyped mapping of this dom's RAM */
> if ( !(shadow_mode_external(v->domain)
> - && (page->count_info & PGC_count_mask) <=
2
> + && (page->count_info & PGC_count_mask) <=
3 /* vmx restore add one extra mapping*/
> && (page->u.inuse.type_info &
PGT_count_mask) == 0) )
> {
> SHADOW_ERROR("can''t find all mappings of mfn
%lx: "
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/multi.c
> --- a/xen/arch/x86/mm/shadow/multi.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/arch/x86/mm/shadow/multi.c Wed Dec 13 22:52:02 2006 +0800
> @@ -1613,6 +1613,14 @@ sh_make_shadow(struct vcpu *v, mfn_t gmf
> }
> }
>
> + {
> + struct page_info *page = mfn_to_page(gmfn);
> + /* XXX: add it to emulate a touched page */
> + if ((page->u.inuse.type_info & PGT_type_mask) == PGT_none){
> + page->u.inuse.type_info |= (PGT_writable_page |
PGT_validated);
> + }
> + }
> +
> shadow_promote(v, gmfn, shadow_type);
> set_shadow_status(v, gmfn, shadow_type, smfn);
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domain.c
> --- a/xen/common/domain.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/common/domain.c Wed Dec 13 22:52:02 2006 +0800
> @@ -24,6 +24,7 @@
> #include <xen/percpu.h>
> #include <xen/multicall.h>
> #include <asm/debugger.h>
> +#include <asm/hvm/support.h>
> #include <public/sched.h>
> #include <public/vcpu.h>
>
> @@ -454,8 +455,14 @@ int set_info_guest(struct domain *d,
> domain_pause(d);
>
> rc = -EFAULT;
> - if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 )
> + if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 ) {
> rc = arch_set_info_guest(v, c);
> + if ( v->vcpu_id != 0 &&
> + is_hvm_vcpu(v) &&
> + test_and_clear_bit(_VCPUF_down, &v->vcpu_flags) ) {
> + vcpu_wake(v);
> + }
> + }
>
> domain_unpause(d);
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domctl.c
> --- a/xen/common/domctl.c Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/common/domctl.c Wed Dec 13 22:52:02 2006 +0800
> @@ -26,6 +26,10 @@ extern long arch_do_domctl(
> struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
> extern void arch_getdomaininfo_ctxt(
> struct vcpu *, struct vcpu_guest_context *);
> +extern int arch_gethvm_ctxt(
> + struct vcpu *, struct hvm_domain_context *);
> +extern int arch_sethvm_ctxt(
> + struct vcpu *, struct hvm_domain_context *);
>
> void cpumask_to_xenctl_cpumap(
> struct xenctl_cpumap *xenctl_cpumap, cpumask_t *cpumask)
> @@ -205,6 +209,37 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
> }
> break;
>
> + case XEN_DOMCTL_sethvmcontext:
> + {
> + struct hvm_domain_context *c;
> + struct domain *d;
> + struct vcpu *v;
> +
> + ret = -ESRCH;
> + if ( (d = find_domain_by_id(op->domain)) == NULL )
> + break;
> +
> + ret = -ENOMEM;
> + if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
> + goto sethvmcontext_out;
> +
> + /*XXX: need check input vcpu when smp */
> + v = d->vcpu[0];
> +
> + ret = -EFAULT;
> + if ( copy_from_guest(c, op->u.hvmcontext.ctxt, 1) != 0 )
> + goto sethvmcontext_out;
> +
> + ret = arch_sethvm_ctxt(v, c);
> +
> + xfree(c);
> +
> + sethvmcontext_out:
> + put_domain(d);
> +
> + }
> + break;
> +
> case XEN_DOMCTL_pausedomain:
> {
> struct domain *d = find_domain_by_id(op->domain);
> @@ -489,6 +524,44 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
>
> getvcpucontext_out:
> put_domain(d);
> + }
> + break;
> +
> + case XEN_DOMCTL_gethvmcontext:
> + {
> + struct hvm_domain_context *c;
> + struct domain *d;
> + struct vcpu *v;
> +
> + ret = -ESRCH;
> + if ( (d = find_domain_by_id(op->domain)) == NULL )
> + break;
> +
> + ret = -ENOMEM;
> + if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
> + goto gethvmcontext_out;
> +
> + v = d->vcpu[0];
> +
> + ret = -ENODATA;
> + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
> + goto gethvmcontext_out;
> +
> + ret = 0;
> + if (arch_gethvm_ctxt(v, c) == -1)
> + ret = -EFAULT;
> +
> + if ( copy_to_guest(op->u.hvmcontext.ctxt, c, 1) )
> + ret = -EFAULT;
> +
> + xfree(c);
> +
> + if ( copy_to_guest(u_domctl, op, 1) )
> + ret = -EFAULT;
> +
> + gethvmcontext_out:
> + put_domain(d);
> +
> }
> break;
>
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/domain.h
> --- a/xen/include/asm-x86/hvm/domain.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/asm-x86/hvm/domain.h Wed Dec 13 22:52:02 2006 +0800
> @@ -27,6 +27,20 @@
> #include <asm/hvm/io.h>
> #include <public/hvm/params.h>
>
> +typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque);
> +typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int
version_id);
> +
> +#define HVM_SE_IDSTR_LEN 32
> +typedef struct HVMStateEntry {
> + char idstr[HVM_SE_IDSTR_LEN];
> + int instance_id;
> + int version_id;
> + SaveStateHandler *save_state;
> + LoadStateHandler *load_state;
> + void *opaque;
> + struct HVMStateEntry *next;
> +} HVMStateEntry;
> +
> struct hvm_domain {
> unsigned long shared_page_va;
> unsigned long buffered_io_va;
> @@ -44,6 +58,9 @@ struct hvm_domain {
> spinlock_t pbuf_lock;
>
> uint64_t params[HVM_NR_PARAMS];
> +
> + struct hvm_domain_context *hvm_ctxt;
> + HVMStateEntry *first_se;
> };
>
> #endif /* __ASM_X86_HVM_DOMAIN_H__ */
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/hvm.h
> --- a/xen/include/asm-x86/hvm/hvm.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/asm-x86/hvm/hvm.h Wed Dec 13 22:52:02 2006 +0800
> @@ -79,6 +79,13 @@ struct hvm_function_table {
> struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs);
> void (*load_cpu_guest_regs)(
> struct vcpu *v, struct cpu_user_regs *r);
> +
> + /* save and load hvm guest cpu context for save/restore */
> + void (*save_cpu_ctxt)(
> + struct vcpu *v, struct hvmcpu_context *ctxt);
> + void (*load_cpu_ctxt)(
> + struct vcpu *v, struct hvmcpu_context *ctxt);
> +
> /*
> * Examine specifics of the guest state:
> * 1) determine whether paging is enabled,
> @@ -152,6 +159,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v,
> hvm_funcs.load_cpu_guest_regs(v, r);
> }
>
> +static inline void
> +hvm_save_cpu_context(
> + struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + hvm_funcs.save_cpu_ctxt(v, ctxt);
> +}
> +
> +static inline void
> +hvm_load_cpu_context(
> + struct vcpu *v, struct hvmcpu_context *ctxt)
> +{
> + hvm_funcs.load_cpu_ctxt(v, ctxt);
> +}
> +
> static inline int
> hvm_paging_enabled(struct vcpu *v)
> {
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/support.h
> --- a/xen/include/asm-x86/hvm/support.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/asm-x86/hvm/support.h Wed Dec 13 22:52:02 2006 +0800
> @@ -121,6 +121,130 @@ extern unsigned int opt_hvm_debug_level;
> #define TRACE_VMEXIT(index, value) \
> current->arch.hvm_vcpu.hvm_trace_values[index] = (value)
>
> +/* save/restore support */
> +
> +//#define HVM_DEBUG_SUSPEND
> +
> +extern int hvm_register_savevm(struct domain *d,
> + const char *idstr,
> + int instance_id,
> + int version_id,
> + SaveStateHandler *save_state,
> + LoadStateHandler *load_state,
> + void *opaque);
> +
> +static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int
pos)
> +{
> + h->cur = pos;
> +}
> +
> +static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h)
> +{
> + return h->cur;
> +}
> +
> +static inline int hvm_ctxt_end(hvm_domain_context_t *h)
> +{
> + return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE);
> +}
> +
> +static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i)
> +{
> + if (h->cur >= HVM_CTXT_SIZE) {
> + h->cur++;
> + return;
> + }
> + h->data[h->cur++] = (char)i;
> +}
> +
> +static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b)
> +{
> + hvm_put_byte(h, b);
> +}
> +
> +static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b)
> +{
> + hvm_put_8u(h, b >> 8);
> + hvm_put_8u(h, b);
> +}
> +
> +static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b)
> +{
> + hvm_put_16u(h, b >> 16);
> + hvm_put_16u(h, b);
> +}
> +
> +static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b)
> +{
> + hvm_put_32u(h, b >> 32);
> + hvm_put_32u(h, b);
> +}
> +
> +static inline void hvm_put_buffer(hvm_domain_context_t *h, const char
*buf, int len)
> +{
> + memcpy(&h->data[h->cur], buf, len);
> + h->cur += len;
> +}
> +
> +
> +static inline char hvm_get_byte(hvm_domain_context_t *h)
> +{
> + if (h->cur >= HVM_CTXT_SIZE) {
> + printk("hvm_get_byte overflow.\n");
> + return -1;
> + }
> +
> + if (h->cur >= h->size) {
> + printk("hvm_get_byte exceed data area.\n");
> + return -1;
> + }
> +
> + return h->data[h->cur++];
> +}
> +
> +static inline uint8_t hvm_get_8u(hvm_domain_context_t *h)
> +{
> + return hvm_get_byte(h);
> +}
> +
> +static inline uint16_t hvm_get_16u(hvm_domain_context_t *h)
> +{
> + uint16_t v;
> + v = hvm_get_8u(h) << 8;
> + v |= hvm_get_8u(h);
> +
> + return v;
> +}
> +
> +static inline uint32_t hvm_get_32u(hvm_domain_context_t *h)
> +{
> + uint32_t v;
> + v = hvm_get_16u(h) << 16;
> + v |= hvm_get_16u(h);
> +
> + return v;
> +}
> +
> +static inline uint64_t hvm_get_64u(hvm_domain_context_t *h)
> +{
> + uint64_t v;
> + v = (uint64_t)hvm_get_32u(h) << 32;
> + v |= hvm_get_32u(h);
> +
> + return v;
> +}
> +
> +static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int
len)
> +{
> + memcpy(buf, &h->data[h->cur], len);
> + h->cur += len;
> +}
> +
> +extern int hvm_save(struct vcpu*, hvm_domain_context_t *h);
> +extern int hvm_load(struct vcpu*, hvm_domain_context_t *h);
> +
> +extern void shpage_init(struct domain *d, shared_iopage_t *sp);
> +
> extern int hvm_enabled;
>
> int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/vpt.h
> --- a/xen/include/asm-x86/hvm/vpt.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/asm-x86/hvm/vpt.h Wed Dec 13 22:52:02 2006 +0800
> @@ -123,7 +123,7 @@ extern void hvm_hooks_assist(struct vcpu
> extern void hvm_hooks_assist(struct vcpu *v);
> extern void pickup_deactive_ticks(struct periodic_time *vpit);
> extern struct periodic_time *create_periodic_time(
> - u32 period, char irq, char one_shot, time_cb *cb, void *data);
> + struct domain* d, u32 period, char irq, char one_shot, time_cb *cb,
void *data);
> extern void destroy_periodic_time(struct periodic_time *pt);
> void pit_init(struct vcpu *v, unsigned long cpu_khz);
> void rtc_init(struct vcpu *v, int base, int irq);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_32.h
> --- a/xen/include/public/arch-x86_32.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/public/arch-x86_32.h Wed Dec 13 22:52:02 2006 +0800
> @@ -181,6 +181,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
> DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
>
> typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
> +
> +#include "vmcs_data.h"
> +
> +struct hvmcpu_context {
> + uint32_t valid;
> + struct vmcs_data data;
> +};
>
> /*
> * The following is all CPU context. Note that the fpu_ctxt block is
filled
> @@ -210,6 +217,7 @@ struct vcpu_guest_context {
> unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback
*/
> unsigned long failsafe_callback_eip;
> unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
> + struct hvmcpu_context hvmcpu_ctxt; /* whole vmcs region */
> };
> typedef struct vcpu_guest_context vcpu_guest_context_t;
> DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_64.h
> --- a/xen/include/public/arch-x86_64.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/public/arch-x86_64.h Wed Dec 13 22:52:02 2006 +0800
> @@ -255,6 +255,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
>
> typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
>
> +#include "vmcs_data.h"
> +
> +struct hvmcpu_context {
> + uint32_t valid;
> + struct vmcs_data data;
> +};
> +
> /*
> * The following is all CPU context. Note that the fpu_ctxt block is
filled
> * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
> @@ -288,6 +295,7 @@ struct vcpu_guest_context {
> uint64_t fs_base;
> uint64_t gs_base_kernel;
> uint64_t gs_base_user;
> + struct hvmcpu_context hvmcpu_ctxt; /* whole vmcs region */
> };
> typedef struct vcpu_guest_context vcpu_guest_context_t;
> DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/domctl.h
> --- a/xen/include/public/domctl.h Fri Sep 15 17:05:38 2006 +0800
> +++ b/xen/include/public/domctl.h Wed Dec 13 22:52:02 2006 +0800
> @@ -384,6 +384,21 @@ struct xen_domctl_settimeoffset {
> };
> typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
> DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
> +
> +#define HVM_CTXT_SIZE 6144
> +typedef struct hvm_domain_context {
> + uint32_t cur;
> + uint32_t size;
> + uint8_t data[HVM_CTXT_SIZE];
> +} hvm_domain_context_t;
> +DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t);
> +
> +#define XEN_DOMCTL_gethvmcontext 33
> +#define XEN_DOMCTL_sethvmcontext 34
> +typedef struct xen_domctl_hvmcontext {
> + XEN_GUEST_HANDLE(hvm_domain_context_t) ctxt; /* IN/OUT */
> +} xen_domctl_hvmcontext_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);
>
> struct xen_domctl {
> uint32_t cmd;
> @@ -410,6 +425,7 @@ struct xen_domctl {
> struct xen_domctl_hypercall_init hypercall_init;
> struct xen_domctl_arch_setup arch_setup;
> struct xen_domctl_settimeoffset settimeoffset;
> + struct xen_domctl_hvmcontext hvmcontext;
> uint8_t pad[128];
> } u;
> };
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_restore.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/tools/libxc/xc_hvm_restore.c Wed Dec 13 22:52:02 2006 +0800
> @@ -0,0 +1,280 @@
>
+/******************************************************************************
> + * xc_hvm_restore.c
> + *
> + * Restore the state of a HVM guest.
> + *
> + * Copyright (c) 2003, K A Fraser.
> + * Copyright (c) 2006 Intel Corporation
> + * rewritten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
with
> + * this program; if not, write to the Free Software Foundation, Inc., 59
Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <stdlib.h>
> +#include <unistd.h>
> +
> +#include "xg_private.h"
> +#include "xg_save_restore.h"
> +
> +#include <xen/hvm/ioreq.h>
> +#include <xen/hvm/params.h>
> +#include <xen/hvm/e820.h>
> +
> +/* max mfn of the whole machine */
> +static unsigned long max_mfn;
> +
> +/* virtual starting address of the hypervisor */
> +static unsigned long hvirt_start;
> +
> +/* #levels of page tables used by the current guest */
> +static unsigned int pt_levels;
> +
> +/* total number of pages used by the current guest */
> +static unsigned long max_pfn;
> +
> +/* A table mapping each PFN to its new MFN. */
> +static xen_pfn_t *p2m = NULL;
> +
> +static ssize_t
> +read_exact(int fd, void *buf, size_t count)
> +{
> + int r = 0, s;
> + unsigned char *b = buf;
> +
> + while (r < count) {
> + s = read(fd, &b[r], count - r);
> + if ((s == -1) && (errno == EINTR))
> + continue;
> + if (s <= 0) {
> + break;
> + }
> + r += s;
> + }
> +
> + return (r == count) ? 1 : 0;
> +}
> +
> +int xc_hvm_restore(int xc_handle, int io_fd,
> + uint32_t dom, unsigned long nr_pfns,
> + unsigned int store_evtchn, unsigned long *store_mfn,
> + unsigned int console_evtchn, unsigned long
*console_mfn,
> + unsigned int pae, unsigned int apic)
> +{
> + DECLARE_DOMCTL;
> +
> + /* The new domain's shared-info frame number. */
> + unsigned long shared_info_frame;
> +
> + /* A copy of the CPU context of the guest. */
> + vcpu_guest_context_t ctxt;
> +
> + char *region_base;
> +
> + xc_mmu_t *mmu = NULL;
> +
> + xc_dominfo_t info;
> + unsigned int rc = 1, i;
> + uint32_t rec_len, nr_vcpus;
> + hvm_domain_context_t hvm_ctxt;
> + unsigned long long v_end, memsize;
> + unsigned long shared_page_nr;
> +
> + /* hvm guest mem size (Mb) */
> + memsize = (unsigned long long)*store_mfn;
> + v_end = memsize << 20;
> +
> + DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d,
*store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n",
> + dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn,
*console_mfn, pae, apic);
> +
> +
> +
> + /*XXX: calculate the VGA hole, it's better derived from memsize*/
> + max_pfn = nr_pfns + 0x20;
> +
> + if(!get_platform_info(xc_handle, dom,
> + &max_mfn, &hvirt_start, &pt_levels))
{
> + ERROR("Unable to get platform info.");
> + return 1;
> + }
> +
> + DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx,
hvirt_start=%lx, pt_levels=%d\n",
> + max_pfn,
> + max_mfn,
> + hvirt_start,
> + pt_levels);
> +
> + if (mlock(&ctxt, sizeof(ctxt))) {
> + /* needed for build dom0 op, but might as well do early */
> + ERROR("Unable to mlock ctxt");
> + return 1;
> + }
> +
> +
> + p2m = malloc(max_pfn * sizeof(xen_pfn_t));
> +
> + if (p2m == NULL) {
> + ERROR("memory alloc failed");
> + errno = ENOMEM;
> + goto out;
> + }
> +
> + /* Get the domain's shared-info frame. */
> + domctl.cmd = XEN_DOMCTL_getdomaininfo;
> + domctl.domain = (domid_t)dom;
> + if (xc_domctl(xc_handle, &domctl) < 0) {
> + ERROR("Could not get information on new domain");
> + goto out;
> + }
> + shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
> +
> + if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
> + errno = ENOMEM;
> + goto out;
> + }
> +
> + for ( i = 0; i < max_pfn; i++ )
> + p2m[i] = i;
> + for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn;
i++ )
> + p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
> +
> + /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
*/
> + rc = xc_domain_memory_populate_physmap(
> + xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
> + 0, 0, &p2m[0x00]);
> + if ( (rc == 0) && (max_pfn > 0xc0) )
> + rc = xc_domain_memory_populate_physmap(
> + xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
> + if ( rc != 0 )
> + {
> + PERROR("Could not allocate memory for HVM guest.\n");
> + goto out;
> + }
> +
> +
> + /**********XXXXXXXXXXXXXXXX******************/
> + if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
> + ERROR("Could not get domain info");
> + return 1;
> + }
> +
> + domctl.cmd = XEN_DOMCTL_getdomaininfo;
> + domctl.domain = (domid_t)dom;
> + if (xc_domctl(xc_handle, &domctl) < 0) {
> + ERROR("Could not get information on new domain");
> + goto out;
> + }
> +
> + for ( i = 0; i < max_pfn; i++)
> + p2m[i] = i;
> +
> + /* restore memory */
> + if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ |
PROT_WRITE, p2m, max_pfn) ) == 0) {
> + ERROR("HVM:map page_array failed!\n");
> + goto out;
> + }
> +
> + for (i = 0; i < max_pfn; i++) {
> + void *zpage = region_base + i * PAGE_SIZE;
> + if ( p2m[i] == (~0UL)) { /*invalid mfn*/
> + continue;
> + }
> + if (i >= 0xa0 && i < 0xc0) {
> + continue;
> + }
> +
> + if (!read_exact(io_fd, zpage, PAGE_SIZE)) {
> + ERROR("HVM:read page %d failed!\n", i);
> + goto out;
> + }
> + }
> +
> + (void)munmap(region_base, max_pfn*PAGE_SIZE);
> +
> +
> +/* xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/
> + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
> +
> + if ( v_end > HVM_BELOW_4G_RAM_END )
> + shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
> + else
> + shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
> +
> + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
shared_page_nr-2);
> + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
> +
> + /* calculate the store_mfn; a wrong value causes a hang in introduceDomain */
> + *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2];
> + DPRINTF("hvm restore:calculate new
store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end);
> +
> + /* restore hvm context including pic/pit/shpage */
> + if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
> + ERROR("error read hvm context size!\n");
> + goto out;
> + }
> + if (rec_len != sizeof(hvm_ctxt)) {
> + ERROR("hvm context size dismatch!\n");
> + goto out;
> + }
> +
> + if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
> + ERROR("error read hvm context!\n");
> + goto out;
> + }
> +
> + if (( rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
> + ERROR("error set hvm context!\n");
> + goto out;
> + }
> +
> + if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
> + ERROR("error read nr vcpu !\n");
> + goto out;
> + }
> + DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
> +
> + for (i =0; i < nr_vcpus; i++) {
> + if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
> + ERROR("error read vcpu context size!\n");
> + goto out;
> + }
> + if (rec_len != sizeof(ctxt)) {
> + ERROR("vcpu context size dismatch!\n");
> + goto out;
> + }
> +
> + if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
> + ERROR("error read vcpu context.\n");
> + goto out;
> + }
> +
> + if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
> + ERROR("Could not set vcpu context, rc=%d", rc);
> + goto out;
> + }
> + }
> +
> + rc = 0;
> + goto out;
> +
> + out:
> + if ( (rc != 0) && (dom != 0) )
> + xc_domain_destroy(xc_handle, dom);
> + free(mmu);
> + free(p2m);
> +
> + DPRINTF("Restore exit with rc=%d\n", rc);
> +
> + return rc;
> +}
> diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_save.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/tools/libxc/xc_hvm_save.c Wed Dec 13 22:52:02 2006 +0800
> @@ -0,0 +1,248 @@
>
+/******************************************************************************
> + * xc_hvm_save.c
> + *
> + * Save the state of a running HVM guest.
> + *
> + * Copyright (c) 2003, K A Fraser.
> + * Copyright (c) 2006 Intel Corporation
> + * rewritten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
with
> + * this program; if not, write to the Free Software Foundation, Inc., 59
Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <inttypes.h>
> +#include <time.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/time.h>
> +
> +#include "xc_private.h"
> +#include "xg_private.h"
> +#include "xg_save_restore.h"
> +
> +#define DEF_MAX_ITERS (4 - 1) /* limit us to 4 times round loop */
> +#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
> +
> +/* max mfn of the whole machine */
> +static unsigned long max_mfn;
> +
> +/* virtual starting address of the hypervisor */
> +static unsigned long hvirt_start;
> +
> +/* #levels of page tables used by the current guest */
> +static unsigned int pt_levels;
> +
> +/* total number of pages used by the current guest */
> +static unsigned long max_pfn;
> +
> +#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
> +
> +int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t
max_iters,
> + uint32_t max_factor, uint32_t flags, int
(*suspend)(int))
> +{
> + xc_dominfo_t info;
> +
> + int rc = 1, i;
> + int live = (flags & XCFLAGS_LIVE);
> + int debug = (flags & XCFLAGS_DEBUG);
> +
> + /* The new domain's shared-info frame number. */
> + unsigned long shared_info_frame;
> +
> + /* A copy of the CPU context of the guest. */
> + vcpu_guest_context_t ctxt;
> +
> + /* A copy of hvm domain context */
> + hvm_domain_context_t hvm_ctxt;
> +
> + /* Live mapping of shared info structure */
> + shared_info_t *live_shinfo = NULL;
> +
> + /* base of the region in which domain memory is mapped */
> + unsigned char *region_base = NULL;
> +
> + uint32_t nr_pfns, max_pfns, rec_size, nr_vcpus;
> + unsigned long *page_array;
> +
> + DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d,
flags=0x%x.\n",
> + dom, max_iters, max_factor, flags);
> +
> + /* If no explicit control parameters given, use defaults */
> + if(!max_iters)
> + max_iters = DEF_MAX_ITERS;
> + if(!max_factor)
> + max_factor = DEF_MAX_FACTOR;
> +
> +/* initialize_mbit_rate();*/
> +
> + if(!get_platform_info(xc_handle, dom,
> + &max_mfn, &hvirt_start, &pt_levels))
{
> + ERROR("HVM:Unable to get platform info.");
> + return 1;
> + }
> +
> + if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
> + ERROR("HVM:Could not get domain info");
> + return 1;
> + }
> + nr_vcpus = info.nr_online_vcpus;
> +
> + if (mlock(&ctxt, sizeof(ctxt))) {
> + ERROR("HVM:Unable to mlock ctxt");
> + return 1;
> + }
> +
> + /* Only have to worry about vcpu 0 even for SMP */
> + if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
> + ERROR("HVM:Could not get vcpu context");
> + goto out;
> + }
> + shared_info_frame = info.shared_info_frame;
> +
> + /* A cheesy test to see whether the domain contains valid state. */
> + if (ctxt.ctrlreg[3] == 0)
> + {
> + ERROR("Domain is not in a valid HVM guest state");
> + goto out;
> + }
> +
> + /* cheesy sanity check */
> + if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
> + ERROR("Invalid HVM state record -- pfn count out of range:
%lu",
> + (info.max_memkb >> (PAGE_SHIFT - 10)));
> + goto out;
> + }
> +
> + /* Map the shared info frame */
> + if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
> + PROT_READ,
shared_info_frame))) {
> + ERROR("HVM:Couldn''t map live_shinfo");
> + goto out;
> + }
> +
> + max_pfn = live_shinfo->arch.max_pfn;
> +
> + DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx,
nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages);
> +
> + if (live) {
> + ERROR("hvm domain doesn''t support live migration
now.\n");
> + if (debug)
> + ERROR("hvm domain debug on.\n");
> + goto out;
> + }
> +
> + /* suspend hvm domain */
> + if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
&ctxt)) {
> + ERROR("HVM Domain appears not to have suspended");
> + goto out;
> + }
> +
> + nr_pfns = info.nr_pages;
> + DPRINTF("after suspend hvm domain nr_pages=0x%x,
max_memkb=0x%lx.\n", nr_pfns, info.max_memkb);
> +
> + /*XXX: calculate the VGA hole*/
> + max_pfns = nr_pfns + 0x20;
> +
> + /* get all the HVM domain pfns */
> + if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) *
max_pfns)) == NULL) {
> + ERROR("HVM:malloc fail!\n");
> + goto out;
> + }
> +
> + for ( i = 0; i < max_pfns; i++)
> + page_array[i] = i;
> +
> + if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ |
PROT_WRITE, page_array, max_pfns) ) == 0) {
> + ERROR("HVM domain map pages failed!\n");
> + goto out;
> + }
> +
> +
> + /* Start writing out the saved-domain record. begin with mem */
> + if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) {
> + ERROR("write: nr_pfns");
> + goto out;
> + }
> +
> + for (i = 0; i < max_pfns; i++) {
> + int ret;
> + void *zpage = region_base + i * PAGE_SIZE;
> + if ( page_array[i] == (~0UL)) {
> + continue;
> + }
> + if (i >= 0xa0 && i < 0xc0) {
> + continue;
> + }
> +
> + if ((ret = ratewrite(io_fd, zpage, PAGE_SIZE)) != PAGE_SIZE) {
> + ERROR("HVM:read page %d failed, mfn=0x%lx.\n", i,
page_array[i]);
> + goto out;
> + }
> + }
> +
> + /* save hvm hypervisor state including pic/pit/shpage */
> + if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
> + ERROR("Unable to mlock ctxt");
> + return 1;
> + }
> +
> + if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){
> + ERROR("HVM:Could not get hvm context");
> + goto out;
> + }
> +
> + rec_size = sizeof(hvm_ctxt);
> + if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
> + ERROR("error write hvm ctxt size");
> + goto out;
> + }
> +
> + if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
> + ERROR("write HVM info failed!\n");
> + }
> +
> + /* save vcpu/vmcs context */
> + if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
> + ERROR("error write nr vcpus");
> + goto out;
> + }
> +
> + /*XXX: need a online map to exclude down cpu */
> + for (i = 0; i < nr_vcpus; i++) {
> +
> + if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
> + ERROR("HVM:Could not get vcpu context");
> + goto out;
> + }
> +
> + rec_size = sizeof(ctxt);
> + DPRINTF("write %d vcpucontext of total %d.\n", i,
nr_vcpus);
> + if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
> + ERROR("error write vcpu ctxt size");
> + goto out;
> + }
> +
> + if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
> + ERROR("write vmcs failed!\n");
> + goto out;
> + }
> + }
> +
> + /* Success! */
> + rc = 0;
> +
> + out:
> + return !!rc;
> +}
> diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/vmcs_data.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/public/vmcs_data.h Wed Dec 13 22:52:02 2006 +0800
> @@ -0,0 +1,68 @@
> +/******************************************************************************
> + * vmcs_data.h
> + *
> + * Copyright (c) 2006 Intel Corporation
> + *
> + */
> +
> +#ifndef __XEN_PUBLIC_VMCS_DATA_H__
> +#define __XEN_PUBLIC_VMCS_DATA_H__
> +
> +/*
> + * World vmcs state: one flat record of fixed-width uint32_t/uint64_t fields.
> + */
> +struct vmcs_data {
> + uint64_t eip; /* execution pointer */
> + uint64_t esp; /* stack pointer */
> + uint64_t eflags; /* flags register */
> + uint64_t cr0; /* presumably control register 0 -- TODO confirm */
> + uint64_t cr3; /* page table directory */
> + uint64_t cr4; /* presumably control register 4 -- TODO confirm */
> + uint32_t idtr_limit; /* idt; NOTE(review): 32-bit field ahead of a 64-bit one, check padding across ABIs */
> + uint64_t idtr_base;
> + uint32_t gdtr_limit; /* gdt; NOTE(review): same 32-then-64-bit layout as idtr_limit */
> + uint64_t gdtr_base;
> + uint32_t cs_sel; /* cs selector */
> + uint32_t cs_limit;
> + uint64_t cs_base;
> + uint32_t cs_arbytes; /* presumably access-rights bytes -- TODO confirm */
> + uint32_t ds_sel; /* ds selector */
> + uint32_t ds_limit;
> + uint64_t ds_base;
> + uint32_t ds_arbytes;
> + uint32_t es_sel; /* es selector */
> + uint32_t es_limit;
> + uint64_t es_base;
> + uint32_t es_arbytes;
> + uint32_t ss_sel; /* ss selector */
> + uint32_t ss_limit;
> + uint64_t ss_base;
> + uint32_t ss_arbytes;
> + uint32_t fs_sel; /* fs selector */
> + uint32_t fs_limit;
> + uint64_t fs_base;
> + uint32_t fs_arbytes;
> + uint32_t gs_sel; /* gs selector */
> + uint32_t gs_limit;
> + uint64_t gs_base;
> + uint32_t gs_arbytes;
> + uint32_t tr_sel; /* task selector */
> + uint32_t tr_limit;
> + uint64_t tr_base;
> + uint32_t tr_arbytes;
> + uint32_t ldtr_sel; /* ldtr selector */
> + uint32_t ldtr_limit;
> + uint64_t ldtr_base;
> + uint32_t ldtr_arbytes;
> + uint32_t sysenter_cs; /* NOTE(review): 32-bit field ahead of 64-bit ones, check padding across ABIs */
> + uint64_t sysenter_esp;
> + uint64_t sysenter_eip;
> + /* msr for em64t */
> + uint64_t shadow_gs;
> + uint64_t flags; /* NOTE(review): bit meanings are not defined in this header */
> + /* same size as VMX_MSR_COUNT */
> + uint64_t msr_items[6]; /* NOTE(review): literal 6 has to be kept in step with VMX_MSR_COUNT by hand */
> + uint64_t vmxassist_enabled; /* NOTE(review): looks like a boolean stored in 64 bits -- confirm */
> +};
> +typedef struct vmcs_data vmcs_data_t; /* shorthand for struct vmcs_data */
> +#endif
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel