This is the main core code for lguest64.
Have fun, and don't hurt the puppies!
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Glauber de Oliveira Costa <glommer@gmail.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Index: work-pv/arch/x86_64/lguest/Makefile
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/Makefile
@@ -0,0 +1,24 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST) += lg.o
+lg-objs := core.o hypervisor.o lguest_user.o hv_vm.o page_tables.o \
+hypercalls.o io.o interrupts_and_traps.o lguest_debug.o
+
+# hypercalls.o page_tables.o interrupts_and_traps.o \
+# segments.o io.o lguest_user.o
+
+# We use the top 4MB for the guest traps page, then the hypervisor.
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+##$(obj)/core.o: $(obj)/hypervisor-blob.c
+### This links the hypervisor in the right place and turns it into a C array.
+##$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+## @$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+##$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+## @od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
Index: work-pv/arch/x86_64/lguest/core.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/core.c
@@ -0,0 +1,379 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <asm/paravirt.h>
+#include <asm/hv_vm.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "lguest.h"
+
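+/* Translate a symbol in this module to its address inside the relocated
+ * hypervisor mapping (lguest_hv_offset is computed in init() below). */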
+#define HV_OFFSET(x) (typeof(x))((unsigned long)(x)+lguest_hv_offset)
+
+unsigned long lguest_hv_addr;
+unsigned long lguest_hv_offset;
+int lguest_hv_pages;
+
+int lguest_vcpu_pages;
+int lguest_vcpu_order;
+
+DEFINE_MUTEX(lguest_lock);
+
+int lguest_address_ok(const struct lguest_guest_info *linfo, u64 addr)
+{
+ return addr / PAGE_SIZE < linfo->pfn_limit;
+}
+
+u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u8 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u8 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u16 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u16 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u64 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u64 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val)
+{
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || put_user(val, (u64 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+}
+
+void lhread(struct lguest_guest_info *linfo, void *b, u64 addr, unsigned bytes)
+{
+ if (addr + bytes < addr || !lguest_address_ok(linfo, addr+bytes)
+ || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ /* copy_from_user should do this, but as we rely on it... */
+ memset(b, 0, bytes);
+ kill_guest(linfo, "bad read address %llx len %u", addr, bytes);
+ }
+}
+
+void lhwrite(struct lguest_guest_info *linfo, u64 addr, const void *b,
+ unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(linfo, addr+bytes)
+ || copy_to_user((void __user *)addr, b, bytes) != 0)
+ kill_guest(linfo, "bad write address %llx len %u", addr, bytes);
+}
+
+static struct gate_struct *get_idt_table(void)
+{
+ struct desc_ptr idt;
+
+ asm("sidt %0":"=m" (idt));
+ return (void *)idt.address;
+}
+
+static int emulate_insn(struct lguest_vcpu *vcpu)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(vcpu->guest, vcpu->regs.rip);
+
+ if (vcpu->regs.rip < vcpu->guest->page_offset)
+ return 0;
+
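+	/*
+	 * Only port I/O (in/out) instructions are emulated here; anything
+	 * else kills the guest.  "in" reads return all ones since there is
+	 * no real device behind the port.
+	 */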
+ lhread(vcpu->guest, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ printk("physaddr + len: %lx\n",physaddr+insnlen);
+ lhread(vcpu->guest, &insn, physaddr + insnlen, 1);
+ }
+
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ printk("%llx: %02x unimplemented op\n", vcpu->regs.rip, insn);
+ kill_guest_dump(vcpu, "bad op");
+ return 0;
+ }
+ if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ vcpu->regs.rax = 0xFFFFFFFF;
+ else
+ vcpu->regs.rax |= (0xFFFF << shift);
+ }
+ vcpu->regs.rip += insnlen;
+ return 1;
+}
+
+#define SAVE_CR2(cr2) asm volatile ("movq %%cr2, %0" : "=r" (cr2))
+
+static void run_guest_once(struct lguest_vcpu *vcpu)
+{
+ void (*sw_guest)(struct lguest_vcpu *) = HV_OFFSET(&switch_to_guest);
+ unsigned long foo, bar;
+
+ BUG_ON(!vcpu->regs.cr3);
+ BUG_ON(!vcpu->pgdir);
+ BUG_ON(!vcpu->pgdir->pgdir);
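+	/*
+	 * Build an iretq frame on the host stack (SS, RSP, RFLAGS, CS; the
+	 * call pushes RIP) so the hypervisor's return_to_host path can iretq
+	 * straight back here.
+	 */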
+ asm volatile ("pushq %2; pushq %%rsp; pushfq; pushq %3; call *%6;"
+ /* The stack we pushed is off by 8, due to the previous pushq */
+ "addq $8, %%rsp"
+ : "=D"(foo), "=a"(bar)
+ : "i" (__KERNEL_DS), "i" (__KERNEL_CS),
"0" (vcpu), "1"(get_idt_table()),
+ "r" (sw_guest)
+ : "memory", "cc");
+}
+
+/* FIXME: don't know yet the right parameters to put here */
+int run_guest(struct lguest_vcpu *vcpu, char *__user user)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct desc_struct *gdt_table;
+ struct lguest_regs *regs = &vcpu->regs;
+ int ret;
+
+ unsigned long cr2 = 0;
+
+ while (!linfo->dead) {
+
+ if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+
+ if (lguest_debug) {
+ printk("hit trap %lld rip=", regs->trapnum);
+ lguest_print_address(vcpu, regs->rip);
+ printk("calling hypercall %d!\n", (unsigned)regs->rax);
+ }
+
+ regs->trapnum = 255;
+ hypercall(vcpu);
+ if (linfo->dead)
+ lguest_dump_vcpu_regs(vcpu);
+ }
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ maybe_do_interrupt(vcpu);
+
+ try_to_freeze();
+
+ if (linfo->dead)
+ return -1;
+
+
+ local_irq_disable();
+
+ /*
+ * keep a pointer to the host GDT tss address.
+ * Do this after disabling interrupts to make sure we
+ * are on the same CPU.
+ */
+ gdt_table = cpu_gdt(smp_processor_id());
+ vcpu->host_gdt_ptr = (unsigned long)gdt_table;
+ asm volatile ("sidt %0" : "=m"(vcpu->host_idt));
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ if (vcpu->ts)
+ stts();
+
+ run_guest_once(vcpu);
+
+ if (regs->trapnum == 14) {
+ SAVE_CR2(cr2);
+ lgdebug_print("faulting cr2: %lx\n",cr2);
+ }
+
+ else if (regs->trapnum == 7)
+ math_state_restore();
+
+ if (lguest_debug && regs->trapnum < 32) {
+ printk("hit trap %lld rip=", regs->trapnum);
+ lguest_print_address(vcpu, regs->rip);
+ }
+
+ local_irq_enable();
+
+ BUG_ON(regs->trapnum > 0xFF);
+
+ switch (regs->trapnum) {
+ case 7:
+ /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!vcpu->ts)
+ continue;
+ if (reflect_trap(vcpu, 7, 1))
+ continue;
+ kill_guest(vcpu->guest, "Unhandled FPU trap at %#llx",
+ regs->rip);
+ case 13:
+ if (!regs->errcode) {
+ ret = emulate_insn(vcpu);
+ if (ret < 0) {
+ lguest_dump_vcpu_regs(vcpu);
+ return ret;
+ }
+ continue;
+ }
+ kill_guest_dump(vcpu, "took gfp errcode %lld\n",
regs->errcode);
+ lguest_dump_vcpu_regs(vcpu);
+ break;
+ case 14:
+ if (demand_page(vcpu, cr2, regs->errcode & PF_WRITE))
+ continue;
+
+ if (lguest_debug) {
+ printk ("guest taking a page fault\n");
+ lguest_print_page_tables(vcpu->pgdir->pgdir);
+ }
+
+ /* inform guest on the current state of cr2 */
+ put_user(cr2, &linfo->lguest_data->cr2);
+ if (reflect_trap(vcpu, 14, 1))
+ continue;
+
+ lguest_dump_vcpu_regs(vcpu);
+ kill_guest_dump(vcpu, "unhandled page fault at %#lx"
+ " (rip=%#llx, errcode=%#llx)",
+ cr2, regs->rip, regs->errcode);
+ break;
+ case LGUEST_TRAP_ENTRY:
+ /* hypercall! */
+ continue;
+
+ case 32 ... 255:
+ cond_resched();
+ break;
+ default:
+ kill_guest_dump(vcpu, "bad trapnum %lld\n", regs->trapnum);
+ lguest_dump_vcpu_regs(vcpu);
+ return -EINVAL;
+ }
+ }
+ return -ENOENT;
+}
+
+extern long end_hyper_text;
+extern long start_hyper_text;
+
+static int __init init(void)
+{
+ unsigned long pages;
+ unsigned long hvaddr;
+#if 0
+ unsigned long lg_hcall = (unsigned long)HV_OFFSET(&hcall_teste);
+	unsigned long *lg_host_syscall =
+		(unsigned long *)HV_OFFSET(&host_syscall);
+#endif
+ int order;
+ int ret;
+
+ int i;
+ printk("start_hyper_text=%p\n",&start_hyper_text);
+ printk("end_hyper_text=%p\n",&end_hyper_text);
+ printk("default_idt_entries=%p\n",&_lguest_default_idt_entries);
+ printk("sizeof(vcpu)=%ld\n",sizeof(struct lguest_vcpu));
+
+ pages = (sizeof(struct lguest_vcpu)+(PAGE_SIZE-1))/PAGE_SIZE;
+ for (order = 0; (1<<order) < pages; order++)
+ ;
+
+ lguest_vcpu_pages = pages;
+ lguest_vcpu_order = order;
+
+ ret = paravirt_enabled();
+ if (ret < 0)
+ return -EPERM;
+
+ ret = lguest_device_init();
+ if (ret < 0) {
+ return ret;
+ }
+
+ pages = (unsigned long)&end_hyper_text -
+ (unsigned long)&start_hyper_text;
+ pages = (pages + (PAGE_SIZE - 1)) / PAGE_SIZE;
+
+ ret = hvvm_map_pages(&start_hyper_text, pages, &hvaddr);
+ if (ret < 0)
+ goto out;
+ printk("hvaddr=%lx\n",hvaddr);
+
+ lguest_hv_addr = hvaddr;
+ lguest_hv_pages = pages;
+ lguest_hv_offset = hvaddr - (unsigned long)&start_hyper_text;
+
+ /* Setup LGUEST segments on all cpus */
+ for_each_possible_cpu(i) {
+ struct desc_struct *gdt_table;
+ gdt_table = cpu_gdt(i);
+ gdt_table[GDT_ENTRY_HV_CS] = gdt_table[gdt_index(__KERNEL_CS)];
+ gdt_table[GDT_ENTRY_HV_DS] = gdt_table[gdt_index(__KERNEL_DS)];
+ }
+
+// rdmsrl(MSR_LSTAR, *lg_host_syscall);
+// wrmsrl(MSR_LSTAR, lg_hcall);
+ return 0;
+#if 0
+ ret = init_pagetables(hvaddr);
+ if (ret < 0)
+ goto out2;
+
+ return 0;
+
+out2:
+	hvvm_unmap_pages(hvaddr, pages);
+#endif
+out:
+ lguest_device_remove();
+ return ret;
+}
+
+
+static void __exit fini(void)
+{
+#if 0
+	unsigned long *lg_host_syscall =
+		(unsigned long *)HV_OFFSET(&host_syscall);
+
+ wrmsrl(MSR_LSTAR, *lg_host_syscall);
+#endif
+ hvvm_release_all();
+ lguest_device_remove();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
Index: work-pv/arch/x86_64/lguest/hypercalls.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/hypercalls.c
@@ -0,0 +1,324 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@redhat.com>
+ Steven Rostedt <srostedt@redhat.com>
+ Red Hat Inc
+ Standing on the shoulders of Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/msr.h>
+#include "lguest.h"
+
+/* FIXME: add this to Kconfig */
+#define CONFIG_LGUEST_DEBUG 1
+
+static void guest_set_stack(struct lguest_vcpu *vcpu,
+ u64 rsp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if (pages > 2)
+ kill_guest_dump(vcpu, "bad stack pages %u", pages);
+ vcpu->tss.rsp2 = rsp;
+ /* FIXME */
+// lg->stack_pages = pages;
+// pin_stack_pages(lg);
+}
+
+static DEFINE_MUTEX(hcall_print_lock);
+#define HCALL_PRINT_SIZ 1024
+static char hcall_print_buf[HCALL_PRINT_SIZ];
+
+/* Return true if DMA to host userspace now pending. */
+static int do_hcall(struct lguest_vcpu *vcpu)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ struct lguest_guest_info *linfo = vcpu->guest;
+ unsigned long val;
+	long ret;	/* signed: strncpy_from_user() can return -EFAULT */
+
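+	/* Hypercall ABI: rax holds the call number, rdx/rbx/rcx the arguments. */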
+ switch (regs->rax) {
+ case LHCALL_PRINT:
+ mutex_lock(&hcall_print_lock);
+ ret = strncpy_from_user(hcall_print_buf,
+ (const char __user *)regs->rdx,
+ HCALL_PRINT_SIZ);
+ if (ret < 0) {
+ kill_guest_dump(vcpu,
+ "bad hcall print pointer (%llx)",
+ regs->rdx);
+ mutex_unlock(&hcall_print_lock);
+ return -EFAULT;
+ }
+ printk("LGUEST: %s", hcall_print_buf);
+ mutex_unlock(&hcall_print_lock);
+
+ break;
+ case LHCALL_FLUSH_ASYNC:
+ break;
+ case LHCALL_LGUEST_INIT:
+ kill_guest_dump(vcpu, "already have lguest_data");
+ break;
+ case LHCALL_RDMSR:
+ switch (regs->rdx) {
+ case MSR_KERNEL_GS_BASE:
+ val = (vcpu->guest_gs_shadow_a & ((1UL << 32)-1)) |
+ (vcpu->guest_gs_shadow_d << 32);
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ case MSR_GS_BASE:
+ val = (vcpu->guest_gs_a & ((1UL << 32)-1)) |
+ (vcpu->guest_gs_d << 32);
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ case MSR_FS_BASE:
+ lhwrite_u64(vcpu, regs->rbx, 0);
+ break;
+ case MSR_EFER:
+ val = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ default:
+ kill_guest_dump(vcpu, "bad read of msr %llx\n", regs->rdx);
+ }
+ break;
+ case LHCALL_WRMSR:
+ switch (regs->rdx) {
+ case MSR_KERNEL_GS_BASE:
+ if ((regs->rbx >= HVVM_START) &&
+ (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) {
+ kill_guest_dump(vcpu,
+ "guest trying to set GS shadow base"
+ " in hypervisor");
+ break;
+ }
+ vcpu->guest_gs_shadow_a = regs->rbx;
+ vcpu->guest_gs_shadow_d = regs->rbx >> 32;
+ break;
+ case MSR_GS_BASE:
+ if ((regs->rbx >= HVVM_START) &&
+ (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) {
+ kill_guest_dump(vcpu,
+ "guest trying to set GS base in hypervisor");
+ break;
+ }
+ vcpu->guest_gs_a = regs->rbx;
+ vcpu->guest_gs_d = regs->rbx >> 32;
+ break;
+ case MSR_FS_BASE:
+ /* always zero */
+ break;
+ default:
+ kill_guest(linfo, "bad write to msr %llx\n", regs->rdx);
+ }
+ break;
+ case LHCALL_SET_PMD:
+ guest_set_pmd(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PUD:
+ guest_set_pud(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PGD:
+ guest_set_pgd(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PTE:
+ guest_set_pte(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+
+ case LHCALL_FLUSH_TLB_SIG:
+ guest_flush_tlb_single(vcpu, regs->rdx, regs->rbx);
+ break;
+ case LHCALL_FLUSH_TLB:
+ if (regs->rdx)
+ guest_pagetable_clear_all(vcpu);
+ else
+ guest_pagetable_flush_user(vcpu);
+ break;
+
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(vcpu, regs->rdx);
+ break;
+
+ case LHCALL_CRASH: {
+ char msg[128];
+ lhread(linfo, msg, regs->rdx, sizeof(msg));
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest_dump(vcpu, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_LOAD_GDT:
+		/* i386 does a lot of GDT reloads; we don't.  We may want to
+		 * support this in the future for stranger code paths, but
+		 * not now. */
+ return -ENOSYS;
+
+ case LHCALL_LOAD_IDT_ENTRY: {
+		struct gate_struct g;
+ if (regs->rdx > 0xFF) {
+ kill_guest(linfo, "There are just 255 idt entries."
+ "What are you trying to do??");
+ }
+ lhread(linfo, &g, regs->rbx, sizeof(g));
+ load_guest_idt_entry(vcpu, regs->rdx,&g);
+ break;
+ }
+ case LHCALL_SET_STACK:
+ guest_set_stack(vcpu, regs->rdx, regs->rbx);
+ break;
+ case LHCALL_TS:
+ vcpu->ts = regs->rdx;
+ break;
+ case LHCALL_TIMER_READ: {
+ u32 now = jiffies;
+ mb();
+ regs->rax = now - linfo->last_timer;
+ linfo->last_timer = now;
+ break;
+ }
+ case LHCALL_TIMER_START:
+ linfo->timer_on = 1;
+ if (regs->rdx != HZ)
+ kill_guest(linfo, "Bad clock speed %lli", regs->rdx);
+ linfo->last_timer = jiffies;
+ break;
+ case LHCALL_HALT:
+ linfo->halted = 1;
+ break;
+ case LHCALL_GET_WALLCLOCK: {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ regs->rax = tv.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ printk("Binding dma....\n");
+ regs->rax = bind_dma(linfo, regs->rdx, regs->rbx,
+ regs->rcx >> 8, regs->rcx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ printk("Sending dma....\n");
+ return send_dma(linfo, regs->rdx, regs->rbx);
+
+ case LHCALL_IRET:
+ guest_iret(vcpu);
+ break;
+#if 0
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, (struct desc_struct __user*)regs->rdx);
+ break;
+#endif
+
+ case LHCALL_DEBUG_ME:
+#ifdef CONFIG_LGUEST_DEBUG
+ lguest_debug = regs->rdx;
+ printk("lguest debug turned %s\n", regs->rdx ? "on" :
"off");
+ lguest_dump_vcpu_regs(vcpu);
+#else
+ {
+ static int once = 1;
+ if (once) {
+ once = 0;
+ printk("lguest debug is disabled, to use this "
+ "please enable CONFIG_LGUEST_DEBUG\n");
+ }
+ }
+#endif
+ break;
+ default:
+ kill_guest(linfo, "Bad hypercall %lli\n", regs->rax);
+ }
+ return 0;
+}
+
+#if 0
+/* We always do queued calls before actual hypercall. */
+int do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i, pending;
+ u8 st[LHCALL_RING_SIZE];
+
+ if (!lg->lguest_data)
+ return 0;
+
+	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ get_user(regs.rax, &lg->lguest_data->hcalls[n].eax);
+ get_user(regs.rdx, &lg->lguest_data->hcalls[n].edx);
+ get_user(regs.rcx, &lg->lguest_data->hcalls[n].ecx);
+ get_user(regs.rbx, &lg->lguest_data->hcalls[n].ebx);
+		pending = do_hcall(lg, &regs);
+ put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+ if (pending)
+ return 1;
+ }
+
+ set_wakeup_process(lg, NULL);
+ return 0;
+}
+#endif
+
+int hypercall(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_regs *regs = &vcpu->regs;
+ int pending;
+
+ if (!linfo->lguest_data) {
+ if (regs->rax != LHCALL_LGUEST_INIT) {
+ kill_guest(linfo, "hypercall %lli before LGUEST_INIT",
+ regs->rax);
+ return 0;
+ }
+
+ linfo->lguest_data = (struct lguest_data __user *)regs->rdx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(linfo, (long)linfo->lguest_data)
+ || !lguest_address_ok(linfo, (long)(linfo->lguest_data+1))){
+ kill_guest(linfo, "bad guest page %p", linfo->lguest_data);
+ return 0;
+ }
+ /* update the page_offset info */
+ get_user(linfo->page_offset, &linfo->lguest_data->page_offset);
+		get_user(linfo->start_kernel_map,
+			 &linfo->lguest_data->start_kernel_map);
+
+#if 0
+ get_user(linfo->noirq_start, &linfo->lguest_data->noirq_start);
+ get_user(linfo->noirq_end, &linfo->lguest_data->noirq_end);
+#endif
+ /* We reserve the top pgd entry. */
+ put_user(4U*1024*1024, &linfo->lguest_data->reserve_mem);
+ put_user(linfo->guest_id, &linfo->lguest_data->guest_id);
+ return 0;
+ }
+ pending = do_hcall(vcpu);
+ //set_wakeup_process(vcpu, NULL);
+ return pending;
+}
Index: work-pv/arch/x86_64/lguest/hypervisor.S
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/hypervisor.S
@@ -0,0 +1,711 @@
+#include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include "lguest.h"
+
+.text
+.align PAGE_SIZE
+
+.global start_hyper_text
+ .type start_hyper_text, @function
+start_hyper_text:
+
+.global host_syscall
+host_syscall:
+ .quad 0
+
+#define PRINT_L(L) \
+ PRINT_OUT($L)
+
+#define PRINT_N(n) \
+ PRINT_OUT($'0' + $n)
+
+#define PRINT_HEX(n) \
+ mov n, %cl; \
+ and $0xf, %cl; \
+ cmp $0xa, %cl; \
+ jge 11f; \
+ add $'0', %cl; \
+ jmp 12f; \
+11: add $('a' - 10), %cl; \
+12: PRINT_OUT(%cl);
+
+#define PRINT_NUM_BX \
+9: PRINT_HEX(%bl); \
+ shr $4, %rbx; \
+ jne 9b
+
+#define PRINT_NUM(n) \
+ movl $n, %ebx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_LONG(n) \
+ movl n, %ebx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_QUAD(n) \
+ movq n, %rbx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_X \
+ PRINT_L('x')
+
+#define PRINT_OUT(x) \
+ mov $0x3f8, %esi; \
+21: lea 0x5(%esi), %edx; \
+ movzwl %dx, %edx; \
+ in (%dx), %al; \
+ test $0x20,%al; \
+ jne 22f; \
+ pause; \
+ jmp 21b; \
+22: \
+ movl %esi, %edx; \
+ movzwl %dx, %edx; \
+ mov x, %al; \
+ out %al, (%dx); \
+31: \
+ lea 0x5(%esi), %edx; \
+ movzwl %dx, %edx; \
+ in (%dx), %al; \
+ test $0x20,%al; \
+ jne 32f; \
+ pause; \
+ jmp 31b; \
+32: \
+
+#define PUSH_NUM \
+ pushq %rcx; \
+ pushq %rbx;
+
+#define POP_NUM \
+	popq %rbx; \
+	popq %rcx;
+
+#define PUSH_PRINT \
+ pushq %rsi; \
+ pushq %rdx; \
+ pushq %rax; \
+
+#define POP_PRINT \
+ popq %rax; \
+ popq %rdx; \
+ popq %rsi;
+
+#define S_PRINT_NUM(_n) \
+ PUSH_PRINT; \
+ PUSH_NUM; \
+ PRINT_NUM(_n); \
+ POP_NUM; \
+ POP_PRINT;
+
+#define S_PRINT_L(x) \
+ PUSH_PRINT; \
+ PRINT_L(x); \
+ POP_PRINT;
+
+#define S_PRINT_QUAD(_n) \
+ PUSH_PRINT; \
+ PUSH_NUM; \
+ PRINT_QUAD(_n); \
+ POP_NUM; \
+ POP_PRINT;
+
+/* Save registers on the current stack. Both for
+ * switch_to_guest and switch_to_host usage */
+#define SAVE_REGS \
+ /* Save old guest/host state */ \
+ pushq %fs; \
+ pushq %rax; \
+ pushq %r15; \
+ pushq %r14; \
+ pushq %r13; \
+ pushq %r12; \
+ pushq %r11; \
+ pushq %r10; \
+ pushq %r9; \
+ pushq %r8; \
+ pushq %rbp; \
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rdx; \
+ pushq %rcx; \
+ pushq %rbx; \
+
+#define RESTORE_REGS \
+	/* Restore old guest/host state */ \
+ popq %rbx; \
+ popq %rcx; \
+ popq %rdx; \
+ popq %rsi; \
+ popq %rdi; \
+ popq %rbp; \
+ popq %r8; \
+ popq %r9; \
+ popq %r10; \
+ popq %r11; \
+ popq %r12; \
+ popq %r13; \
+ popq %r14; \
+ popq %r15; \
+ popq %rax; \
+ popq %fs; \
+
+.macro dump_stack_regs PREFIX
+ movq $LGUEST_REGS_size, %r10
+ xorq %r11, %r11
+1: PRINT_L(\PREFIX);
+ movq %r11, %rbx;
+ PRINT_NUM_BX;
+ PRINT_L(':'); PRINT_L(' ');
+ movq %rsp, %r9
+ addq %r11, %r9
+ PRINT_QUAD((%r9))
+ addq $8, %r11
+ cmp %r11, %r10
+ ja 1b
+.endm
+
+.macro debugme VCPU C
+ testb $1,LGUEST_VCPU_debug(\VCPU)
+ jz 23f
+ PRINT_L(\C)
+23:
+.endm
+
+
+#if 0
+.global hcall_teste
+ .type hcall_teste, @function
+hcall_teste:
+ cmpq $0, %gs:pda_vcpu
+ jne handle_guest
+ jmp *host_syscall
+handle_guest:
+ /* SAVE_REGS maybe it is not the macro we want */
+ #cmpq $__PAGE_OFFSET, %rcx;
+ jb do_hypercall
+ movq %gs:pda_vcpu, %rcx;
+ movq LGUEST_VCPU_guest_syscall(%rcx), %rcx;
+#endif
+
+/**
+ * DECODE_IDT parse a IDT descriptor to find the target.
+ * @IDT - The register that holds the IDT descriptor location
+ * @IDTWORD - The word version of the IDT register
+ * (ie. IDT is %rax, then IDTWORD must be %ax)
+ * @RESULT - The register to place the result in.
+ *
+ * This clobbers both IDT and RESULT regs.
+ */
+.macro DECODE_IDT IDT IDTWORD RESULT
+ movzwq (\IDT), \RESULT
+ movq 4(\IDT), \IDT
+ xorw \IDTWORD, \IDTWORD
+ orq \IDT, \RESULT
+.endm
+
+/**
+ * DECODE_SSEG parse a System Segment descriptor to find the target.
+ * @SEG - The register that holds the Sys Seg descriptor location
+ * @RESULT - The register to place the result in.
+ * @RW - The word version of the RESULT register
+ * @RH - The high byte version of the RESULT register
+ *
+ * (ie. RESULT is %rax, then RW must be %ax and RH must be %ah)
+ *
+ * This clobbers both SEG and RESULT regs.
+ */
+/* Why does Intel need to make everything so darn complex! */
+.macro DECODE_SSEG SEG RESULT RW RH
+ movzbq 7(\SEG), \RESULT
+ shl $16, \RESULT
+ movb 4(\SEG), \RH
+ shl $8, \RESULT
+ movw 2(\SEG), \RW
+ movq 8(\SEG), \SEG
+ shlq $32, \SEG
+ orq \SEG, \RESULT
+.endm
+
+.global switch_to_guest
+ .type switch_to_guest, @function
+/* rdi holds the pointer to vcpu.
+ * Interrupts are off on entry */
+switch_to_guest:
+ SAVE_REGS
+ /* save host stack */
+ movq %rsp, LGUEST_VCPU_host_stack(%rdi)
+ /* put the guest's stack in */
+ movq %rdi, %rsp
+ /* move the stack to point to guest regs */
+ addq $LGUEST_VCPU_regs, %rsp
+	/* filling this pointer signals that we're
+	 * running guest code */
+ movq %rdi, %gs:pda_vcpu
+
+ /* save this host's gdt and idt */
+ sgdt LGUEST_VCPU_host_gdt(%rdi)
+ sidt LGUEST_VCPU_host_idt(%rdi)
+
+ /* Save the gs base of the host (for nmi use) */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_host_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_host_gs_d(%rdi)
+
+ /* Save the host proc gs pointer */
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_host_proc_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_host_proc_gs_d(%rdi)
+
+ /* save the hosts page tables */
+ movq %cr3, %rax
+ movq %rax, LGUEST_VCPU_host_cr3(%rdi)
+
+ /*
+ * The NMI is a big PITA. There's no way to atomically load the
+ * TSS and IDT, so we can't just switch to the guest TSS without
+ * causing a race condition with the NMI.
+ * So we set up the host NMI stack in the guest TSS IST so that
+ * in case we take an NMI after loading our TR register
+ * but before we've updated the lidt, we still have a valid
+ * stack for the host nmi handler to use.
+ */
+ /* Load the guest gdt */
+ lgdt LGUEST_VCPU_gdt(%rdi)
+
+ /* Switch to guest's TSS (before loading the idt) */
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+
+ /* Set host's TSS to available (clear byte 5 bit 2). */
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax)
+
+ /* Now load the guest idt */
+ lidt LGUEST_VCPU_idt(%rdi)
+
+ /* Load the guest gs pointer */
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ movq LGUEST_VCPU_guest_gs_a(%rdi), %rax
+ movq LGUEST_VCPU_guest_gs_d(%rdi), %rdx
+ wrmsr
+
+ /* Flush the TLB */
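+	/* (toggling CR4.PGE, bit 7, also drops global TLB entries) */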
+ movq %cr4, %rax
+ movq %rax, %rbx
+ andb $~(1<<7), %al
+ movq %rax, %cr4
+ movq %rbx, %cr4
+
+ /* switch to the guests page tables */
+ popq %rax
+ movq %rax, %cr3
+
+ /* Now we swap gs to the guest gs base */
+ swapgs
+
+ /* restore guest registers */
+ RESTORE_REGS
+ /* skip trapnum and errorcode */
+ addq $0x10, %rsp;
+ iretq
+
+.macro print_trap VCPU REG
+ movq LGUEST_VCPU_trapnum(\VCPU), \REG
+ PRINT_QUAD(\REG)
+.endm
+
+#define SWITCH_TO_HOST \
+ SAVE_REGS; \
+ /* Save old pgdir */ \
+ movq %cr3, %rax; \
+ pushq %rax; \
+ /* Point rdi to the vcpu struct */ \
+ movq %rsp, %rdi; \
+ subq $LGUEST_VCPU_regs, %rdi; \
+ /* Load lguest ds segment for convenience. */ \
+ movq $(__HV_DS), %rax; \
+ movq %rax, %ds; \
+ /* Load the host page tables since that's where the gdt is */ \
+ movq LGUEST_VCPU_host_cr3(%rdi), %rax; \
+ movq %rax, %cr3; \
+ /* Switch to hosts gdt */ \
+ lgdt LGUEST_VCPU_host_gdt(%rdi); \
+ /* Set guest's TSS to available (clear byte 5 bit 2). */ \
+ movq LGUEST_VCPU_vcpu(%rdi), %rax; \
+ andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rax); \
+ /* Swap back to the host PDA */ \
+ swapgs; \
+ /* Put back the host process gs as well */ \
+ movl $MSR_KERNEL_GS_BASE,%ecx; \
+ movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ /* With PDA back now switch to host idt */ \
+ lidt LGUEST_VCPU_host_idt(%rdi); \
+ /* Switch to host's TSS. */ \
+ movl $(GDT_ENTRY_TSS*8), %eax; \
+ ltr %ax; \
+ /* put flag down. We're in the host again */ \
+ movq $0, %gs:pda_vcpu; \
+ movq LGUEST_VCPU_host_stack(%rdi), %rsp; \
+ RESTORE_REGS;
+
+/* Return to run_guest_once. */
+return_to_host:
+ SWITCH_TO_HOST
+ iretq
+
+deliver_to_host:
+ SWITCH_TO_HOST
+decode_idt_and_jmp:
+	/* Decode IDT and jump to the host's irq handler. When that does iret, it
+ * will return to run_guest_once. This is a feature. */
+ /* We told gcc we'd clobber rdi and rax... */
+ movq LGUEST_VCPU_trapnum(%rdi), %rdi
+ shl $1, %rdi
+ leaq (%rax,%rdi,8), %rdi
+ DECODE_IDT %rdi %di %rax
+ jmp *%rax
+
+#define NMI_SWITCH_TO_HOST \
+ /* Force switch to host, GDT, CR3, and both GS bases */ \
+ movl $MSR_GS_BASE, %ecx; \
+ movq LGUEST_VCPU_host_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ movl $MSR_KERNEL_GS_BASE, %ecx; \
+ movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ movq LGUEST_VCPU_host_cr3(%rdi), %rax; \
+ movq %rax, %cr3; \
+ lgdt LGUEST_VCPU_host_gdt(%rdi);
+
+#if 0
+ /* Set host's TSS to available (clear byte 5 bit 2). */ \
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax; \
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax); \
+
+#endif
+
+/* Used by NMI only */
+/*
+ * The NMI is special because it uses its own stack, and needs to
+ * find the vcpu struct differently.
+ */
+nmi_trampoline:
+	/* the NMI has its own stack */
+ SAVE_REGS
+
+ /* save the cr3 */
+ movq %cr3, %rax
+ pushq %rax
+
+ /* get the vcpu struct */
+ movq %rsp, %rdi
+ subq $LGUEST_VCPU_nmi_stack_end, %rdi
+ addq $LGUEST_REGS_size, %rdi /* compensate for saved regs */
+
+ /* compensate if our end pointer is not 16 bytes aligned */
+ movq $LGUEST_VCPU_nmi_stack_end, %rax
+ andq $0xf, %rax;
+ addq %rax, %rdi;
+
+#if 0 /* in case we want to see where the nmi hit */
+ movq LGUEST_REGS_rip(%rsp), %r8
+ PRINT_L('R')
+ PRINT_QUAD(%r8)
+#endif
+
+ /*
+ * All guest descriptors are above the HV text code (here!)
+ * If we hit the suspected NMI race, our stack will be the host
+ * kernel stack, and that is in lower address space than the HV.
+ * So test to see if we are screwed. Don't do anything, but just
+ * report it!
+ */
+ call 1f
+1:
+ movq 0(%rsp), %rax /* put this RIP into rax */
+ /* If rsp >= rax; jmp */
+ cmpq %rax, %rsp
+ jge 1f
+
+	PRINT_L('H'); PRINT_L('i'); PRINT_L('t'); PRINT_L(' ');
+	PRINT_L('N'); PRINT_L('M'); PRINT_L('I'); PRINT_L(' ');
+	PRINT_L('r'); PRINT_L('a'); PRINT_L('c'); PRINT_L('e');
+ PRINT_L('\n'); PRINT_L('\r');
+
+1:
+ /* put back the stack from the previous call */
+ addq $8, %rsp
+
+ /*
+ * If we take another NMI while saving, we need to start over
+ * and try again. It's OK as long as we don't overwrite
+ * the saved material.
+ */
+ testq $1,LGUEST_VCPU_nmi_sw(%rdi)
+ jnz 1f
+
+ /* Copy the saved regs */
+ cld
+ movq %rdi, %rbx /* save off vcpu struct */
+ leaq LGUEST_VCPU_nmi_regs(%rdi), %rdi
+ leaq 0(%rsp), %rsi
+ movq $(LGUEST_REGS_size/8), %rcx
+ rep movsq
+
+ movq %rbx, %rdi /* put back vcpu struct */
+
+ /* save the gs base and shadow */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_nmi_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_nmi_gs_d(%rdi)
+
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_nmi_gs_shadow_a(%rdi)
+ movq %rdx, LGUEST_VCPU_nmi_gs_shadow_d(%rdi)
+
+ /* save the gdt */
+ sgdt LGUEST_VCPU_nmi_gdt(%rdi)
+
+ /* set the switch flag to prevent another nmi from saving over this */
+ movq $1, LGUEST_VCPU_nmi_sw(%rdi)
+
+1:
+
+#if 0
+ S_PRINT_L('N')
+ S_PRINT_L('M')
+ S_PRINT_L('I')
+ S_PRINT_L(' ')
+ S_PRINT_L('l')
+ S_PRINT_L('g')
+ S_PRINT_L('u')
+ S_PRINT_L('e')
+ S_PRINT_L('s')
+ S_PRINT_L('t')
+ S_PRINT_L('\n')
+ S_PRINT_L('\r')
+#endif
+ NMI_SWITCH_TO_HOST
+
+ /* we want to come back here on the iret */
+ pushq $__HV_DS
+ /* put the vcpu struct as our stack */
+ pushq %rdi
+ pushfq
+ pushq $__HV_CS
+
+ movq LGUEST_VCPU_host_idt_address(%rdi), %rax
+
+ /* Decode the location of the host NMI handler */
+ leaq 32(%rax), %rbx /* NMI IDT entry */
+ DECODE_IDT %rbx %bx %rax
+
+ callq *%rax
+
+ /*
+ * Back from NMI, stack points to vcpu, and we can take
+ * more NMIs at this point. That's OK, since we only
+ * want to get to the original NMI interruption. We
+ * just restart this restore process. Nested NMIs will
+ * not destroy this data while the nmi_sw flag is set.
+ */
+ movq %rsp, %rdi
+
+ /* restore the cr3 */
+ addq $(LGUEST_VCPU_nmi_regs), %rsp
+ popq %rax
+ movq %rax, %cr3
+
+ /* restore the gdt */
+ lgdt LGUEST_VCPU_nmi_gdt(%rdi)
+
+#if 0 /* print magic */
+ movq LGUEST_VCPU_magic(%rdi), %r8
+ movq $(6*8), %r9
+1: subq $8, %r9
+ movq %r9, %rcx
+ movq %r8, %rbx
+ shr %cl, %rbx
+ PRINT_OUT(%bl)
+ cmp $0, %r9
+ jne 1b
+#endif
+
+ /* make both host and guest TSS available */
+#if 1
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax)
+
+ andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rdi)
+#endif
+
+#if 0
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+#endif
+
+ /* restore the gs base and shadow */
+ movl $MSR_GS_BASE, %ecx
+ movq LGUEST_VCPU_nmi_gs_a(%rdi), %rax
+ movq LGUEST_VCPU_nmi_gs_d(%rdi), %rdx
+ wrmsr
+
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ movq LGUEST_VCPU_nmi_gs_shadow_a(%rdi), %rax
+ movq LGUEST_VCPU_nmi_gs_shadow_d(%rdi), %rdx
+ wrmsr
+
+#if 0
+ PRINT_L('O')
+ PRINT_L('U')
+ PRINT_L('T')
+ PRINT_L('\n')
+ PRINT_L('\r')
+#endif
+
+#if 1
+ /* Flush the TLB */
+ movq %cr4, %rax
+ movq %rax, %rbx
+ andb $~(1<<7), %al
+ movq %rax, %cr4
+ movq %rbx, %cr4
+#endif
+
+ RESTORE_REGS
+
+ /* skip trapnum and errcode */
+ addq $0x10, %rsp
+
+ /*
+ * Careful here, we can't modify any regs anymore
+ * but we now have to zero out the nmi switch flag.
+ * So all the work will be done by the stack pointer.
+ */
+
+#define SW_OFFSET (LGUEST_VCPU_nmi_sw - \
+ (LGUEST_VCPU_nmi_regs + LGUEST_REGS_rip))
+ movq $0, SW_OFFSET(%rsp)
+
+ /* use iret to get back to where we were. */
+ iretq;
+ /* Whoo, all done! */
+
+do_crash:
+ SAVE_REGS
+ movq %cr3, %rax;
+ pushq %rax;
+
+	PRINT_L('C');PRINT_L('r');PRINT_L('a');PRINT_L('s');
+	PRINT_L('h');PRINT_L('i');PRINT_L('n');PRINT_L('g');
+ PRINT_L('\n');PRINT_L('\r');
+
+ dump_stack_regs 'S'
+
+ addq $16, %rsp
+ sgdt 0(%rsp)
+
+	PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('L');PRINT_L(':');PRINT_L(' ');
+ xorq %r8, %r8
+ movw (%rsp), %r8
+ PRINT_QUAD(%r8)
+
+	PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('A');PRINT_L(':');PRINT_L(' ');
+ movq 2(%rsp), %r8
+ PRINT_QUAD(%r8)
+
+	PRINT_L('C');PRINT_L('S');PRINT_L(':');PRINT_L(' ');
+ movq %cs, %rbx
+ PRINT_QUAD(%rbx)
+ movq %cs, %rbx
+ andb $(~3), %bl
+ addq %rbx, %r8
+ movq 0(%r8), %r9
+
+	PRINT_L('S');PRINT_L('E');PRINT_L('G');PRINT_L(':');PRINT_L(' ');
+ PRINT_QUAD(%r9);
+ movq $1, %r8;
+ shl $47, %r8
+ andq %r9, %r8
+	PRINT_L('P');PRINT_L(' ');PRINT_L(':');PRINT_L(' ');
+ PRINT_QUAD(%r8);
+	PRINT_L('D');PRINT_L('P');PRINT_L(':');PRINT_L(' ');
+ movq $3, %r8;
+ shl $45, %r8
+ andq %r9, %r8
+ PRINT_QUAD(%r8);
+
+
+ /* just die! */
+2:
+ pause
+ jmp 2b
+
+
+/* Real hardware interrupts are delivered straight to the host. Others
+ cause us to return to run_guest_once so it can decide what to do. Note
+ that some of these are overridden by the guest to deliver directly, and
+ never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+ .data; .quad 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+/*	.if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) */
+ .if (\N < 10 || \N > 14) && (\N <> 17)
+ pushq $0
+ .endif
+ pushq $\N
+ jmp \TARGET
+ .align 8
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host. Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+.global _lguest_default_idt_entries
+_lguest_default_idt_entries:
+.text
+ IRQ_STUBS 0 1 return_to_host /* First two traps */
+ IRQ_STUB 2 nmi_trampoline /* NMI */
+ IRQ_STUBS 3 7 return_to_host /* Rest of traps */
+/*debug for now */
+ IRQ_STUB 8 do_crash /* Double fault! */
+#if 1
+ IRQ_STUBS 9 31 return_to_host /* Rest of traps */
+#else
+ IRQ_STUBS 9 12 return_to_host /* Rest of traps */
+ IRQ_STUB 13 do_crash /* GPF! */
+ IRQ_STUBS 14 31 return_to_host /* Rest of traps */
+#endif
+ IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
+ IRQ_STUB 128 return_to_host /* System call (overridden) */
+ IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
+
+ .align PAGE_SIZE
+.global end_hyper_text
+ .type end_hyper_text, @function
+end_hyper_text:
+ nop
Index: work-pv/arch/x86_64/lguest/interrupts_and_traps.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/interrupts_and_traps.c
@@ -0,0 +1,292 @@
+#include <linux/uaccess.h>
+#include <asm/lguest.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include "lguest.h"
+
+static void push_guest_stack(struct lguest_vcpu *vcpu,
+ u64 __user **gstack, u64 val)
+{
+ lhwrite_u64(vcpu, (u64)--(*gstack), val);
+}
+
+static u64 pop_guest_stack(struct lguest_vcpu *vcpu,
+ u64 __user **gstack)
+{
+ return lhread_u64(vcpu, (u64)(*gstack)++);
+}
+
+void guest_iret(struct lguest_vcpu *vcpu)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ u64 __user *gstack;
+ u64 cs;
+
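+	/* Unwind the frame reflect_trap() pushed: RIP, CS, RFLAGS, RSP, SS. */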
+ gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp);
+
+ regs->rip = pop_guest_stack(vcpu, &gstack);
+ cs = pop_guest_stack(vcpu, &gstack);
+
+ /* FIXME: determine if we are going back to userland */
+
+ regs->rflags = pop_guest_stack(vcpu, &gstack);
+ /* FIXME: check if this is correct */
+
+ if (regs->rflags & 512)
+ put_user(512, &vcpu->guest->lguest_data->irq_enabled);
+
+ /* make sure interrupts are enabled */
+ regs->rflags |= 512;
+
+ regs->rsp = pop_guest_stack(vcpu, &gstack);
+ regs->ss = pop_guest_stack(vcpu, &gstack);
+
+ /* restore the rax reg, since it was used by the guest to do the hcall */
+ regs->rax = vcpu->rax;
+
+ return;
+}
+
+int reflect_trap(struct lguest_vcpu *vcpu, int trap_num, int has_err)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ u64 __user *gstack;
+ u64 rflags, irq_enable;
+ u64 offset;
+
+ if (!vcpu->interrupt[trap_num]) {
+ printk("Not yet registered trap handler for %d\n",trap_num);
+ return 0;
+ }
+
+ /* save off the rax reg */
+ vcpu->rax = regs->rax;
+
+ /* FIXME: test for ring change and set up vcpu->tss.rsp2 ? */
+ gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp);
+ offset = regs->rsp - (u64)gstack;
+
+ /* We use IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when guest is running). */
+ get_user(irq_enable, &vcpu->guest->lguest_data->irq_enabled);
+ rflags = regs->rflags;
+ rflags |= (irq_enable & 512);
+
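+	/* Mimic what the hardware would push for a trap: SS, RSP, RFLAGS,
+	 * CS and RIP, plus the error code when there is one. */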
+ /* FIXME: Really? */
+ push_guest_stack(vcpu, &gstack, regs->ss);
+ push_guest_stack(vcpu, &gstack, regs->rsp);
+ push_guest_stack(vcpu, &gstack, rflags);
+ /* FIXME: determine if guest is in kernel or user mode */
+ push_guest_stack(vcpu, &gstack, __KERNEL_CS);
+ push_guest_stack(vcpu, &gstack, regs->rip);
+
+ if (has_err)
+ push_guest_stack(vcpu, &gstack, regs->errcode);
+
+ /* Change the real stack so hypervisor returns to trap handler */
+ regs->ss = __USER_DS;
+ regs->rsp = (u64)gstack + offset;
+ regs->cs = __USER_CS;
+ lgdebug_print("rip was at %p\n", (void*)regs->rip);
+ regs->rip = vcpu->interrupt[trap_num];
+
+ /* Disable interrupts for an interrupt gate. */
+ if (test_bit(trap_num, vcpu->interrupt_disabled))
+ put_user(0, &vcpu->guest->lguest_data->irq_enabled);
+ return 1;
+#if 0
+ /* Was ist da? */
+ /* GS will be neutered on way back to guest. */
+ put_user(0, &lg->lguest_data->gs_gpf_eip);
+#endif
+ return 0;
+}
+
+void maybe_do_interrupt(struct lguest_vcpu *vcpu)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+ if (!vcpu->guest->lguest_data)
+ return;
+
+ /* If timer has changed, set timer interrupt. */
+	if (vcpu->guest->timer_on && jiffies != vcpu->guest->last_timer)
+ set_bit(0, vcpu->irqs_pending);
+
+ /* Mask out any interrupts they have blocked. */
+ if (copy_from_user(&irqs, vcpu->guest->lguest_data->interrupts,
+ sizeof(irqs)))
+ return;
+
+ bitmap_andnot(irqs, vcpu->irqs_pending, irqs, LGUEST_IRQS);
+
+ irq = find_first_bit(irqs, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (vcpu->guest->halted) {
+ /* Re-enable interrupts. */
+ put_user(512, &vcpu->guest->lguest_data->irq_enabled);
+ vcpu->guest->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ get_user(irq_enabled, &vcpu->guest->lguest_data->irq_enabled);
+ if (!irq_enabled) {
+ lgdebug_print("Irqs are disabled\n");
+ return;
+ }
+ }
+
+ if (vcpu->interrupt[irq + FIRST_EXTERNAL_VECTOR] != 0) {
+ lgdebug_print("Reflect trap: %x\n",irq+FIRST_EXTERNAL_VECTOR);
+ clear_bit(irq, vcpu->irqs_pending);
+ reflect_trap(vcpu, irq+FIRST_EXTERNAL_VECTOR, 0);
+ }
+ else {
+ lgdebug_print("out without doing it!!\n");
+ }
+
+}
+
+void check_bug_kill(struct lguest_vcpu *vcpu)
+{
+/* FIXME: Use rostedt magic kallsyms */
+#if 0
+#ifdef CONFIG_BUG
+ u32 eip = lg->state->regs.rip - PAGE_OFFSET;
+ u16 insn;
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.rip < PAGE_OFFSET)
+ return;
+ lhread(lg, &insn, eip, sizeof(insn));
+ if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ u16 l;
+ u32 f;
+ char file[128];
+ lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+ lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+ lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+ file[sizeof(file)-1] = 0;
+ kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+ kill_guest(lg, "BUG() at %#x", eip);
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+ }
+#endif /* CONFIG_BUG */
+#endif
+}
+
+static void copy_trap(struct lguest_vcpu *vcpu,
+ unsigned int trap_num,
+ const struct gate_struct *desc)
+{
+
+ /* Not present? */
+ if (!desc->p) {
+ vcpu->interrupt[trap_num] = 0;
+ return;
+ }
+
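+	/* Type 0xE is an interrupt gate (IRQs disabled on entry), 0xF a trap gate. */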
+ switch (desc->type) {
+ case 0xE:
+ set_bit(trap_num,vcpu->interrupt_disabled);
+ break;
+ case 0xF:
+ clear_bit(trap_num,vcpu->interrupt_disabled);
+ break;
+ default:
+ kill_guest(vcpu->guest, "bad IDT type %i for irq %x",
+ desc->type,trap_num);
+ }
+
+ vcpu->interrupt[trap_num] = GATE_ADDRESS((*desc));
+}
+
+#if 0
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[]
+= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+ 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+ /* movl 0, %ss:lguest_data.gs_gpf_eip */
+ 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+ u32 addr, off;
+
+ off = sizeof(tramp)*i;
+ memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+ /* 0 is to be placed in lguest_data.gs_gpf_eip. */
+ addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+ memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+ /* Address is relative to where end of jmp will be. */
+ addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+ memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+ return (-4*1024*1024) + off;
+}
+
+#endif
+/* We bounce through the trap page, for two reasons: firstly, we need
+ the interrupt destination always mapped, to avoid double faults,
+ secondly we want to reload %gs to make it innocuous on entering kernel.
+ */
+/* The guest kernel will not be mapped, so we'd better use another scheme. */
+static void setup_idt(struct lguest_vcpu *vcpu,
+ unsigned int i,
+ const struct gate_struct *desc)
+{
+ u64 taddr;
+
+ /* Not present? */
+ if (!desc->p) {
+ /* FIXME: When we need this, we'll know... */
+ if (vcpu->idt_table[i].p)
+ kill_guest(vcpu->guest, "trying to remove irq line %i:"
+ "removing interrupts not supported",i);
+ return;
+ }
+
+#if 0
+ /* We could reflect and disable interrupts, but guest can do itself. */
+ if (desc->type != 0xF)
+ kill_guest(vcpu->guest, "bad direct IDT %i type 0x%x",
+ i, desc->type);
+#endif
+
+ /* FIXME: We may need to fix segment? */
+	_lguest_set_gate(&vcpu->idt_table[i], desc->type, GUEST_DPL, taddr, 0);
+#if 0
+	taddr = setup_trampoline(lg, i,
+				 (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+#endif
+}
+
+void load_guest_idt_entry(struct lguest_vcpu *vcpu, unsigned int i,
+ struct gate_struct *d)
+{
+ switch (i) {
+ /* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+ case 2:
+ case 8:
+ case 14:
+ case 15:
+ case LGUEST_TRAP_ENTRY:
+ /* FIXME: We should handle debug and int3 */
+ case 1:
+ case 3:
+ return;
+ default:
+ copy_trap(vcpu,i,d);
+ }
+}
+
Index: work-pv/arch/x86_64/lguest/lguest.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest.c
@@ -0,0 +1,705 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@redhat.com>
+ * Steven Rostedt <srostedt@redhat.com>
+ * Red Hat Inc
+ * Standing on the shoulders of Rusty Russell.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/pfn.h>
+#include <asm/bootsetup.h>
+#include <asm/paravirt.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/asm-offsets.h>
+#include <asm/mce.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot =
+	(void *)__START_KERNEL_map;
+static struct lguest_text_ptr code_stack[2];
+extern int acpi_disabled;
+extern int acpi_ht;
+
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
+
+static DEFINE_SPINLOCK(hcall_print_lock);
+#define HCALL_BUFF_SIZ 1024
+static char hcall_buff[HCALL_BUFF_SIZ];
+
+/* Set to true when the lguest_init is called. */
+static int lguest_paravirt;
+
+struct lguest_print_ops {
+ void (*vprint)(const char *fmt, va_list ap);
+} *lguest_pops;
+
+void lguest_vprint(const char *fmt, va_list ap)
+{
+ if (lguest_pops)
+ lguest_pops->vprint(fmt, ap);
+}
+
+void lguest_print(const char *fmt, ...)
+{
+ va_list ap;
+
+ /* irq save? */
+ va_start(ap, fmt);
+ lguest_vprint(fmt, ap);
+ va_end(ap);
+}
+
+static void __lguest_vprint(const char *fmt, va_list ap)
+{
+ /* need to do this with interrupts disabled */
+// spin_lock(&hcall_print_lock);
+ vsnprintf(hcall_buff, HCALL_BUFF_SIZ-1, fmt, ap);
+
+ hcall(LHCALL_PRINT, __pa(hcall_buff), 0, 0);
+// spin_unlock(&hcall_print_lock);
+}
+
+struct lguest_print_ops local_pops = {__lguest_vprint };
+
+void lguest_set_debug(int d)
+{
+ if (lguest_paravirt)
+ hcall(LHCALL_DEBUG_ME, d, 0, 0);
+}
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* Note: This code assumes we're uniprocessor. */
+ static unsigned int next_call;
+ unsigned long flags;
+
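+	/* A slot whose hcall_status is 0xFF is free; writing 0 marks the
+	 * queued call as ready for the host to consume. */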
+ local_irq_save(flags);
+ if (lguest_data.hcall_status[next_call] != 0xFF) {
+ /* Table full, so do normal hcall which will flush table. */
+ hcall(call, arg1, arg2, arg3);
+ } else {
+ lguest_data.hcalls[next_call].eax = call;
+ lguest_data.hcalls[next_call].edx = arg1;
+ lguest_data.hcalls[next_call].ebx = arg2;
+ lguest_data.hcalls[next_call].ecx = arg3;
+ wmb();
+ lguest_data.hcall_status[next_call] = 0;
+ if (++next_call == LHCALL_RING_SIZE)
+ next_call = 0;
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE /* Not in 2.6.20. */
+static int lazy_mode;
+static void lguest_lazy_mode(int mode)
+{
+ lazy_mode = mode;
+ if (mode == PARAVIRT_LAZY_NONE)
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
+{
+ if (lazy_mode == PARAVIRT_LAZY_NONE)
+ hcall(call, arg1, arg2, arg3);
+ else
+ async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long save_fl(void)
+{
+ return lguest_data.irq_enabled;
+}
+
+static void restore_fl(unsigned long flags)
+{
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = flags;
+}
+
+static void irq_disable(void)
+{
+ lguest_data.irq_enabled = 0;
+}
+
+static void irq_enable(void)
+{
+ /* Linux i386 code expects bit 9 set. */
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = 512;
+}
+
+static void lguest_load_gdt(const struct desc_ptr *desc)
+{
+ /* Does nothing. HV should have done everything for us */
+}
+
+static void lguest_load_idt(const struct desc_ptr *desc)
+{
+ unsigned int i;
+ struct gate_struct *idt = (void *)desc->address;
+
+ for (i = 0; i < (desc->size+1)/16; i++) {
+ hcall(LHCALL_LOAD_IDT_ENTRY, i, __pa((u64)&idt[i]), 0);
+ }
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+ hcall(LHCALL_CRASH, __pa(p), 0, 0);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+ .notifier_call = lguest_panic
+};
+
+static void lguest_memory_setup(void)
+{
+ /* We do this here because lockcheck barfs if before start_kernel */
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+}
+
+static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int is_feature = (*eax == 1);
+
+ native_cpuid(eax, ebx, ecx, edx);
+ if (is_feature) {
+ unsigned long *excap = (unsigned long *)ecx,
+ *features = (unsigned long *)edx;
+ /* Hypervisor needs to know when we flush kernel pages. */
+ set_bit(X86_FEATURE_PGE, features);
+ /* We don't have any features! */
+ clear_bit(X86_FEATURE_VME, features);
+ clear_bit(X86_FEATURE_DE, features);
+ clear_bit(X86_FEATURE_PSE, features);
+ clear_bit(X86_FEATURE_PAE, features);
+ clear_bit(X86_FEATURE_SEP, features);
+ clear_bit(X86_FEATURE_APIC, features);
+ clear_bit(X86_FEATURE_MTRR, features);
+ /* No MWAIT, either */
+ clear_bit(3, excap);
+ }
+}
+
+static unsigned long current_cr3;
+static void lguest_write_cr3(unsigned long cr3)
+{
+ hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+ current_cr3 = cr3;
+}
+
+static u64 lguest_read_msr(unsigned int msr, int *err)
+{
+ unsigned long val;
+
+ *err = 0;
+ hcall(LHCALL_RDMSR, msr, __pa(&val), 0);
+ return val;
+}
+
+static int lguest_write_msr(unsigned int msr, u64 val)
+{
+ hcall(LHCALL_WRMSR, msr, (unsigned long)val, 0);
+ return val;
+}
+
+static u64 lguest_read_tsc(void)
+{
+ /* we don't use natives, otherwise they can recurse */
+ unsigned int a,b;
+ asm volatile("rdtsc" : "=a" (a), "=d" (b));
+ return a | (unsigned long)(b) << 32 ;
+}
+
+static void lguest_flush_tlb(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void lguest_flush_tlb_kernel(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void lguest_flush_tlb_single(u64 addr)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB_SIG, current_cr3, addr, 0);
+}
+
+static void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ hcall(LHCALL_SET_PTE, current_cr3, __pa(ptep), pte_val(pteval));
+}
+
+static void lguest_set_pte_at(struct mm_struct *mm, u64 addr, pte_t *ptep,
+			      pte_t pteval)
+{
+ *ptep = pteval;
+ lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), __pa(ptep), pte_val(pteval));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+ lazy_hcall(LHCALL_SET_PMD, current_cr3, __pa(pmdp)&PTE_MASK,
+ (__pa(pmdp)&(PAGE_SIZE-1))/8);
+}
+
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+ *pudp = pudval;
+ lazy_hcall(LHCALL_SET_PUD, current_cr3, __pa(pudp)&PTE_MASK,
+ (__pa(pudp)&(PAGE_SIZE-1))/8);
+}
+
+static void lguest_set_pgd(pgd_t *pgdp, pgd_t pgdval)
+{
+ *pgdp = pgdval;
+ lazy_hcall(LHCALL_SET_PGD, current_cr3, __pa(pgdp)&PTE_MASK,
+ (__pa(pgdp)&(PAGE_SIZE-1))/8);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void lguest_apic_write(unsigned long reg, unsigned int v)
+{
+}
+
+static unsigned int lguest_apic_read(unsigned long reg)
+{
+ return 0;
+}
+#endif
+
+#if 0
+/* We move eflags word to lguest_data.irq_enabled to restore interrupt
+ state. For page faults, gpfs and virtual interrupts, the
+ hypervisor has saved eflags manually, otherwise it was delivered
+ directly and so eflags reflects the real machine IF state,
+ ie. interrupts on. Since the kernel always dies if it takes such a
+ trap with interrupts disabled anyway, turning interrupts back on
+ unconditionally here is OK. */
+asm("lguest_iret:"
+ " pushq %rax;"
+ " movq 0x18(%rsp), %rax;"
+ "lguest_noirq_start:;"
+ " movq %rax,
lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+ " popq %rax;"
+ " iretq;"
+ "lguest_noirq_end:");
+extern char lguest_noirq_start[], lguest_noirq_end[];
+#endif
+
+extern void lguest_iret(void);
+asm("lguest_iret:"
+ " movq $" __stringify(LHCALL_IRET) ", %rax\n"
+ " int $" __stringify(LGUEST_TRAP_ENTRY) );
+
+
+static void lguest_load_rsp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ lazy_hcall(LHCALL_SET_STACK, thread->rsp0, THREAD_SIZE/PAGE_SIZE, 0);
+}
+
+static void lguest_load_tr_desc(void)
+{
+}
+
+static void lguest_set_ldt(const void *addr, unsigned entries)
+{
+ /* FIXME: Implement. */
+ BUG_ON(entries);
+}
+
+static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static void lguest_set_debugreg(int regno, unsigned long value)
+{
+ /* FIXME: Implement */
+}
+
+static unsigned int lguest_cr0;
+static void lguest_clts(void)
+{
+ lazy_hcall(LHCALL_TS, 0, 0, 0);
+ lguest_cr0 &= ~8U;
+}
+
+static unsigned long lguest_read_cr0(void)
+{
+ return lguest_cr0;
+}
+
+static void lguest_write_cr0(unsigned long val)
+{
+ hcall(LHCALL_TS, val & 8, 0, 0);
+ lguest_cr0 = val;
+}
+
+static unsigned long lguest_read_cr2(void)
+{
+ return lguest_data.cr2;
+}
+
+static unsigned long lguest_read_cr3(void)
+{
+ return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static unsigned long lguest_read_cr4(void)
+{
+ return 0;
+}
+
+static void lguest_write_cr4(unsigned long val)
+{
+}
+
+static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+ do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+ update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+ set_bit(irq, lguest_data.interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+ clear_bit(irq, lguest_data.interrupts);
+ /* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+ .name = "lguest",
+ .mask = disable_lguest_irq,
+ .mask_ack = disable_lguest_irq,
+ .unmask = enable_lguest_irq,
+};
+
+static void lguest_time_init(void)
+{
+ set_irq_handler(0, lguest_time_irq);
+ hcall(LHCALL_TIMER_START,HZ,0,0);
+}
+
+static void lguest_ebda_info(unsigned *addr, unsigned *size)
+{
+ *addr = *size = 0;
+}
+
+/* From i8259.c */
+extern void (*interrupt[])(void);
+static void __init lguest_init_IRQ(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_IRQS; i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ /* FIXTHEM: We should be doing it in a lot of other places */
+ if (vector != IA32_SYSCALL_VECTOR) {
+ printk("Setting vector %x as %p\n",vector, &interrupt[i]);
+ set_intr_gate(vector, interrupt[i]);
+ set_irq_chip_and_handler(i, &lguest_irq_controller,
+ handle_level_irq);
+ hcall(LHCALL_LOAD_IDT_ENTRY, vector, __pa((u64)&idt_table[vector]), 0);
+ }
+ }
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low,
+					 u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
+static void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ /* FIXME: Allow this. */
+ BUG();
+}
+
+static void lguest_write_gdt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static void lguest_write_idt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_CRASH, 0, 0, 0);
+ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name
":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+ const char *start, *end;
+} lguest_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+ [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+ [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+ [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+ [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+ return len;
+
+ insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+ /* Similarly if we can't fit the replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(insns, lguest_insns[type].start, insn_len);
+ if (type == PARAVIRT_INTERRUPT_RETURN) {
+ /* Jumps are relative. */
+ u64 off = (u64)lguest_iret - ((u64)insns + insn_len);
+ memcpy(insns+1, &off, sizeof(off));
+ }
+ return insn_len;
+}
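+
+#if 0
+/*
+ * Illustrative sketch only, not part of this patch: roughly how a
+ * generic patcher is expected to drive lguest_patch() above.  The
+ * call site pointer, its length and the nop padding are hypothetical;
+ * the real caller is the paravirt patching code.
+ */
+static void example_patch_site(u8 type, void *site, unsigned len)
+{
+	/* Copy in the inline replacement (e.g. start_cli..end_cli for
+	 * PARAVIRT_IRQ_DISABLE) if it fits, else keep the indirect call. */
+	unsigned used = lguest_patch(type, 0, site, len);
+
+	/* Pad the remainder so execution falls through cleanly. */
+	memset((u8 *)site + used, 0x90, len - used);	/* 0x90 == nop */
+}
+#endif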
+
+static void lguest_safe_halt(void)
+{
+ hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+ return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+ hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static void lguest_syscall_init(void)
+{
+ /* FIXME: Will have to implement it later */
+}
+
+static __attribute_used__ __init void lguest_init(void)
+{
+ int i;
+
+ current_cr3 = __pa(&boot_level4_pgt);
+ paravirt_ops.name = "lguest";
+ paravirt_ops.mem_type = "LGUEST";
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.syscall_init = lguest_syscall_init;
+
+ paravirt_ops.save_fl = save_fl;
+ paravirt_ops.restore_fl = restore_fl;
+ paravirt_ops.irq_disable = irq_disable;
+ paravirt_ops.irq_enable = irq_enable;
+ paravirt_ops.load_gdt = lguest_load_gdt;
+ paravirt_ops.memory_setup = lguest_memory_setup;
+ paravirt_ops.cpuid = lguest_cpuid;
+ paravirt_ops.write_cr3 = lguest_write_cr3;
+ paravirt_ops.read_msr = lguest_read_msr;
+ paravirt_ops.write_msr = lguest_write_msr;
+ paravirt_ops.read_tsc = lguest_read_tsc;
+ paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+ paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+ paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+ paravirt_ops.set_pte = lguest_set_pte;
+ paravirt_ops.set_pte_at = lguest_set_pte_at;
+ paravirt_ops.set_pmd = lguest_set_pmd;
+ paravirt_ops.set_pud = lguest_set_pud;
+ paravirt_ops.set_pgd = lguest_set_pgd;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.apic_write = lguest_apic_write;
+ paravirt_ops.apic_read = lguest_apic_read;
+#endif
+ paravirt_ops.load_idt = lguest_load_idt;
+ paravirt_ops.iret = lguest_iret;
+ paravirt_ops.load_rsp0 = lguest_load_rsp0;
+ paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+ paravirt_ops.set_ldt = lguest_set_ldt;
+ paravirt_ops.load_tls = lguest_load_tls;
+ paravirt_ops.set_debugreg = lguest_set_debugreg;
+ paravirt_ops.clts = lguest_clts;
+ paravirt_ops.read_cr0 = lguest_read_cr0;
+ paravirt_ops.write_cr0 = lguest_write_cr0;
+ paravirt_ops.init_IRQ = lguest_init_IRQ;
+ paravirt_ops.read_cr2 = lguest_read_cr2;
+ paravirt_ops.read_cr3 = lguest_read_cr3;
+ paravirt_ops.read_cr4 = lguest_read_cr4;
+ paravirt_ops.write_cr4 = lguest_write_cr4;
+ paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+ paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+ paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+ paravirt_ops.patch = lguest_patch;
+ paravirt_ops.safe_halt = lguest_safe_halt;
+ paravirt_ops.get_wallclock = lguest_get_wallclock;
+ paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+ paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+ paravirt_ops.ebda_info = lguest_ebda_info;
+
+ memset(lguest_data.hcall_status, 0xFF, sizeof(lguest_data.hcall_status));
+#if 0
+ lguest_data.noirq_start = (u64)lguest_noirq_start;
+ lguest_data.noirq_end = (u64)lguest_noirq_end;
+#endif
+ lguest_data.start_kernel_map = __START_KERNEL_map; /* current page offset */
+ lguest_data.page_offset = PAGE_OFFSET;
+
+ code_stack[0].next = __pa(&code_stack[1]);
+ code_stack[0].start = (unsigned long)_stext;
+ code_stack[0].end = (unsigned long)_etext;
+ code_stack[1].next = 0;
+ code_stack[1].start = (unsigned long)_sinittext;
+ code_stack[1].end = (unsigned long)_einittext;
+
+ lguest_data.text = __pa(&code_stack[0]);
+
+ lguest_data.kallsyms_addresses = __pa(&kallsyms_addresses);
+ lguest_data.kallsyms_num_syms = kallsyms_num_syms;
+ lguest_data.kallsyms_names = __pa(&kallsyms_names);
+ lguest_data.kallsyms_token_table = __pa(&kallsyms_token_table);
+ lguest_data.kallsyms_token_index = __pa(&kallsyms_token_index);
+ lguest_data.kallsyms_markers = __pa(&kallsyms_markers);
+
+ hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+
+ lguest_pops = &local_pops;
+ lguest_paravirt = 1;
+
+ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+ lguest_write_cr3(__pa_symbol(&init_level4_pgt));
+
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_pda(i) = &boot_cpu_pda[i];
+
+ pda_init(0);
+// copy_bootdata(real_mode_data);
+#ifdef CONFIG_SMP
+ cpu_set(0, cpu_online_map);
+#endif
+
+// strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+ /* We use top of mem for initial pagetables. */
+// init_pg_tables_end = __pa(pg0);
+
+// reserve_top_address(lguest_data.reserve_mem);
+
+ /* FIXME: Better way? */
+ /* Suppress vgacon startup code */
+ SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+ add_preferred_console("hvc", 0, NULL);
+/*
+#ifdef CONFIG_X86_MCE
+ mcheck_disable(NULL);
+#endif
+*/
+#ifdef CONFIG_ACPI
+ acpi_disabled = 1;
+ acpi_ht = 0;
+#endif
+ if (boot->initrd_size) {
+ /* We stash this at top of memory. */
+ INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+ INITRD_SIZE = boot->initrd_size;
+ LOADER_TYPE = 0xFF;
+ }
+ pm_power_off = lguest_power_off;
+
+ start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R13)", %r13\n"
+ " jne 1f\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R14)", %r14\n"
+ " jne 1f\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R15)", %r15\n"
+ " je lguest_init\n"
+ "1: ret");
+
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
Index: work-pv/arch/x86_64/lguest/lguest.h
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest.h
@@ -0,0 +1,161 @@
+#ifndef _LGUEST_GUEST_H_
+#define _LGUEST_GUEST_H_
+
+#define GUEST_DPL 0x3
+
+#define gdt_index(x) ((x) >> 3)
+
+/*
+ * Must be less than fixmap!
+ *
+ * To keep the hypervisor from needing any data sections,
+ * we need to hard code the difference between what the hypervisor
+ * may put into the GS base, and what we let the guest put in.
+ * We allow the guest to put in "Kernel addresses" to simplify
+ * the guest PDA code.
+ */
+#define LGUEST_HV_OFFSET_HIGH 0xffffffff
+#define LGUEST_HV_OFFSET_LOW 0xff000000
+
+#define LGUEST_NMI_IST 7
+
+#define LGUEST_MAGIC 0x6c6775657374 /* "lguest" */
+
+#ifndef __ASSEMBLY__
+#include <asm/lguest.h>
+
+extern void switch_to_guest(struct lguest_vcpu *);
+extern unsigned long hcall_teste;
+extern unsigned long host_syscall;
+extern unsigned long _lguest_default_idt_entries[];
+extern unsigned long lguest_hv_addr;
+extern unsigned long lguest_hv_offset;
+extern int lguest_hv_pages;
+extern int lguest_vcpu_pages;
+extern int lguest_vcpu_order;
+extern struct mutex lguest_lock;
+
+/* FIXME: Those would live better in some main kernel header */
+/* Page fault error code bits */
+#define PF_PROT (1<<0) /* or no page found */
+#define PF_WRITE (1<<1)
+#define PF_USER (1<<2)
+#define PF_RSVD (1<<3)
+#define PF_INSTR (1<<4)
+
+#define kill_guest(guest, fmt...) \
+do { \
+ if (!(guest)->dead) { \
+ (guest)->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(guest)->dead) \
+ (guest)->dead = (void *)-1; \
+ } \
+} while (0)
+
+#define kill_guest_dump(vcpu, fmt...) \
+do { \
+ kill_guest((vcpu)->guest, fmt); \
+ lguest_dump_vcpu_regs(vcpu); \
+} while(0)
+
+static inline void _lguest_set_gate(struct gate_struct *s, unsigned type,
+ unsigned long func, unsigned dpl, unsigned ist)
+{
+ s->offset_low = PTR_LOW(func);
+ s->segment = __HV_CS;
+ s->ist = ist;
+ s->p = 1;
+ s->dpl = dpl;
+ s->zero0 = 0;
+ s->zero1 = 0;
+ s->type = type;
+ s->offset_middle = PTR_MIDDLE(func);
+ s->offset_high = PTR_HIGH(func);
+}
+
+static inline unsigned long guest_pa(struct lguest_guest_info *linfo, u64 addr)
+{
+ return (addr >= linfo->start_kernel_map) ?
+ (addr - linfo->start_kernel_map) :
+ (addr - linfo->page_offset);
+}
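+
+#if 0
+/*
+ * Illustrative only (made-up offset): guest_pa() handles the two ways
+ * a guest kernel address is formed.  A kernel-text style address is
+ * reduced by start_kernel_map, a direct-mapping address by
+ * page_offset; both examples below resolve to guest-physical 0x201000.
+ */
+static inline void guest_pa_example(struct lguest_guest_info *linfo)
+{
+	u64 text_pa = guest_pa(linfo, linfo->start_kernel_map + 0x201000);
+	u64 dmap_pa = guest_pa(linfo, linfo->page_offset + 0x201000);
+
+	(void)text_pa;	/* == 0x201000 */
+	(void)dmap_pa;	/* == 0x201000 */
+}
+#endif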
+
+int lguest_address_ok(const struct lguest_guest_info *, u64);
+
+int demand_page(struct lguest_vcpu *, u64, int);
+/* FIXME: put this in hv_vm.h */
+unsigned long hvvm_get_actual_phys(void *addr, pgprot_t *prot);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+
+/* page_tables.h */
+int lguest_map_hv_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages,
+ pgprot_t *prot);
+int lguest_map_guest_page(struct lguest_guest_info *lguest,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot);
+void lguest_unmap_guest_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages);
+void lguest_free_guest_pages(struct lguest_guest_info *lguest);
+
+void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 vaddr);
+
+void guest_set_pte(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx);
+void guest_set_pmd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_set_pud(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_set_pgd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr);
+void guest_pagetable_clear_all(struct lguest_vcpu *vcpu);
+void guest_pagetable_flush_user(struct lguest_vcpu *vcpu);
+void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable);
+
+int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable);
+int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu);
+
+int hypercall(struct lguest_vcpu *vcpu);
+
+/* core.c */
+u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr);
+u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr);
+u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr);
+void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val);
+
+void lhread(struct lguest_guest_info *, void *, u64, unsigned);
+void lhwrite(struct lguest_guest_info *, u64, const void *, unsigned);
+
+/* io.c */
+u32 bind_dma(struct lguest_guest_info *, unsigned long, unsigned long,
+ u16, u8);
+int send_dma(struct lguest_guest_info *, unsigned long, unsigned long);
+
+/* interrupts_and_traps.c */
+
+void load_guest_idt_entry(struct lguest_vcpu *, unsigned int,
+ struct gate_struct *);
+void maybe_do_interrupt(struct lguest_vcpu *);
+void guest_iret(struct lguest_vcpu *vcpu);
+int reflect_trap(struct lguest_vcpu *, int, int);
+
+/* lguest_debug.c */
+extern int lguest_debug;
+void lgdebug_print(const char *fmt, ...);
+void lgdebug_vprint(const char *fmt, va_list ap);
+void lguest_dump_vcpu_regs(struct lguest_vcpu *vcpu);
+void lguest_dump_trace(struct lguest_vcpu *vcpu, struct lguest_regs *regs);
+void lguest_print_address(struct lguest_vcpu *vcpu, unsigned long address);
+void lguest_print_page_tables(u64 *cr3);
+void lguest_print_guest_page_tables(struct lguest_vcpu *vcpu, u64 cr3);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
Index: work-pv/arch/x86_64/lguest/lguest_user.c
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest_user.c
@@ -0,0 +1,436 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <asm/lguest_user.h>
+#include <asm/hv_vm.h>
+#include "lguest.h"
+
+static int next_guest_id;
+
+#if 0
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+ unsigned long addr, udma, irq;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ udma = get_dma_buffer(lg, addr, &irq);
+ if (!udma)
+ return -ENOENT;
+
+ /* We put irq number in udma->used_len. */
+ lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+ return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+ u32 irq;
+
+ if (get_user(irq, input) != 0)
+ return -EFAULT;
+ if (irq >= LGUEST_IRQS)
+ return -EINVAL;
+ set_bit(irq, lg->irqs_pending);
+ return 0;
+}
+#endif
+
+static ssize_t read(struct file *file, char __user *user, size_t size, loff_t *o)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ struct lguest_guest_info *linfo;
+ int ret;
+
+ if (!vcpu)
+ return -EINVAL;
+ linfo = vcpu->guest;
+
+ if (linfo->dead) {
+ size_t len;
+
+ if (linfo->dead == (void *)-1)
+ return -ENOMEM;
+
+ len = min(size, strlen(linfo->dead)+1);
+ if (copy_to_user(user, linfo->dead, len) != 0)
+ return -EFAULT;
+ return len;
+ }
+
+#if 0
+ if (lg->dma_is_pending)
+ lg->dma_is_pending = 0;
+#endif
+
+ ret = run_guest(vcpu, user);
+ if (ret != -EINTR)
+ ret = -ENOENT;
+ return ret;
+}
+
+struct lguest_vcpu *allocate_vcpu(struct lguest_guest_info *linfo)
+{
+ struct lguest_vcpu *vcpu;
+ unsigned long hv_vcpu;
+ int ret;
+
+ vcpu = (void*)__get_free_pages(GFP_KERNEL, lguest_vcpu_order);
+ if (!vcpu)
+ return NULL;
+ memset(vcpu, 0, sizeof(*vcpu));
+
+ ret = hvvm_map_pages(vcpu, lguest_vcpu_pages, &hv_vcpu);
+ if (ret < 0)
+ goto out;
+
+ ret = lguest_map_hv_pages(linfo, hv_vcpu, lguest_vcpu_pages, NULL);
+ if (ret < 0)
+ goto out2;
+
+ vcpu->host_page = (unsigned long)vcpu;
+
+ return (struct lguest_vcpu*)hv_vcpu;
+
+out2:
+ hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages);
+out:
+ free_pages((unsigned long)vcpu, lguest_vcpu_order);
+
+ return NULL;
+}
+
+void free_vcpu(struct lguest_guest_info *linfo, struct lguest_vcpu *vcpu)
+{
+ unsigned long hv_vcpu = (unsigned long)vcpu;
+ free_pages(vcpu->host_page, lguest_vcpu_order);
+ lguest_unmap_guest_pages(linfo, hv_vcpu, lguest_vcpu_pages);
+ hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages);
+ lguest_free_guest_pages(linfo);
+}
+
+#if 0
+static void print_tss(struct ldttss_desc *tss)
+{
+ u64 base;
+ u64 limit;
+ int i;
+ u16 iobp = 0x64;
+
+ base = (tss->base0) + ((u64)tss->base1 << 16) +
+ ((u64)tss->base2 << 24) + ((u64)tss->base3 << 32);
+ limit = (tss->limit0) + ((u64)tss->limit1 << 16);
+ if (tss->g)
+ limit <<= 12;
+ printk(" base: %016llx\n", base);
+ printk(" limit: %llx\n", limit);
+ printk(" type: %x\n", tss->type);
+ printk(" dpl: %d\n", tss->dpl);
+ printk(" p: %d\n", tss->p);
+ printk(" g: %d\n", tss->g);
+
+ for (i=0; i < limit; i += 4) {
+ printk(" %8x: %08x\n", i, *(u32*)(base+i));
+ if (i == 0x64) {
+ iobp = (u16)((*(u32*)(base+i))>>16);
+ }
+ if (i >= iobp && *(s32*)(base+i) == -1L)
+ break;
+ }
+}
+#endif
+
+/* should be in some other file ? */
+int vcpu_start(int cpu, struct lguest_guest_info *linfo,
+ unsigned long entry_point,
+ void *pgd)
+{
+ struct lguest_vcpu *vcpu;
+ struct desc_struct *gdt_table;
+ struct lguest_regs *regs;
+ struct ldttss_desc *tss;
+ struct lguest_tss_struct *tss_ptr;
+ u64 target;
+ u64 limit;
+ u64 base;
+ int i;
+
+ if (cpu >= LGUEST_MAX_VCPUS)
+ return -EINVAL;
+
+ vcpu = allocate_vcpu(linfo);
+ if (!vcpu)
+ return -ENOMEM;
+
+ printk("vcpu: %p\n", vcpu);
+
+ /*
+ * Point back to itself to make it easier to read from gs:base in
+ * hypervisor.S
+ */
+ vcpu->vcpu = vcpu;
+ vcpu->magic = LGUEST_MAGIC;
+ gdt_table = cpu_gdt(get_cpu());
+ put_cpu();
+
+ /* Our gdt is basically host's, except for the privilege level */
+ for (i = 0; i < GDT_ENTRIES; i++) {
+ vcpu->gdt_table[i] = gdt_table[i];
+
+ if (!gdt_table[i].type)
+ continue;
+
+ switch (i) {
+ /* Keep TSS, and HV, and Host KERNEL segments the same */
+ case GDT_ENTRY_TSS:
+ /* The TSS will be modified below */
+ case GDT_ENTRY_HV_CS:
+ case GDT_ENTRY_HV_DS:
+ case __KERNEL_CS >> 3:
+ case __KERNEL_DS >> 3:
+ break;
+ default:
+ vcpu->gdt_table[i].dpl = GUEST_DPL;
+ }
+ }
+
+ for (i = 0; i < IDT_ENTRIES; i++) {
+ unsigned dpl = i == LGUEST_TRAP_ENTRY ? GUEST_DPL : 0;
+ /* NMI gets its own stack */
+ int ist = (i == 2) ? LGUEST_NMI_IST :
+ /* temp debug for now */
+ (i == 8) ? 6 : /* Double Fault */
+// (i == 13) ? 5 : /* GPF */
+ 0;
+
+ _lguest_set_gate(&vcpu->idt_table[i], 0xe,
+ _lguest_default_idt_entries[i] +
+ lguest_hv_offset, dpl, ist);
+ }
+
+ vcpu->gdt.size = 8 * GDT_ENTRIES - 1;
+ vcpu->gdt.address = (unsigned long)&vcpu->gdt_table;
+
+ vcpu->idt.size = 16 * IDT_ENTRIES -1;
+ vcpu->idt.address = (unsigned long)vcpu->idt_table;
+ rdmsrl(MSR_LSTAR, vcpu->host_syscall);
+
+ vcpu->id = cpu;
+ vcpu->guest = linfo;
+ linfo->vcpu[cpu] = vcpu;
+
+ lguest_init_vcpu_pagetable(vcpu);
+
+ /* setup the tss */
+ tss = (struct ldttss_desc*)&vcpu->gdt_table[GDT_ENTRY_TSS];
+ limit = sizeof(struct lguest_tss_struct);
+ base = (u64)&vcpu->tss;
+ tss->limit0 = (u16)limit;
+ tss->base0 = (u16)base;
+ tss->base1 = (u8)(base>>16);
+ tss->base2 = (u8)(base>>24);
+ tss->base3 = (u32)(base>>32);
+ tss->type = 0x9;
+ tss->g = 0; /* small tss */
+
+ vcpu->tss.rsp0 = (unsigned long)(&vcpu->regs.size);
+
+ /* NMI can happen at any time, so give it its own stack */
+ vcpu->tss.ist[LGUEST_NMI_IST-1] = (unsigned long)(&vcpu->nmi_stack_end);
+ printk("nmi stack at: %llx\n", vcpu->tss.ist[LGUEST_NMI_IST-1]);
+
+ /* temp debug stuff */
+ vcpu->tss.ist[5-1] = (unsigned long)(&vcpu->gpf_stack_end);
+ vcpu->tss.ist[6-1] = (unsigned long)(&vcpu->df_stack_end);
+ /*
+ * Load the host nmi stack into the guest tss. This prevents races
+ * in loading the TR and IDT.
+ */
+ tss = (struct ldttss_desc *)&gdt_table[GDT_ENTRY_TSS];
+ target = (u64)tss->base0 |
+ ((u64)tss->base1 << 16) |
+ ((u64)tss->base2 << 24) |
+ ((u64)tss->base3 << 32);
+
+ tss_ptr = (struct lguest_tss_struct*)target;
+
+ vcpu->tss.ist[NMI_STACK-1] = tss_ptr->ist[NMI_STACK-1];
+
+ /*
+ * The rsp0 had better be 16-byte aligned, or the interrupt
+ * will put the stack at an undesirable location.
+ */
+ /* Don't remove this test!!! */
+ if (unlikely(vcpu->tss.rsp0 & 0xf)) {
+ printk("HV ALIGNMENT BUG! don't put stack here!!\n");
+ printk(" tss.rsp0 stack was set to %llx\n",
+ vcpu->tss.rsp0);
+ goto out;
+ }
+
+ vcpu->tss.io_bitmap_base = 0x68;
+ vcpu->tss.io_bitmap[0] = -1UL;
+
+ regs = &vcpu->regs;
+ regs->cr3 = __pa(vcpu->pgdir->pgdir);
+ regs->rax = regs->rbx = regs->rcx = regs->rdx = regs->r8 =
+ regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->rdi =
+ regs->rsi = regs->rbp = 0;
+ regs->r13 = LGUEST_MAGIC_R13;
+ regs->r14 = LGUEST_MAGIC_R14;
+ regs->r15 = LGUEST_MAGIC_R15;
+ regs->fs = 0;
+ regs->trapnum = 0;
+ regs->errcode = 0;
+ regs->rip = entry_point;
+// regs->rip = 0x1000100;
+ regs->cs = __USER_CS;
+ regs->rflags = 0x202; /* Interrupts enabled. */
+ regs->rsp = 0;
+ regs->ss = __USER_DS;
+
+ return 0;
+out:
+ free_vcpu(linfo, vcpu);
+ return -EINVAL;
+}
+
+static int initialize_guest(struct file *file, const u64 __user *input)
+{
+ struct lguest_guest_info *linfo;
+ int err;
+ u64 args[4];
+ int i;
+
+ if (file->private_data)
+ return -EBUSY;
+
+ if (copy_from_user(args, input, sizeof(args)) != 0)
+ return -EFAULT;
+
+ linfo = kzalloc(sizeof(*linfo), GFP_KERNEL);
+ if (!linfo)
+ return -ENOMEM;
+
+ /* FIXME: protect the guest_id counter */
+ linfo->guest_id = ++next_guest_id;
+
+ linfo->pfn_limit = args[0];
+ linfo->page_offset = args[3];
+ linfo->start_kernel_map = args[3];
+
+ mutex_init(&linfo->page_lock);
+ INIT_LIST_HEAD(&linfo->pgd_list);
+
+ for (i=0; i < PUD_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pud_hash[i]);
+
+ for (i=0; i < PMD_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pmd_hash[i]);
+
+ for (i=0; i < PTE_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pte_hash[i]);
+
+ err = init_guest_pagetable(linfo, args[1]);
+ if (err)
+ return -ENOMEM; /* what else to return ?? */
+#if 0
+
+ lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir, args[2]);
+ if (!lg->state) {
+ err = -ENOEXEC;
+ goto release_pgtable;
+ }
+#endif
+ err = vcpu_start(0, linfo, args[2], __va(read_cr3()));
+ if (err < 0)
+ return err;
+
+ file->private_data = linfo->vcpu[0];
+
+ return sizeof(args);
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+ size_t size, loff_t *off)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ u64 req;
+
+ if (get_user(req, input) != 0)
+ return -EFAULT;
+ input += sizeof(req);
+
+ if (req != LHREQ_INITIALIZE && !vcpu)
+ return -EINVAL;
+#if 0
+ if (lg && lg->dead)
+ return -ENOENT;
+#endif
+
+ switch (req) {
+ case LHREQ_INITIALIZE:
+ return initialize_guest(file, (const u64 __user *)input);
+#if 0
+ case LHREQ_GETDMA:
+ return user_get_dma(lg, (const u32 __user *)input);
+ case LHREQ_IRQ:
+ return user_send_irq(lg, (const u32 __user *)input);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
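+
+#if 0
+/*
+ * Illustrative sketch only (hypothetical launcher snippet, userspace):
+ * the buffer layout the write() handler above expects for
+ * LHREQ_INITIALIZE -- the request code followed by the four u64
+ * arguments initialize_guest() unpacks: pfn limit, guest top-level
+ * page table, entry point and page offset.
+ */
+	u64 req[5] = { LHREQ_INITIALIZE, pfn_limit, guest_pgtable,
+		       entry_point, page_offset };
+
+	if (write(lguest_fd, req, sizeof(req)) < 0)
+		/* guest could not be initialized */;
+#endif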
+
+static int close(struct inode *inode, struct file *file)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ struct lguest_guest_info *linfo;
+
+ if (!vcpu)
+ return -EBADFD;
+
+ linfo = vcpu->guest;
+ /* FIXME: need to handle multiple vcpus */
+ free_vcpu(linfo, vcpu);
+ kfree(linfo);
+#if 0
+ mutex_lock(&lguest_lock);
+ release_all_dma(lg);
+ free_page((long)lg->trap_page);
+ free_guest_pagetable(lg);
+ mmput(lg->mm);
+ if (lg->dead != (void *)1)
+ kfree(lg->dead);
+ memset(lg->state, 0, sizeof(*lg->state));
+ memset(lg, 0, sizeof(*lg));
+ mutex_unlock(&lguest_lock);
+#endif
+ return 0;
+}
+
+static struct file_operations lguest_fops = {
+ .owner = THIS_MODULE,
+ .release = close,
+ .write = write,
+ .read = read,
+};
+static struct miscdevice lguest_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "lguest",
+ .fops = &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+ return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+ misc_deregister(&lguest_dev);
+}
Index: work-pv/arch/x86_64/lguest/page_tables.c
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/page_tables.c
@@ -0,0 +1,1285 @@
+/* Shadow page table operations.
+ * Copyright (C) Steven Rostedt, Red Hat Inc, 2007
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include <asm/hv_vm.h>
+#include "lguest.h"
+
+/* move this to hv_vm.h */
+#define HVVM_END (HVVM_START + HV_VIRT_SIZE)
+
+#define HASH_PUD(x) (((u64)(x)>>PAGE_SHIFT) & (PUD_HASH_SIZE-1))
+#define HASH_PMD(x) (((u64)(x)>>PAGE_SHIFT) & (PMD_HASH_SIZE-1))
+#define HASH_PTE(x) (((u64)(x)>>PAGE_SHIFT) & (PTE_HASH_SIZE-1))
+
+/* guest and host share the same offset into the page tables */
+/* 9 bits at 8 byte increments */
+#define guest_host_idx(vaddr) ((vaddr) & (0x1ff<<3))
+
+
+/* These access the guest versions. */
+static u64 gtoplev(struct lguest_vcpu *vcpu, unsigned long vaddr)
+{
+ unsigned index = pgd_index(vaddr);
+
+ return vcpu->pgdir->cr3 + index * sizeof(u64);
+}
+
+
+#if 0
+
+/* FIXME: we need to put these in and make it more secure! */
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+ if ((entry & (_PAGE_PWT|_PAGE_PSE))
+ || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+ kill_guest(lg, "bad page table entry");
+ return entry & ~_PAGE_GLOBAL;
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+ unsigned int i;
+ u32 stack = lg->state->tss.esp1;
+
+ for (i = 0; i < lg->stack_pages; i++)
+ if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+ kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+ unsigned int i;
+
+ release_all_pagetables(lg);
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+void map_trap_page(struct lguest *lg)
+{
+ int cpu = smp_processor_id();
+
+ hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+ /* Since the hypervisor is less than 4MB, we simply mug the top pte page. */
+ lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+ (__pa(hypervisor_pte_page(cpu)) | __PAGE_KERNEL);
+}
+
+#endif
+
+static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t pprot);
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned long ret = -1UL;
+
+ down_read(&current->mm->mmap_sem);
+ if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+ 1, write, 1, &page, &vma) == 1)
+ ret = page_to_pfn(page);
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+static int is_hv_page(int pgd_idx, int pud_idx, int pmd_idx, int pte_idx)
+{
+ /* Never release the hv pages */
+ u64 addr = (u64)pgd_idx << PGDIR_SHIFT |
+ (u64)pud_idx << PUD_SHIFT |
+ (u64)pmd_idx << PMD_SHIFT |
+ (u64)pte_idx << PAGE_SHIFT;
+ /* sign extend */
+ if (pgd_idx & (1<<8))
+ addr |= 0xffffULL << 48;
+ return (addr >= HVVM_START) &&
+ (addr < (HVVM_START + HV_VIRT_SIZE));
+}
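+
+/*
+ * Worked example (illustrative values): pgd index 0x1ff with pud, pmd
+ * and pte indices of 0 reconstructs to 0x0000ff8000000000; since bit 8
+ * of the pgd index (bit 47 of the address) is set, the sign extension
+ * above yields the canonical 0xffffff8000000000, which is then tested
+ * against the HVVM_START..HVVM_START+HV_VIRT_SIZE window.
+ */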
+
+static void release_pte(u64 pte)
+{
+ if (pte & _PAGE_PRESENT)
+ put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+static int release_pmd(int pgd_idx, int pud_idx, u64 *pmd, int idx)
+{
+ int save = 0;
+ if (pmd[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *ptepage = __va(pmd[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PMD; i++)
+ if (is_hv_page(pgd_idx, pud_idx, idx, i))
+ save = 1;
+ else
+ release_pte(ptepage[i]);
+ /* never free the HV pmds */
+ if (!save) {
+ free_page((unsigned long)ptepage);
+ pmd[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static int release_pud(int pgd_idx, u64 *pud, int idx)
+{
+ int save = 0;
+ if (pud[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *pmdpage = __va(pud[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PUD; i++)
+ if (release_pmd(pgd_idx, idx, pmdpage, i))
+ save = 1;
+ /* never free the HV puds */
+ if (!save) {
+ free_page((unsigned long)pmdpage);
+ pud[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static int release_pgd(u64 *pgd, int idx)
+{
+ int save = 0;
+
+ if (pgd[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *pudpage = __va(pgd[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PGD; i++) {
+ if (release_pud(idx, pudpage, i))
+ save = 1;
+ }
+ /* never free the HV pgd */
+ if (!save) {
+ free_page((unsigned long)pudpage);
+ pgd[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static struct lguest_pgd *find_pgd(struct lguest_guest_info *linfo, u64 cr3)
+{
+ struct lguest_pgd *pgdir;
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list)
+ if (!(pgdir->flags & LGUEST_PGD_MASTER_FL) && pgdir->cr3 == cr3)
+ break;
+
+ if (pgdir == list_entry(&linfo->pgd_list, struct lguest_pgd, list))
+ return NULL;
+
+ return pgdir;
+}
+
+static struct lguest_pud *find_pud(struct lguest_guest_info *linfo, u64 gpud)
+{
+ unsigned idx = HASH_PUD(gpud);
+ struct lguest_pud *pudir;
+
+ list_for_each_entry(pudir, &linfo->pud_hash[idx], list)
+ if (pudir->gpud == gpud)
+ break;
+
+ if (pudir == list_entry(&linfo->pud_hash[idx], struct lguest_pud, list))
+ return NULL;
+
+ return pudir;
+}
+
+static struct lguest_pmd *find_pmd(struct lguest_guest_info *linfo, u64 gpmd)
+{
+ unsigned idx = HASH_PMD(gpmd);
+ struct lguest_pmd *pmdir;
+
+ list_for_each_entry(pmdir, &linfo->pmd_hash[idx], list)
+ if (pmdir->gpmd == gpmd)
+ break;
+
+ if (pmdir == list_entry(&linfo->pmd_hash[idx], struct lguest_pmd, list))
+ return NULL;
+
+ return pmdir;
+}
+
+static struct lguest_pte *find_pte(struct lguest_guest_info *linfo, u64 gpte)
+{
+ unsigned idx = HASH_PTE(gpte);
+ struct lguest_pte *pte;
+
+ list_for_each_entry(pte, &linfo->pte_hash[idx], list)
+ if (pte->gpte == gpte)
+ break;
+
+ if (pte == list_entry(&linfo->pte_hash[idx], struct lguest_pte, list))
+ return NULL;
+
+ return pte;
+}
+
+static void __release_pte_hash(struct lguest_vcpu *vcpu, struct lguest_pte *pte)
+{
+ list_del(&pte->list);
+ kfree(pte);
+}
+
+static void __release_pmd_hash(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pte *pte;
+ int i;
+
+ list_del(&pmdir->list);
+
+ for (i=0; i < PTRS_PER_PMD; i++) {
+ u64 gpte;
+
+ gpte = lhread_u64(vcpu, pmdir->gpmd+i*sizeof(u64));
+ if (!gpte)
+ continue;
+ pte = find_pte(linfo, gpte & PTE_MASK);
+ if (!pte)
+ continue;
+ __release_pte_hash(vcpu, pte);
+ }
+
+ kfree(pmdir);
+}
+
+static void __release_pud_hash(struct lguest_vcpu *vcpu, struct lguest_pud *pudir)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pmd *pmdir;
+ int i;
+
+ list_del(&pudir->list);
+
+ for (i=0; i < PTRS_PER_PUD; i++) {
+ u64 gpmd;
+
+ gpmd = lhread_u64(vcpu, pudir->gpud+i*sizeof(u64));
+ if (!gpmd)
+ continue;
+ pmdir = find_pmd(linfo, gpmd & PTE_MASK);
+ if (!pmdir)
+ continue;
+ __release_pmd_hash(vcpu, pmdir);
+ }
+
+ kfree(pudir);
+}
+
+static struct lguest_pud *hash_pud(struct lguest_vcpu *vcpu, u64 gpud, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pudir = find_pud(linfo, gpud);
+ if (!pudir) {
+ /* FIXME: make this a slab? */
+ pudir = kzalloc(sizeof(*pudir), GFP_KERNEL);
+ if (!pudir)
+ goto out;
+ h = HASH_PUD(gpud);
+ list_add(&pudir->list, &linfo->pud_hash[h]);
+ pudir->pgdir = vcpu->pgdir;
+ pudir->gpud = gpud;
+ pudir->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pudir;
+}
+
+static struct lguest_pmd *hash_pmd(struct lguest_vcpu *vcpu, struct lguest_pud *pudir,
+ u64 gpmd, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pmd *pmdir;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pmdir = find_pmd(linfo, gpmd);
+ if (!pmdir) {
+ /* FIXME: make this a slab? */
+ pmdir = kzalloc(sizeof(*pmdir), GFP_KERNEL);
+ if (!pmdir)
+ goto out;
+ h = HASH_PMD(gpmd);
+ list_add(&pmdir->list, &linfo->pmd_hash[h]);
+ pmdir->pudir = pudir;
+ pmdir->gpmd = gpmd;
+ pmdir->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pmdir;
+}
+
+static struct lguest_pte *hash_pte(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir,
+ u64 gpte, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pte *pte;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pte = find_pte(linfo, gpte);
+ if (!pte) {
+ /* FIXME: make this a slab? */
+ pte = kzalloc(sizeof(*pte), GFP_KERNEL);
+ if (!pte)
+ goto out;
+ h = HASH_PTE(gpte);
+ list_add(&pte->list, &linfo->pte_hash[h]);
+ pte->pmdir = pmdir;
+ pte->gpte = gpte;
+ pte->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pte;
+}
+
+void guest_set_pte(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long vaddr,
+ unsigned long value)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ struct lguest_pte *ptedir;
+ unsigned long idx = (vaddr & (PAGE_SIZE-1)) / 8;
+ u64 base = vaddr & PTE_MASK;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 pte;
+ u64 *pudpage;
+ u64 *pmdpage;
+ u64 *ptepage;
+
+ mutex_lock(&linfo->page_lock);
+
+ ptedir = find_pte(linfo, base);
+ if (!ptedir)
+ goto out;
+
+ pmdir = ptedir->pmdir;
+ pudir = pmdir->pudir;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pmdir->idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[ptedir->idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ ptepage = __va(pmd & PTE_MASK);
+ pte = ptepage[idx];
+
+ if (!(pte & _PAGE_PRESENT))
+ goto out;
+
+ /* If the guest is trying to touch HV area, kill it! */
+ if (is_hv_page(pudir->idx, pmdir->idx, ptedir->idx, idx)) {
+ kill_guest_dump(vcpu, "guest trying to write to HV area\n");
+ goto out;
+ }
+
+ /* FIXME: perhaps we could set the pte now ? */
+
+ release_pte(ptepage[idx]);
+ __release_pte_hash(vcpu, ptedir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pmd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 *pudpage;
+ u64 *pmdpage;
+ int save;
+
+ if (idx >= PTRS_PER_PMD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pmdir = find_pmd(linfo, base);
+ if (!pmdir)
+ goto out;
+
+ pudir = pmdir->pudir;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pmdir->idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ save = release_pmd(pudir->idx, pmdir->idx, pmdpage, idx);
+ if (!save)
+ __release_pmd_hash(vcpu, pmdir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pud(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ u64 pgd;
+ u64 pud;
+ u64 *pudpage;
+ int save;
+
+ if (idx >= PTRS_PER_PUD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pudir = find_pud(linfo, base);
+ if (!pudir)
+ goto out;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ save = release_pud(pudir->idx, pudpage, idx);
+ if (!save)
+ __release_pud_hash(vcpu, pudir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pgd(struct lguest_vcpu *vcpu, unsigned long cr3,
+ unsigned long base, unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *pgdir;
+ struct lguest_pud *pudir;
+ u64 gpud;
+ u64 pgd;
+ u64 pud;
+ int save;
+
+ pgdir = vcpu->pgdir;
+
+ if (idx >= PTRS_PER_PGD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pgd = pgdir->pgdir[idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pud = pgd & PTE_MASK;
+
+ gpud = lhread_u64(vcpu, base + idx * sizeof(u64));
+ pudir = find_pud(linfo, gpud & PTE_MASK);
+ if (pudir)
+ __release_pud_hash(vcpu, pudir);
+ save = release_pgd(pgdir->pgdir, idx);
+
+ if (!save && idx >= guest_host_idx(linfo->page_offset >> (PGDIR_SHIFT-3))) {
+ /* All guest processes share the same kernel PML4Es */
+ /*
+ * So we only free the tree once, but then reset
+ * all the others.
+ */
+ list_for_each_entry(pgdir, &linfo->pgd_list, list) {
+ pgd = pgdir->pgdir[idx];
+ if (!(pgd & _PAGE_PRESENT))
+ continue;
+ BUG_ON((pgd & PTE_MASK) != pud);
+ pgdir->pgdir[idx] = 0;
+ }
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *pgdir;
+ unsigned long pgd_idx;
+ unsigned long pud_idx;
+ unsigned long pmd_idx;
+ unsigned long idx;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 pte;
+ u64 *pudpage;
+ u64 *pmdpage;
+ u64 *ptepage;
+
+ mutex_lock(&linfo->page_lock);
+
+ if (vaddr > linfo->page_offset)
+ pgdir = &linfo->kpgdir;
+ else
+ pgdir = find_pgd(linfo, cr3);
+
+ pgd_idx = pgd_index(vaddr);
+ pgd = pgdir->pgdir[pgd_idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pud_idx = pud_index(vaddr);
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pud_idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmd_idx = pmd_index(vaddr);
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[pmd_idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ idx = pte_index(vaddr);
+ ptepage = __va(pmd & PTE_MASK);
+ pte = ptepage[idx];
+
+ if (!(pte & _PAGE_PRESENT))
+ goto out;
+
+ /* If the guest is trying to touch HV area, kill it! */
+ if (is_hv_page(pgd_idx, pud_idx, pmd_idx, idx)) {
+ kill_guest_dump(vcpu, "guest trying to write to HV area\n");
+ goto out;
+ }
+
+ release_pte(ptepage[idx]);
+ /* FIXME: what about the hash?? */
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+static void flush_user_mappings(struct lguest_guest_info *linfo, struct lguest_pgd *pgdir)
+{
+ unsigned int i;
+ for (i = 0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(pgdir->pgdir, i);
+}
+
+static struct lguest_pgd *new_pgdir(struct lguest_guest_info *linfo, u64 cr3)
+{
+ unsigned int next;
+ unsigned int i;
+
+ next = random32() % LGUEST_PGDIRS;
+ for (i=(next+1) % LGUEST_PGDIRS; i != next; i = (i+1) % LGUEST_PGDIRS) {
+ if (linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL)
+ continue;
+ break;
+ }
+ BUG_ON(linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL);
+
+ next = i;
+
+ linfo->pgdirs[next].cr3 = cr3;
+ if (!linfo->pgdirs[next].pgdir) {
+ linfo->pgdirs[next].pgdir = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!linfo->pgdirs[next].pgdir)
+ return NULL;
+ /* all kernel pages are the same */
+ for (i=pgd_index(linfo->page_offset); i < PTRS_PER_PGD; i++)
+ linfo->pgdirs[next].pgdir[i] = linfo->kpgdir.pgdir[i];
+ } else {
+ BUG_ON(!(linfo->pgdirs[next].flags & LGUEST_PGD_LINK_FL));
+ /* Release all the non-kernel mappings. */
+ flush_user_mappings(linfo, &linfo->pgdirs[next]);
+ }
+
+ return &linfo->pgdirs[next];
+}
+
+void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *newpgdir;
+
+ mutex_lock(&linfo->page_lock);
+ newpgdir = find_pgd(linfo, pgtable);
+ if (vcpu->pgdir) {
+ if (!(--vcpu->pgdir->count))
+ vcpu->pgdir->flags &= ~(LGUEST_PGD_BUSY_FL);
+ }
+ if (!newpgdir)
+ newpgdir = new_pgdir(linfo, pgtable);
+ if (!newpgdir) {
+ kill_guest_dump(vcpu, "no more pgd's available!\n");
+ goto out;
+ }
+ vcpu->pgdir = newpgdir;
+ if (!vcpu->pgdir->count++)
+ vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL;
+ vcpu->regs.cr3 = __pa(vcpu->pgdir->pgdir);
+ if (!(vcpu->pgdir->flags & LGUEST_PGD_LINK_FL)) {
+ list_add(&vcpu->pgdir->list, &linfo->pgd_list);
+ vcpu->pgdir->flags |= LGUEST_PGD_LINK_FL;
+ }
+// pin_stack_pages(lg);
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+static void release_all_pagetables(struct lguest_guest_info *linfo)
+{
+ struct lguest_pgd *pgdir, *next;
+ int i;
+
+ /* We share the kernel pages, so do them once */
+ for (i=0; i < PTRS_PER_PGD; i++)
+ release_pgd(linfo->kpgdir.pgdir, i);
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list) {
+ if (pgdir->pgdir)
+ for (i=0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(pgdir->pgdir, i);
+ }
+ /* now release any pgdirs that are not busy */
+ list_for_each_entry_safe(pgdir, next, &linfo->pgd_list, list) {
+ if (!(pgdir->flags & LGUEST_PGD_BUSY_FL)) {
+ BUG_ON(pgdir->count);
+ pgdir->flags &= ~LGUEST_PGD_LINK_FL;
+ list_del(&pgdir->list);
+ free_page((u64)pgdir->pgdir);
+ pgdir->cr3 = 0;
+ pgdir->pgdir = NULL;
+ }
+ }
+}
+
+void guest_pagetable_clear_all(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+
+ mutex_lock(&linfo->page_lock);
+ release_all_pagetables(linfo);
+// pin_stack_pages(lg);
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_pagetable_flush_user(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ unsigned int i;
+
+ for (i = 0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(vcpu->pgdir->pgdir, i);
+}
+
+/* FIXME: We hold references to pages, which prevents them from being
+ swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return 0 if failed, 1 if good */
+static int page_in(struct lguest_vcpu *vcpu, u64 vaddr, pgprot_t prot)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ struct lguest_pte *ptedir;
+ u64 val;
+ u64 paddr;
+ u64 gpgd, gpud, gpmd, gpte;
+ u64 flags = pgprot_val(prot);
+ int write;
+ int ret;
+
+ gpgd = gtoplev(vcpu, vaddr);
+ val = lhread_u64(vcpu, gpgd);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pgd not present pgd:%llx vaddr:%llx val:%llx\n", gpgd,
vaddr, val);
+ return 0;
+ }
+
+ gpud = val & PTE_MASK;
+
+ pudir = hash_pud(vcpu, gpud, pgd_index(vaddr));
+ if (!pudir)
+ return 0; /* -ENOMEM */
+
+ if (vaddr >= linfo->page_offset)
+ pudir->flags |= LGUEST_PUD_KERNEL_FL;
+
+ gpud += pud_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpud);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pud not present?\n");
+ return 0;
+ }
+
+ gpmd = val & PTE_MASK;
+
+ pmdir = hash_pmd(vcpu, pudir, gpmd, pud_index(vaddr));
+ if (!pmdir)
+ return 0; /* -ENOMEM */
+
+ if (vaddr >= linfo->page_offset)
+ pmdir->flags |= LGUEST_PMD_KERNEL_FL;
+
+ gpmd += pmd_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpmd);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pmd not present?\n");
+ return 0;
+ }
+
+ /* The guest might have set up a 2M page */
+ if (val & (1<<7)) {
+ /* 2M pages */
+ /*
+ * Although the guest may have mapped this into 2M pages,
+ * we haven't and won't.  So we still need to find the 4K
+ * page position.
+ */
+ paddr = val & ~((1<<20)-1);
+ paddr += pte_index(vaddr) << PAGE_SHIFT;
+ paddr &= PTE_MASK; /* can still have the NX bit set */
+ } else {
+ /* 4K pages */
+ gpte = val & PTE_MASK;
+
+ ptedir = hash_pte(vcpu, pmdir, gpte, pmd_index(vaddr));
+ if (!ptedir)
+ return 0; /* -ENOMEM */
+
+ gpte += pte_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpte);
+ if (!(val & _PAGE_PRESENT) || ((flags & _PAGE_DIRTY) && !(val & _PAGE_RW))) {
+ printk("pte not present or dirty?\n");
+ return 0;
+ }
+ /* this is the guest's paddr */
+ paddr = val & PTE_MASK;
+
+ }
+
+ /* FIXME: check these values */
+
+ /*
+ * FIXME: if this isn't write, we lose the lguest_data when we do
+ * a put_user in the hypercall init.
+ */
+ write = 1; // val & _PAGE_DIRTY ? 1 : 0;
+
+ val = get_pfn(paddr >> PAGE_SHIFT, write);
+ if (val == (unsigned long)-1UL) {
+ printk("bad 1\n");
+ kill_guest_dump(vcpu, "page %llx not mapped", paddr);
+ return 0;
+ }
+
+ /* now we have the actual paddr */
+ val <<= PAGE_SHIFT;
+
+ ret = __lguest_map_guest_page(vcpu->guest, vcpu->pgdir->pgdir,
+ vaddr, val, __pgprot(flags));
+ if (ret < 0) {
+ printk("bad 2\n");
+ kill_guest_dump(vcpu, "can't map page");
+ return 0;
+ }
+ return 1;
+}
+
+int demand_page(struct lguest_vcpu *vcpu, u64 vaddr, int write)
+{
+ return page_in(vcpu, vaddr, (write ? PAGE_SHARED_EXEC : PAGE_COPY_EXEC));
+}
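+
+#if 0
+/*
+ * Illustrative sketch only (hypothetical caller): the host-side fault
+ * path is expected to use demand_page() roughly like this -- try to
+ * map the faulting guest address into the shadow page tables, and if
+ * that fails reflect the fault back into the guest.  The real caller
+ * lives in the trap handling code; the reflect_trap() arguments
+ * (trap number, has error code) are assumed here.
+ */
+static void example_guest_page_fault(struct lguest_vcpu *vcpu,
+				     u64 cr2, int errcode)
+{
+	if (!demand_page(vcpu, cr2, errcode & PF_WRITE))
+		reflect_trap(vcpu, 14, 1);	/* deliver #PF to the guest */
+}
+#endif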
+
+
+static pud_t *pud_from_index(unsigned long addr, unsigned index)
+{
+ pud_t *pud = (pud_t*)addr;
+
+ return &pud[index];
+}
+
+static pmd_t *pmd_from_index(unsigned long addr, unsigned index)
+{
+ pmd_t *pmd = (pmd_t*)addr;
+
+ return &pmd[index];
+}
+
+static pte_t *pte_from_index(unsigned long addr, unsigned index)
+{
+ pte_t *pte = (pte_t*)addr;
+
+ return &pte[index];
+}
+
+static int __lguest_map_guest_pte(pmd_t *pmd, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pte_t *pte;
+ unsigned index;
+
+ page = pmd_page_vaddr(*pmd);
+
+ index = pte_index(vaddr);
+ pte = pte_from_index(page, index);
+ if (pte_val(*pte) & _PAGE_PRESENT &&
+ pte_val(*pte) == pte_val(pfn_pte(paddr>>PAGE_SHIFT, prot)) ) {
+ printk("stange page faulting!\n");
+ printk("paddr=%lx (paddr)=%lx\n", paddr, *(unsigned long
*)__va(paddr));
+ printk("vaddr: %lx pte %x val: %lx\n", vaddr, index,
pte_val(*pte));
+ }
+
+ set_pte(pte, mk_pte(pfn_to_page(paddr >> PAGE_SHIFT), prot));
+
+ return 0;
+}
+
+static int __lguest_map_guest_pmd(pud_t *pud, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pmd_t *pmd;
+ unsigned index;
+
+ page = pud_page_vaddr(*pud);
+
+ index = pmd_index(vaddr);
+ pmd = pmd_from_index(page, index);
+ if (!pmd_val(*pmd)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pte(pmd, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_pud(pgd_t *pgd, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pud_t *pud;
+ unsigned index;
+
+ page = pgd_page_vaddr(*pgd);
+
+ index = pud_index(vaddr);
+ pud = pud_from_index(page, index);
+ if (!pud_val(*pud)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pmd(pud, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_pgd(u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot)
+{
+ unsigned long page;
+ unsigned index;
+ pgd_t *pgd;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ if (!pgd_val(*pgd)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pud(pgd, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot)
+{
+ int ret;
+
+ ret = __lguest_map_guest_pgd(cr3, vaddr, paddr, prot);
+ if (ret < 0)
+ return ret;
+
+ /* All guest kernel pages are the same */
+ if (vaddr >= linfo->page_offset) {
+ struct lguest_pgd *pgdir;
+ unsigned index;
+ pgd_t *pgd;
+ u64 val;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ val = pgd_val(*pgd);
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list)
+ pgdir->pgdir[index] = val;
+ }
+ return ret;
+}
+
+static void __lguest_unmap_page_pmd(pmd_t *pmd, unsigned long vaddr)
+{
+ pte_t *pte;
+ unsigned index;
+ unsigned long page;
+
+ page = pmd_page_vaddr(*pmd);
+
+ index = pte_index(vaddr);
+ pte = pte_from_index(page, index);
+ if (pte_val(*pte) & 1)
+ set_pte(pte, __pte(0));
+}
+
+static void __lguest_unmap_page_pud(pud_t *pud, unsigned long vaddr)
+{
+ pmd_t *pmd;
+ unsigned index;
+ unsigned long page;
+
+ page = pud_page_vaddr(*pud);
+
+ index = pmd_index(vaddr);
+ pmd = pmd_from_index(page, index);
+ if (pmd_val(*pmd) & 1)
+ __lguest_unmap_page_pmd(pmd, vaddr);
+}
+
+static void __lguest_unmap_page_pgd(pgd_t *pgd, unsigned long vaddr)
+{
+ pud_t *pud;
+ unsigned index;
+ unsigned long page;
+
+ page = pgd_page_vaddr(*pgd);
+
+ index = pud_index(vaddr);
+ pud = pud_from_index(page, index);
+ if (pud_val(*pud) & 1)
+ __lguest_unmap_page_pud(pud, vaddr);
+}
+
+static void __lguest_unmap_guest_page(struct lguest_guest_info *linfo,
+ unsigned long vaddr)
+{
+ pgd_t *pgd;
+ unsigned index;
+ u64 *cr3 = linfo->kpgdir.pgdir;
+
+ if (!cr3)
+ return;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ if (!(pgd_val(*pgd)&1))
+ return;
+
+ __lguest_unmap_page_pgd(pgd, vaddr);
+}
+
+int lguest_map_hv_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages,
+ pgprot_t *pprot)
+{
+ unsigned long page;
+ int i;
+ int ret;
+ pgprot_t prot;
+
+ ret = -ENOMEM;
+ for (i=0; i < pages; i++) {
+ /* now add the page we want */
+ page = hvvm_get_actual_phys((void*)vaddr+PAGE_SIZE*i, &prot);
+ if (!page)
+ goto failed;
+
+ if (pprot)
+ prot = *pprot;
+ ret = __lguest_map_guest_page(lguest, lguest->kpgdir.pgdir,
+ vaddr+PAGE_SIZE*i, page, prot);
+ if (ret < 0)
+ goto failed;
+ }
+ return 0;
+failed:
+ for (--i; i >= 0; i--)
+ __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i);
+ return ret;
+}
+
+/**
+ * lguest_mem_addr - retrieve page that's mapped from guest.
+ * @vcpu: lguest vcpu descriptor.
+ * @addr: address to get from the guest's address space.
+ *
+ * ONLY USE WHEN ALL ELSE FAILS!
+ */
+void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 addr)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ u64 *cr3 = linfo->kpgdir.pgdir;
+ unsigned long page;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned index = pgd_index(addr);
+
+ pgd = (pgd_t*)&cr3[index];
+ if (!(pgd_val(*pgd) & 1))
+ return NULL;
+
+ page = pgd_page_vaddr(*pgd);
+ index = pud_index(addr);
+ pud = pud_from_index(page, index);
+ if (!(pud_val(*pud) & 1))
+ return NULL;
+
+ page = pud_page_vaddr(*pud);
+ index = pmd_index(addr);
+ pmd = pmd_from_index(page, index);
+ if (!(pmd_val(*pmd) & 1))
+ return NULL;
+
+ page = pmd_page_vaddr(*pmd);
+ index = pte_index(addr);
+ pte = pte_from_index(page, index);
+ if (!(pte_val(*pte) & 1))
+ return NULL;
+
+ page = ((pte_val(*pte) & PAGE_MASK) + (addr & (PAGE_SIZE-1)));
+
+ return (void *)(page + PAGE_OFFSET);
+}
+
+void __lguest_free_guest_pmd(pmd_t *pmd)
+{
+ pte_t *pte;
+ unsigned long page;
+ int i;
+
+ page = pmd_page_vaddr(*pmd);
+
+ for (i=0; i < PTRS_PER_PTE; i++) {
+ pte = pte_from_index(page, i);
+ if (!(pte_val(*pte) & 1))
+ continue;
+ /* FIXME: do some checks here??? */
+ }
+ set_pmd(pmd, __pmd(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pud(pud_t *pud)
+{
+ pmd_t *pmd;
+ unsigned long page;
+ int i;
+
+ page = pud_page_vaddr(*pud);
+
+ for (i=0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_from_index(page, i);
+ if (!(pmd_val(*pmd) & 1))
+ continue;
+ __lguest_free_guest_pmd(pmd);
+ }
+ set_pud(pud, __pud(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pgd(pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long page;
+ int i;
+
+ page = pgd_page_vaddr(*pgd);
+
+ for (i=0; i < PTRS_PER_PUD; i++) {
+ pud = pud_from_index(page, i);
+ if (!(pud_val(*pud) & 1))
+ continue;
+ __lguest_free_guest_pud(pud);
+ }
+ set_pgd(pgd, __pgd(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pages(u64 *cr3)
+{
+ pgd_t *pgd;
+ int i;
+
+ if (!cr3)
+ return;
+
+ for (i=0; i < PTRS_PER_PGD; i++) {
+ pgd = (pgd_t*)&cr3[i];
+ if (!(pgd_val(*pgd) & 1))
+ continue;
+ __lguest_free_guest_pgd(pgd);
+ }
+ free_page((u64)cr3);
+}
+
+void __lguest_free_guest_upages(struct lguest_guest_info *linfo, u64 *cr3)
+{
+ pgd_t *pgd;
+ int i;
+
+ if (!cr3)
+ return;
+
+ for (i=0; i < pgd_index(linfo->page_offset); i++) {
+ pgd = (pgd_t*)&cr3[i];
+ if (!(pgd_val(*pgd) & 1))
+ continue;
+ __lguest_free_guest_pgd(pgd);
+ }
+ free_page((u64)cr3);
+}
+
+void lguest_free_guest_pages(struct lguest_guest_info *linfo)
+{
+ int i;
+
+ /* This frees all the guest kernel pages */
+ __lguest_free_guest_pages(linfo->kpgdir.pgdir);
+
+ for (i=0; i < LGUEST_PGDIRS; i++)
+ __lguest_free_guest_upages(linfo, linfo->pgdirs[i].pgdir);
+}
+
+void lguest_unmap_guest_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages)
+{
+ int i;
+
+ for (i=0; i < pages; i++)
+ __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i);
+}
+
+int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+
+ mutex_lock(&linfo->page_lock);
+ vcpu->pgdir = new_pgdir(linfo, linfo->kpgdir.cr3);
+ BUG_ON(!vcpu->pgdir);
+ if (!vcpu->pgdir->count++)
+ vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL;
+ list_add(&vcpu->pgdir->list, &linfo->pgd_list);
+ mutex_unlock(&linfo->page_lock);
+
+ return 0;
+}
+
+int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable)
+{
+ int ret = -ENOMEM;
+
+ linfo->kpgdir.cr3 = pgtable;
+ linfo->kpgdir.pgdir = (u64*)get_zeroed_page(GFP_KERNEL);
+ if (!linfo->kpgdir.pgdir)
+ return -ENOMEM;
+ linfo->kpgdir.flags |= LGUEST_PGD_BUSY_FL | LGUEST_PGD_MASTER_FL;
+ linfo->kpgdir.count = -1;
+
+ /*
+ * The list is used to update all the kernel page tables,
+ * so that they all have the same mappings.
+ */
+ list_add(&linfo->kpgdir.list, &linfo->pgd_list);
+
+ ret = lguest_map_hv_pages(linfo, lguest_hv_addr,
+ lguest_hv_pages, NULL);
+ if (ret < 0)
+ goto out;
+
+ return 0;
+ out:
+ free_page((u64)linfo->kpgdir.pgdir);
+
+ return ret;
+}
+
Index: work-pv/arch/x86_64/Makefile
==================================================================--- work-pv.orig/arch/x86_64/Makefile
+++ work-pv/arch/x86_64/Makefile
@@ -84,6 +84,7 @@ core-y += arch/x86_64/kernel/ \
core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
drivers-$(CONFIG_PCI) += arch/x86_64/pci/
drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
+drivers-$(CONFIG_LGUEST_GUEST) += arch/x86_64/lguest/
boot := arch/x86_64/boot
Index: work-pv/include/asm-x86_64/lguest.h
==================================================================--- /dev/null
+++ work-pv/include/asm-x86_64/lguest.h
@@ -0,0 +1,350 @@
+#ifndef _LGUEST_H_
+#define _LGUEST_H_
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include <linux/futex.h>
+#include <asm/lguest_user.h>
+
+/* XXX: Come up with better magic later on */
+#define LGUEST_MAGIC_R13 0x1
+#define LGUEST_MAGIC_R14 0x2
+#define LGUEST_MAGIC_R15 0x3
+
+#define LGUEST_MAX_VCPUS 64
+
+#define LGUEST_PGDS_PER_VCPU 8
+#define LGUEST_PGDIRS (LGUEST_MAX_VCPUS * LGUEST_PGDS_PER_VCPU)
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_CRASH 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_TIMER_READ 9
+#define LHCALL_TIMER_START 10
+#define LHCALL_HALT 11
+#define LHCALL_GET_WALLCLOCK 12
+#define LHCALL_BIND_DMA 13
+#define LHCALL_SEND_DMA 14
+#define LHCALL_FLUSH_TLB_SIG 15
+#define LHCALL_SET_PTE 16
+#define LHCALL_SET_PMD 17
+#define LHCALL_SET_PUD 18
+#define LHCALL_SET_PGD 19
+#define LHCALL_CLEAR_PTE 20
+#define LHCALL_CLEAR_PMD 21
+#define LHCALL_CLEAR_PUD 22
+#define LHCALL_CLEAR_PGD 23
+#define LHCALL_LOAD_TLS 24
+#define LHCALL_RDMSR 25
+#define LHCALL_WRMSR 26
+#define LHCALL_IRET 27
+
+#define LHCALL_PRINT 60
+#define LHCALL_DEBUG_ME 99
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ : "=a"(call)
+ : "a"(call), "d"(arg1), "b"(arg2),
"c"(arg3)
+ : "memory");
+ return call;
+}
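+
+/*
+ * Example: a synchronous hypercall, e.g. the timer start the guest
+ * issues from lguest_time_init():
+ *
+ *	hcall(LHCALL_TIMER_START, HZ, 0, 0);
+ *
+ * The call number travels in %rax (which also carries the return
+ * value); the three arguments go in %rdx, %rbx and %rcx, matching the
+ * constraints above.
+ */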
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+struct lguest_vcpu;
+
+struct lguest_dma_info
+{
+ struct list_head list;
+ union futex_key key;
+ unsigned long dmas;
+ u16 next_dma;
+ u16 num_dmas;
+ u32 guest_id;
+ u8 interrupt; /* 0 when not registered */
+};
+
+
+/* these must be powers of two */
+#define PUD_HASH_SIZE 256
+#define PMD_HASH_SIZE 256
+#define PTE_HASH_SIZE 256
+
+#define LGUEST_PGD_BUSY_FL (1<<0)
+#define LGUEST_PGD_MASTER_FL (1<<1)
+#define LGUEST_PGD_LINK_FL (1<<2)
+
+#define LGUEST_PUD_KERNEL_FL (1<<1)
+#define LGUEST_PMD_KERNEL_FL (1<<1)
+#define LGUEST_PTE_KERNEL_FL (1<<1)
+
+struct lguest_pgd {
+ struct list_head list;
+ u64 cr3;
+ u64 *pgdir;
+ u64 *user_pgdir;
+ unsigned count;
+ unsigned flags;
+};
+
+struct lguest_pud {
+ struct list_head list;
+ struct lguest_pgd *pgdir;
+ u64 gpud; /* guest pud */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_pmd {
+ struct list_head list;
+ struct lguest_pud *pudir;
+ u64 gpmd; /* guest pmd */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_pte {
+ struct list_head list;
+ struct lguest_pmd *pmdir;
+ u64 gpte; /* guest pte */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_guest_info {
+ struct lguest_data __user *lguest_data;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ u32 guest_id;
+ u64 pfn_limit;
+ u64 start_kernel_map;
+ u64 page_offset;
+
+ int halted;
+ /* does it really belong here? */
+ char *dead;
+#if 0
+ unsigned long noirq_start, noirq_end;
+#endif
+ int dma_is_pending;
+ unsigned long pending_dma; /* struct lguest_dma */
+ unsigned long pending_addr; /* address they're sending to */
+
+ struct lguest_pgd kpgdir;
+ struct lguest_pgd pgdirs[LGUEST_PGDIRS];
+ struct list_head pgd_list;
+ struct list_head pud_hash[PUD_HASH_SIZE];
+ struct list_head pmd_hash[PMD_HASH_SIZE];
+ struct list_head pte_hash[PTE_HASH_SIZE];
+ struct mutex page_lock;
+
+ int timer_on;
+ int last_timer;
+
+ /* Cached wakeup: we hold a reference to this task. */
+ struct task_struct *wake;
+
+ struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+ struct lguest_vcpu *vcpu[LGUEST_MAX_VCPUS];
+};
+
+/* copied from old lguest code. Not sure if it's the best layout for us */
+struct lguest_regs
+{
+ u64 cr3; /* 0 ( 0x0) */
+ /* Manually saved part. */
+ u64 rbx, rcx, rdx; /* 8 ( 0x8) */
+ u64 rsi, rdi, rbp; /* 32 (0x20) */
+ u64 r8, r9, r10, r11; /* 56 (0x38) */
+ u64 r12, r13, r14, r15; /* 88 (0x58) */
+ u64 rax; /* 120 (0x78) */
+ u64 fs; /* ds; */ /* 128 (0x80) */
+ u64 trapnum, errcode; /* 136 (0x88) */
+ /* Trap pushed part */
+ u64 rip; /* 152 (0x98) */
+ u64 cs; /* 160 (0xa0) */
+ u64 rflags; /* 168 (0xa8) */
+ u64 rsp; /* 176 (0xb0) */
+ u64 ss; /* Crappy Segment! */ /* 184 (0xb8) */
+ /* size = 192 (0xc0) */
+ char size[0];
+};
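+
+/*
+ * Illustrative only: assembly entry code presumably relies on the
+ * offsets noted in the comments above, so a compile-time check (e.g.
+ * in core.c) could assert the layout, assuming those offsets are the
+ * intended ABI:
+ *
+ *	BUILD_BUG_ON(offsetof(struct lguest_regs, rip) != 0x98);
+ *	BUILD_BUG_ON(sizeof(struct lguest_regs) != 0xc0);
+ */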
+
+struct lguest_tss_struct {
+ u32 reserved1;
+ u64 rsp0;
+ u64 rsp1;
+ u64 rsp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+ /* we don't let the guest have io privileges (yet) */
+ unsigned long io_bitmap[1];
+} __attribute__((packed)) ____cacheline_aligned;
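+
+/*
+ * Illustrative only: this mirrors the hardware TSS layout, so a sanity
+ * check along these lines could catch padding mistakes (0x66 is the
+ * architectural offset of the I/O map base):
+ *
+ *	BUILD_BUG_ON(offsetof(struct lguest_tss_struct, io_bitmap_base) != 0x66);
+ */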
+
+struct lguest_vcpu {
+ unsigned long host_syscall;
+ unsigned long guest_syscall;
+
+	/* regs + sizeof(regs) must be 16-byte aligned */
+ struct lguest_regs regs;
+
+ struct lguest_vcpu *vcpu; /* pointer to itself */
+ unsigned long debug;
+ unsigned long magic;
+ unsigned int id;
+ unsigned long host_stack;
+ unsigned long guest_stack;
+ unsigned long host_cr3;
+ unsigned long host_page;
+ struct desc_ptr host_gdt;
+ u16 host_gdt_buff[3];
+ struct desc_ptr host_idt;
+ u16 host_idt_buff[3];
+ unsigned long host_gdt_ptr;
+ /* Save rax on interrupts, it's used for iret hcall */
+ unsigned long rax;
+
+	/* Saved host gs base pointer */
+ unsigned long host_gs_a;
+ unsigned long host_gs_d;
+
+ /* save host process gs base pointer */
+ unsigned long host_proc_gs_a;
+ unsigned long host_proc_gs_d;
+
+ /* save guest gs base pointer */
+ unsigned long guest_gs_a;
+ unsigned long guest_gs_d;
+
+ /* used for guest calling swapgs */
+ unsigned long guest_gs_shadow_a;
+ unsigned long guest_gs_shadow_d;
+
+ struct lguest_pgd *pgdir;
+
+ struct desc_ptr gdt; /* address of the GDT at this vcpu */
+ u16 gdt_buff[3];
+ struct desc_struct gdt_table[GDT_ENTRIES];
+
+ struct desc_ptr idt; /* address of the IDT at this vcpu */
+ u16 idt_buff[3];
+ struct gate_struct idt_table[IDT_ENTRIES];
+
+ struct lguest_guest_info *guest;
+
+ struct lguest_tss_struct tss;
+
+ unsigned long ts;
+
+ /* host ist 7 - we use it to prevent the NMI race */
+ unsigned long host_ist;
+
+ /* only for those above FIRST_EXTERNAL_VECTOR */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+	/* These are general: we catch every possible interrupt */
+ DECLARE_BITMAP(interrupt_disabled, LGUEST_IRQS + FIRST_EXTERNAL_VECTOR);
+ unsigned long interrupt[LGUEST_IRQS + FIRST_EXTERNAL_VECTOR];
+
+ /* nmi trampoline storage */
+
+ struct lguest_regs nmi_regs;
+ unsigned long nmi_gs_a;
+ unsigned long nmi_gs_d;
+ unsigned long nmi_gs_shadow_a;
+ unsigned long nmi_gs_shadow_d;
+ struct desc_ptr nmi_gdt;
+ u16 nmi_gdt_buff[3];
+
+ /* set when we take an nmi */
+ unsigned long nmi_sw;
+
+ /* is this enough? */
+ char nmi_stack[1048];
+ char nmi_stack_end[0];
+ char gpf_stack[1048];
+ char gpf_stack_end[0];
+ char df_stack[1048];
+ char df_stack_end[0];
+};
+
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+ u32 eax, edx, ebx, ecx;
+};
+
+struct lguest_text_ptr {
+ unsigned long next; /* guest pa address of next pointer */
+ unsigned long start;
+ unsigned long end;
+};
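+
+/*
+ * Illustrative sketch only: given the guest-physical address of the
+ * first entry (the guest publishes it in lguest_data.text below), the
+ * host could walk the chain with lhread(), assuming the list ends with
+ * a zero next pointer ("first_gpa" and "linfo" are placeholder names):
+ *
+ *	struct lguest_text_ptr ent;
+ *	u64 gpa = first_gpa;
+ *
+ *	while (gpa) {
+ *		lhread(linfo, &ent, gpa, sizeof(ent));
+ *		(ent.start .. ent.end is one guest text range)
+ *		gpa = ent.next;
+ *	}
+ */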
+
+struct lguest_data
+{
+/* Fields which change during running: */
+ /* 512 == enabled (same as eflags) */
+ unsigned int irq_enabled;
+ /* Blocked interrupts. */
+ DECLARE_BITMAP(interrupts, LGUEST_IRQS);
+
+ /* Last (userspace) address we got a GPF & reloaded gs. */
+ unsigned int gs_gpf_eip;
+
+ /* Virtual address of page fault. */
+ unsigned long cr2;
+
+ /* Async hypercall ring. 0xFF == done, 0 == pending. */
+ u8 hcall_status[LHCALL_RING_SIZE];
+ struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+ /* Memory not to try to access */
+ unsigned long reserve_mem;
+ /* ID of this guest (used by network driver to set ethernet address) */
+ u32 guest_id;
+
+/* Fields initialized by the guest at boot: */
+ /* Instruction range to suppress interrupts even if enabled */
+#if 0
+ unsigned long noirq_start, noirq_end;
+#endif
+ unsigned long start_kernel_map;
+ unsigned long page_offset;
+ unsigned long text; /* pa address of lguest_text_ptr addresses */
+
+/* If the kernel has kallsyms, we can use it to do backtraces of a guest */
+ unsigned long kallsyms_addresses;
+ unsigned long kallsyms_num_syms;
+ unsigned long kallsyms_names;
+ unsigned long kallsyms_token_table;
+ unsigned long kallsyms_token_index;
+ unsigned long kallsyms_markers;
+
+ unsigned long return_address;
+};
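+
+/*
+ * Illustrative sketch only (the real guest-side async_hcall() is not in
+ * this header): given the ring fields above and a guest-side slot index
+ * "next_call" (a placeholder name), queueing an asynchronous hypercall
+ * might look roughly like this; if the slot is not yet marked 0xFF
+ * (done), the guest would first drain the ring with a synchronous
+ * hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0):
+ *
+ *	lguest_data.hcalls[next_call].eax = call;
+ *	lguest_data.hcalls[next_call].edx = arg1;
+ *	lguest_data.hcalls[next_call].ebx = arg2;
+ *	lguest_data.hcalls[next_call].ecx = arg3;
+ *	wmb();
+ *	lguest_data.hcall_status[next_call] = 0;
+ *	next_call = (next_call + 1) % LHCALL_RING_SIZE;
+ */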
+
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+int run_guest(struct lguest_vcpu *vcpu, char *__user user);
+
+#endif
--