This is the main core code for lguest64.
Have fun, and don't hurt the puppies!
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Glauber de Oliveira Costa <glommer@gmail.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Index: work-pv/arch/x86_64/lguest/Makefile
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/Makefile
@@ -0,0 +1,24 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST) += lg.o
+lg-objs := core.o hypervisor.o lguest_user.o hv_vm.o page_tables.o \
+hypercalls.o io.o interrupts_and_traps.o lguest_debug.o
+
+# hypercalls.o page_tables.o interrupts_and_traps.o \
+# segments.o io.o lguest_user.o
+
+# We use the top 4MB for the guest traps page, then the hypervisor.
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+##$(obj)/core.o: $(obj)/hypervisor-blob.c
+### This links the hypervisor in the right place and turns it into a C array.
+##$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+## @$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+##$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+## @od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
Index: work-pv/arch/x86_64/lguest/core.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/core.c
@@ -0,0 +1,379 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <asm/paravirt.h>
+#include <asm/hv_vm.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "lguest.h"
+
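+/* Translate a symbol in this module to its address inside the relocated
+ * hypervisor mapping (lguest_hv_offset is computed in init() below). */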
+#define HV_OFFSET(x) (typeof(x))((unsigned long)(x)+lguest_hv_offset)
+
+unsigned long lguest_hv_addr;
+unsigned long lguest_hv_offset;
+int lguest_hv_pages;
+
+int lguest_vcpu_pages;
+int lguest_vcpu_order;
+
+DEFINE_MUTEX(lguest_lock);
+
+int lguest_address_ok(const struct lguest_guest_info *linfo, u64 addr)
+{
+ return addr / PAGE_SIZE < linfo->pfn_limit;
+}
+
+u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u8 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u8 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u16 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u16 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr)
+{
+ u64 val = 0;
+
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || get_user(val, (u64 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+ return val;
+}
+
+void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val)
+{
+ if (!lguest_address_ok(vcpu->guest, addr)
+ || put_user(val, (u64 __user *)addr) != 0)
+ kill_guest_dump(vcpu, "bad read address %llx", addr);
+}
+
+void lhread(struct lguest_guest_info *linfo, void *b, u64 addr, unsigned bytes)
+{
+ if (addr + bytes < addr || !lguest_address_ok(linfo, addr+bytes)
+ || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ /* copy_from_user should do this, but as we rely on it... */
+ memset(b, 0, bytes);
+ kill_guest(linfo, "bad read address %llx len %u", addr, bytes);
+ }
+}
+
+void lhwrite(struct lguest_guest_info *linfo, u64 addr, const void *b,
+ unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(linfo, addr+bytes)
+ || copy_to_user((void __user *)addr, b, bytes) != 0)
+ kill_guest(linfo, "bad write address %llx len %u", addr, bytes);
+}
+
+static struct gate_struct *get_idt_table(void)
+{
+ struct desc_ptr idt;
+
+ asm("sidt %0":"=m" (idt));
+ return (void *)idt.address;
+}
+
+static int emulate_insn(struct lguest_vcpu *vcpu)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(vcpu->guest, vcpu->regs.rip);
+
+ if (vcpu->regs.rip < vcpu->guest->page_offset)
+ return 0;
+
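+	/*
+	 * Only port I/O (in/out) instructions are emulated here; anything
+	 * else kills the guest.  "in" reads return all ones since there is
+	 * no real device behind the port.
+	 */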
+ lhread(vcpu->guest, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ printk("physaddr + len: %lx\n",physaddr+insnlen);
+ lhread(vcpu->guest, &insn, physaddr + insnlen, 1);
+ }
+
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ printk("%llx: %02x unimplemented op\n", vcpu->regs.rip, insn);
+ kill_guest_dump(vcpu, "bad op");
+ return 0;
+ }
+ if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ vcpu->regs.rax = 0xFFFFFFFF;
+ else
+ vcpu->regs.rax |= (0xFFFF << shift);
+ }
+ vcpu->regs.rip += insnlen;
+ return 1;
+}
+
+#define SAVE_CR2(cr2) asm volatile ("movq %%cr2, %0" : "=r" (cr2))
+
+static void run_guest_once(struct lguest_vcpu *vcpu)
+{
+ void (*sw_guest)(struct lguest_vcpu *) = HV_OFFSET(&switch_to_guest);
+ unsigned long foo, bar;
+
+ BUG_ON(!vcpu->regs.cr3);
+ BUG_ON(!vcpu->pgdir);
+ BUG_ON(!vcpu->pgdir->pgdir);
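+	/*
+	 * Build an iretq frame on the host stack (SS, RSP, RFLAGS, CS; the
+	 * call pushes RIP) so the hypervisor's return_to_host path can iretq
+	 * straight back here.
+	 */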
+ asm volatile ("pushq %2; pushq %%rsp; pushfq; pushq %3; call *%6;"
+ /* The stack we pushed is off by 8, due to the previous pushq */
+ "addq $8, %%rsp"
+ : "=D"(foo), "=a"(bar)
+ : "i" (__KERNEL_DS), "i" (__KERNEL_CS),
"0" (vcpu), "1"(get_idt_table()),
+ "r" (sw_guest)
+ : "memory", "cc");
+}
+
+/* FIXME: don't know yet the right parameters to put here */
+int run_guest(struct lguest_vcpu *vcpu, char *__user user)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct desc_struct *gdt_table;
+ struct lguest_regs *regs = &vcpu->regs;
+ int ret;
+
+ unsigned long cr2 = 0;
+
+ while (!linfo->dead) {
+
+ if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+
+ if (lguest_debug) {
+ printk("hit trap %lld rip=", regs->trapnum);
+ lguest_print_address(vcpu, regs->rip);
+ printk("calling hypercall %d!\n", (unsigned)regs->rax);
+ }
+
+ regs->trapnum = 255;
+ hypercall(vcpu);
+ if (linfo->dead)
+ lguest_dump_vcpu_regs(vcpu);
+ }
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ maybe_do_interrupt(vcpu);
+
+ try_to_freeze();
+
+ if (linfo->dead)
+ return -1;
+
+
+ local_irq_disable();
+
+ /*
+ * keep a pointer to the host GDT tss address.
+ * Do this after disabling interrupts to make sure we
+ * are on the same CPU.
+ */
+ gdt_table = cpu_gdt(smp_processor_id());
+ vcpu->host_gdt_ptr = (unsigned long)gdt_table;
+ asm volatile ("sidt %0" : "=m"(vcpu->host_idt));
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ if (vcpu->ts)
+ stts();
+
+ run_guest_once(vcpu);
+
+ if (regs->trapnum == 14) {
+ SAVE_CR2(cr2);
+ lgdebug_print("faulting cr2: %lx\n",cr2);
+ }
+
+ else if (regs->trapnum == 7)
+ math_state_restore();
+
+ if (lguest_debug && regs->trapnum < 32) {
+ printk("hit trap %lld rip=", regs->trapnum);
+ lguest_print_address(vcpu, regs->rip);
+ }
+
+ local_irq_enable();
+
+ BUG_ON(regs->trapnum > 0xFF);
+
+ switch (regs->trapnum) {
+ case 7:
+ /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!vcpu->ts)
+ continue;
+ if (reflect_trap(vcpu, 7, 1))
+ continue;
+ kill_guest(vcpu->guest, "Unhandled FPU trap at %#llx",
+ regs->rip);
+ case 13:
+ if (!regs->errcode) {
+ ret = emulate_insn(vcpu);
+ if (ret < 0) {
+ lguest_dump_vcpu_regs(vcpu);
+ return ret;
+ }
+ continue;
+ }
+ kill_guest_dump(vcpu, "took gfp errcode %lld\n",
regs->errcode);
+ lguest_dump_vcpu_regs(vcpu);
+ break;
+ case 14:
+ if (demand_page(vcpu, cr2, regs->errcode & PF_WRITE))
+ continue;
+
+ if (lguest_debug) {
+ printk ("guest taking a page fault\n");
+ lguest_print_page_tables(vcpu->pgdir->pgdir);
+ }
+
+ /* inform guest on the current state of cr2 */
+ put_user(cr2, &linfo->lguest_data->cr2);
+ if (reflect_trap(vcpu, 14, 1))
+ continue;
+
+ lguest_dump_vcpu_regs(vcpu);
+ kill_guest_dump(vcpu, "unhandled page fault at %#lx"
+ " (rip=%#llx, errcode=%#llx)",
+ cr2, regs->rip, regs->errcode);
+ break;
+ case LGUEST_TRAP_ENTRY:
+ /* hypercall! */
+ continue;
+
+ case 32 ... 255:
+ cond_resched();
+ break;
+ default:
+ kill_guest_dump(vcpu, "bad trapnum %lld\n", regs->trapnum);
+ lguest_dump_vcpu_regs(vcpu);
+ return -EINVAL;
+ }
+ }
+ return -ENOENT;
+}
+
+extern long end_hyper_text;
+extern long start_hyper_text;
+
+static int __init init(void)
+{
+ unsigned long pages;
+ unsigned long hvaddr;
+#if 0
+ unsigned long lg_hcall = (unsigned long)HV_OFFSET(&hcall_teste);
+	unsigned long *lg_host_syscall =
+		(unsigned long *)HV_OFFSET(&host_syscall);
+#endif
+ int order;
+ int ret;
+
+ int i;
+ printk("start_hyper_text=%p\n",&start_hyper_text);
+ printk("end_hyper_text=%p\n",&end_hyper_text);
+ printk("default_idt_entries=%p\n",&_lguest_default_idt_entries);
+ printk("sizeof(vcpu)=%ld\n",sizeof(struct lguest_vcpu));
+
+ pages = (sizeof(struct lguest_vcpu)+(PAGE_SIZE-1))/PAGE_SIZE;
+ for (order = 0; (1<<order) < pages; order++)
+ ;
+
+ lguest_vcpu_pages = pages;
+ lguest_vcpu_order = order;
+
+ ret = paravirt_enabled();
+ if (ret < 0)
+ return -EPERM;
+
+ ret = lguest_device_init();
+ if (ret < 0) {
+ return ret;
+ }
+
+ pages = (unsigned long)&end_hyper_text -
+ (unsigned long)&start_hyper_text;
+ pages = (pages + (PAGE_SIZE - 1)) / PAGE_SIZE;
+
+ ret = hvvm_map_pages(&start_hyper_text, pages, &hvaddr);
+ if (ret < 0)
+ goto out;
+ printk("hvaddr=%lx\n",hvaddr);
+
+ lguest_hv_addr = hvaddr;
+ lguest_hv_pages = pages;
+ lguest_hv_offset = hvaddr - (unsigned long)&start_hyper_text;
+
+ /* Setup LGUEST segments on all cpus */
+ for_each_possible_cpu(i) {
+ struct desc_struct *gdt_table;
+ gdt_table = cpu_gdt(i);
+ gdt_table[GDT_ENTRY_HV_CS] = gdt_table[gdt_index(__KERNEL_CS)];
+ gdt_table[GDT_ENTRY_HV_DS] = gdt_table[gdt_index(__KERNEL_DS)];
+ }
+
+// rdmsrl(MSR_LSTAR, *lg_host_syscall);
+// wrmsrl(MSR_LSTAR, lg_hcall);
+ return 0;
+#if 0
+ ret = init_pagetables(hvaddr);
+ if (ret < 0)
+ goto out2;
+
+ return 0;
+
+out2:
+	hvvm_unmap_pages(hvaddr, pages);
+#endif
+out:
+ lguest_device_remove();
+ return ret;
+}
+
+
+static void __exit fini(void)
+{
+#if 0
+	unsigned long *lg_host_syscall =
+		(unsigned long *)HV_OFFSET(&host_syscall);
+
+ wrmsrl(MSR_LSTAR, *lg_host_syscall);
+#endif
+ hvvm_release_all();
+ lguest_device_remove();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
Index: work-pv/arch/x86_64/lguest/hypercalls.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/hypercalls.c
@@ -0,0 +1,324 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@redhat.com>
+ Steven Rostedt <srostedt@redhat.com>
+ Red Hat Inc
+ Standing on the shoulders of Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/msr.h>
+#include "lguest.h"
+
+/* FIXME: add this to Kconfig */
+#define CONFIG_LGUEST_DEBUG 1
+
+static void guest_set_stack(struct lguest_vcpu *vcpu,
+ u64 rsp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if (pages > 2)
+ kill_guest_dump(vcpu, "bad stack pages %u", pages);
+ vcpu->tss.rsp2 = rsp;
+ /* FIXME */
+// lg->stack_pages = pages;
+// pin_stack_pages(lg);
+}
+
+static DEFINE_MUTEX(hcall_print_lock);
+#define HCALL_PRINT_SIZ 1024
+static char hcall_print_buf[HCALL_PRINT_SIZ];
+
+/* Return true if DMA to host userspace now pending. */
+static int do_hcall(struct lguest_vcpu *vcpu)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ struct lguest_guest_info *linfo = vcpu->guest;
+ unsigned long val;
+	long ret;	/* signed: strncpy_from_user() can return -EFAULT */
+
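+	/* Hypercall ABI: rax holds the call number, rdx/rbx/rcx the arguments. */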
+ switch (regs->rax) {
+ case LHCALL_PRINT:
+ mutex_lock(&hcall_print_lock);
+ ret = strncpy_from_user(hcall_print_buf,
+ (const char __user *)regs->rdx,
+ HCALL_PRINT_SIZ);
+ if (ret < 0) {
+ kill_guest_dump(vcpu,
+ "bad hcall print pointer (%llx)",
+ regs->rdx);
+ mutex_unlock(&hcall_print_lock);
+ return -EFAULT;
+ }
+ printk("LGUEST: %s", hcall_print_buf);
+ mutex_unlock(&hcall_print_lock);
+
+ break;
+ case LHCALL_FLUSH_ASYNC:
+ break;
+ case LHCALL_LGUEST_INIT:
+ kill_guest_dump(vcpu, "already have lguest_data");
+ break;
+ case LHCALL_RDMSR:
+ switch (regs->rdx) {
+ case MSR_KERNEL_GS_BASE:
+ val = (vcpu->guest_gs_shadow_a & ((1UL << 32)-1)) |
+ (vcpu->guest_gs_shadow_d << 32);
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ case MSR_GS_BASE:
+ val = (vcpu->guest_gs_a & ((1UL << 32)-1)) |
+ (vcpu->guest_gs_d << 32);
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ case MSR_FS_BASE:
+ lhwrite_u64(vcpu, regs->rbx, 0);
+ break;
+ case MSR_EFER:
+ val = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+ lhwrite_u64(vcpu, regs->rbx, val);
+ break;
+ default:
+ kill_guest_dump(vcpu, "bad read of msr %llx\n", regs->rdx);
+ }
+ break;
+ case LHCALL_WRMSR:
+ switch (regs->rdx) {
+ case MSR_KERNEL_GS_BASE:
+ if ((regs->rbx >= HVVM_START) &&
+ (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) {
+ kill_guest_dump(vcpu,
+ "guest trying to set GS shadow base"
+ " in hypervisor");
+ break;
+ }
+ vcpu->guest_gs_shadow_a = regs->rbx;
+ vcpu->guest_gs_shadow_d = regs->rbx >> 32;
+ break;
+ case MSR_GS_BASE:
+ if ((regs->rbx >= HVVM_START) &&
+ (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) {
+ kill_guest_dump(vcpu,
+ "guest trying to set GS base in hypervisor");
+ break;
+ }
+ vcpu->guest_gs_a = regs->rbx;
+ vcpu->guest_gs_d = regs->rbx >> 32;
+ break;
+ case MSR_FS_BASE:
+ /* always zero */
+ break;
+ default:
+ kill_guest(linfo, "bad write to msr %llx\n", regs->rdx);
+ }
+ break;
+ case LHCALL_SET_PMD:
+ guest_set_pmd(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PUD:
+ guest_set_pud(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PGD:
+ guest_set_pgd(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+ case LHCALL_SET_PTE:
+ guest_set_pte(vcpu, regs->rdx, regs->rbx, regs->rcx);
+ break;
+
+ case LHCALL_FLUSH_TLB_SIG:
+ guest_flush_tlb_single(vcpu, regs->rdx, regs->rbx);
+ break;
+ case LHCALL_FLUSH_TLB:
+ if (regs->rdx)
+ guest_pagetable_clear_all(vcpu);
+ else
+ guest_pagetable_flush_user(vcpu);
+ break;
+
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(vcpu, regs->rdx);
+ break;
+
+ case LHCALL_CRASH: {
+ char msg[128];
+ lhread(linfo, msg, regs->rdx, sizeof(msg));
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest_dump(vcpu, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_LOAD_GDT:
+		/* i386 does a lot of GDT reloads; we don't.  We may want to
+		 * support this in the future for stranger code paths, but
+		 * not now. */
+ return -ENOSYS;
+
+ case LHCALL_LOAD_IDT_ENTRY: {
+		struct gate_struct g;
+ if (regs->rdx > 0xFF) {
+ kill_guest(linfo, "There are just 255 idt entries."
+ "What are you trying to do??");
+ }
+ lhread(linfo, &g, regs->rbx, sizeof(g));
+ load_guest_idt_entry(vcpu, regs->rdx,&g);
+ break;
+ }
+ case LHCALL_SET_STACK:
+ guest_set_stack(vcpu, regs->rdx, regs->rbx);
+ break;
+ case LHCALL_TS:
+ vcpu->ts = regs->rdx;
+ break;
+ case LHCALL_TIMER_READ: {
+ u32 now = jiffies;
+ mb();
+ regs->rax = now - linfo->last_timer;
+ linfo->last_timer = now;
+ break;
+ }
+ case LHCALL_TIMER_START:
+ linfo->timer_on = 1;
+ if (regs->rdx != HZ)
+ kill_guest(linfo, "Bad clock speed %lli", regs->rdx);
+ linfo->last_timer = jiffies;
+ break;
+ case LHCALL_HALT:
+ linfo->halted = 1;
+ break;
+ case LHCALL_GET_WALLCLOCK: {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ regs->rax = tv.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ printk("Binding dma....\n");
+ regs->rax = bind_dma(linfo, regs->rdx, regs->rbx,
+ regs->rcx >> 8, regs->rcx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ printk("Sending dma....\n");
+ return send_dma(linfo, regs->rdx, regs->rbx);
+
+ case LHCALL_IRET:
+ guest_iret(vcpu);
+ break;
+#if 0
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, (struct desc_struct __user*)regs->rdx);
+ break;
+#endif
+
+ case LHCALL_DEBUG_ME:
+#ifdef CONFIG_LGUEST_DEBUG
+ lguest_debug = regs->rdx;
+ printk("lguest debug turned %s\n", regs->rdx ? "on" :
"off");
+ lguest_dump_vcpu_regs(vcpu);
+#else
+ {
+ static int once = 1;
+ if (once) {
+ once = 0;
+ printk("lguest debug is disabled, to use this "
+ "please enable CONFIG_LGUEST_DEBUG\n");
+ }
+ }
+#endif
+ break;
+ default:
+ kill_guest(linfo, "Bad hypercall %lli\n", regs->rax);
+ }
+ return 0;
+}
+
+#if 0
+/* We always do queued calls before actual hypercall. */
+int do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i, pending;
+ u8 st[LHCALL_RING_SIZE];
+
+ if (!lg->lguest_data)
+ return 0;
+
+	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ get_user(regs.rax, &lg->lguest_data->hcalls[n].eax);
+ get_user(regs.rdx, &lg->lguest_data->hcalls[n].edx);
+ get_user(regs.rcx, &lg->lguest_data->hcalls[n].ecx);
+ get_user(regs.rbx, &lg->lguest_data->hcalls[n].ebx);
+		pending = do_hcall(lg, &regs);
+ put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+ if (pending)
+ return 1;
+ }
+
+ set_wakeup_process(lg, NULL);
+ return 0;
+}
+#endif
+
+int hypercall(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_regs *regs = &vcpu->regs;
+ int pending;
+
+ if (!linfo->lguest_data) {
+ if (regs->rax != LHCALL_LGUEST_INIT) {
+ kill_guest(linfo, "hypercall %lli before LGUEST_INIT",
+ regs->rax);
+ return 0;
+ }
+
+ linfo->lguest_data = (struct lguest_data __user *)regs->rdx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(linfo, (long)linfo->lguest_data)
+ || !lguest_address_ok(linfo, (long)(linfo->lguest_data+1))){
+ kill_guest(linfo, "bad guest page %p", linfo->lguest_data);
+ return 0;
+ }
+ /* update the page_offset info */
+ get_user(linfo->page_offset, &linfo->lguest_data->page_offset);
+		get_user(linfo->start_kernel_map,
+			 &linfo->lguest_data->start_kernel_map);
+
+#if 0
+ get_user(linfo->noirq_start, &linfo->lguest_data->noirq_start);
+ get_user(linfo->noirq_end, &linfo->lguest_data->noirq_end);
+#endif
+ /* We reserve the top pgd entry. */
+ put_user(4U*1024*1024, &linfo->lguest_data->reserve_mem);
+ put_user(linfo->guest_id, &linfo->lguest_data->guest_id);
+ return 0;
+ }
+ pending = do_hcall(vcpu);
+ //set_wakeup_process(vcpu, NULL);
+ return pending;
+}
Index: work-pv/arch/x86_64/lguest/hypervisor.S
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/hypervisor.S
@@ -0,0 +1,711 @@
+#include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include "lguest.h"
+
+.text
+.align PAGE_SIZE
+
+.global start_hyper_text
+ .type start_hyper_text, @function
+start_hyper_text:
+
+.global host_syscall
+host_syscall:
+ .quad 0
+
+#define PRINT_L(L) \
+ PRINT_OUT($L)
+
+#define PRINT_N(n) \
+ PRINT_OUT($'0' + $n)
+
+#define PRINT_HEX(n) \
+ mov n, %cl; \
+ and $0xf, %cl; \
+ cmp $0xa, %cl; \
+ jge 11f; \
+ add $'0', %cl; \
+ jmp 12f; \
+11: add $('a' - 10), %cl; \
+12: PRINT_OUT(%cl);
+
+#define PRINT_NUM_BX \
+9: PRINT_HEX(%bl); \
+ shr $4, %rbx; \
+ jne 9b
+
+#define PRINT_NUM(n) \
+ movl $n, %ebx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_LONG(n) \
+ movl n, %ebx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_QUAD(n) \
+ movq n, %rbx; \
+ PRINT_NUM_BX; \
+ PRINT_L('\n'); \
+ PRINT_L('\r')
+
+#define PRINT_X \
+ PRINT_L('x')
+
+#define PRINT_OUT(x) \
+ mov $0x3f8, %esi; \
+21: lea 0x5(%esi), %edx; \
+ movzwl %dx, %edx; \
+ in (%dx), %al; \
+ test $0x20,%al; \
+ jne 22f; \
+ pause; \
+ jmp 21b; \
+22: \
+ movl %esi, %edx; \
+ movzwl %dx, %edx; \
+ mov x, %al; \
+ out %al, (%dx); \
+31: \
+ lea 0x5(%esi), %edx; \
+ movzwl %dx, %edx; \
+ in (%dx), %al; \
+ test $0x20,%al; \
+ jne 32f; \
+ pause; \
+ jmp 31b; \
+32: \
+
+#define PUSH_NUM \
+ pushq %rcx; \
+ pushq %rbx;
+
+#define POP_NUM \
+	popq %rbx; \
+	popq %rcx;
+
+#define PUSH_PRINT \
+ pushq %rsi; \
+ pushq %rdx; \
+ pushq %rax; \
+
+#define POP_PRINT \
+ popq %rax; \
+ popq %rdx; \
+ popq %rsi;
+
+#define S_PRINT_NUM(_n) \
+ PUSH_PRINT; \
+ PUSH_NUM; \
+ PRINT_NUM(_n); \
+ POP_NUM; \
+ POP_PRINT;
+
+#define S_PRINT_L(x) \
+ PUSH_PRINT; \
+ PRINT_L(x); \
+ POP_PRINT;
+
+#define S_PRINT_QUAD(_n) \
+ PUSH_PRINT; \
+ PUSH_NUM; \
+ PRINT_QUAD(_n); \
+ POP_NUM; \
+ POP_PRINT;
+
+/* Save registers on the current stack. Both for
+ * switch_to_guest and switch_to_host usage */
+#define SAVE_REGS \
+ /* Save old guest/host state */ \
+ pushq %fs; \
+ pushq %rax; \
+ pushq %r15; \
+ pushq %r14; \
+ pushq %r13; \
+ pushq %r12; \
+ pushq %r11; \
+ pushq %r10; \
+ pushq %r9; \
+ pushq %r8; \
+ pushq %rbp; \
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rdx; \
+ pushq %rcx; \
+ pushq %rbx; \
+
+#define RESTORE_REGS \
+	/* Restore old guest/host state */ \
+ popq %rbx; \
+ popq %rcx; \
+ popq %rdx; \
+ popq %rsi; \
+ popq %rdi; \
+ popq %rbp; \
+ popq %r8; \
+ popq %r9; \
+ popq %r10; \
+ popq %r11; \
+ popq %r12; \
+ popq %r13; \
+ popq %r14; \
+ popq %r15; \
+ popq %rax; \
+ popq %fs; \
+
+.macro dump_stack_regs PREFIX
+ movq $LGUEST_REGS_size, %r10
+ xorq %r11, %r11
+1: PRINT_L(\PREFIX);
+ movq %r11, %rbx;
+ PRINT_NUM_BX;
+ PRINT_L(':'); PRINT_L(' ');
+ movq %rsp, %r9
+ addq %r11, %r9
+ PRINT_QUAD((%r9))
+ addq $8, %r11
+ cmp %r11, %r10
+ ja 1b
+.endm
+
+.macro debugme VCPU C
+ testb $1,LGUEST_VCPU_debug(\VCPU)
+ jz 23f
+ PRINT_L(\C)
+23:
+.endm
+
+
+#if 0
+.global hcall_teste
+ .type hcall_teste, @function
+hcall_teste:
+ cmpq $0, %gs:pda_vcpu
+ jne handle_guest
+ jmp *host_syscall
+handle_guest:
+ /* SAVE_REGS maybe it is not the macro we want */
+ #cmpq $__PAGE_OFFSET, %rcx;
+ jb do_hypercall
+ movq %gs:pda_vcpu, %rcx;
+ movq LGUEST_VCPU_guest_syscall(%rcx), %rcx;
+#endif
+
+/**
+ * DECODE_IDT parse a IDT descriptor to find the target.
+ * @IDT - The register that holds the IDT descriptor location
+ * @IDTWORD - The word version of the IDT register
+ * (ie. IDT is %rax, then IDTWORD must be %ax)
+ * @RESULT - The register to place the result in.
+ *
+ * This clobbers both IDT and RESULT regs.
+ */
+.macro DECODE_IDT IDT IDTWORD RESULT
+ movzwq (\IDT), \RESULT
+ movq 4(\IDT), \IDT
+ xorw \IDTWORD, \IDTWORD
+ orq \IDT, \RESULT
+.endm
+
+/**
+ * DECODE_SSEG parse a System Segment descriptor to find the target.
+ * @SEG - The register that holds the Sys Seg descriptor location
+ * @RESULT - The register to place the result in.
+ * @RW - The word version of the RESULT register
+ * @RH - The high byte version of the RESULT register
+ *
+ * (ie. RESULT is %rax, then RW must be %ax and RH must be %ah)
+ *
+ * This clobbers both SEG and RESULT regs.
+ */
+/* Why does Intel need to make everything so darn complex! */
+.macro DECODE_SSEG SEG RESULT RW RH
+ movzbq 7(\SEG), \RESULT
+ shl $16, \RESULT
+ movb 4(\SEG), \RH
+ shl $8, \RESULT
+ movw 2(\SEG), \RW
+ movq 8(\SEG), \SEG
+ shlq $32, \SEG
+ orq \SEG, \RESULT
+.endm
+
+.global switch_to_guest
+ .type switch_to_guest, @function
+/* rdi holds the pointer to vcpu.
+ * Interrupts are off on entry */
+switch_to_guest:
+ SAVE_REGS
+ /* save host stack */
+ movq %rsp, LGUEST_VCPU_host_stack(%rdi)
+ /* put the guest's stack in */
+ movq %rdi, %rsp
+ /* move the stack to point to guest regs */
+ addq $LGUEST_VCPU_regs, %rsp
+	/* filling this pointer signals that we're
+	 * running guest code */
+ movq %rdi, %gs:pda_vcpu
+
+ /* save this host's gdt and idt */
+ sgdt LGUEST_VCPU_host_gdt(%rdi)
+ sidt LGUEST_VCPU_host_idt(%rdi)
+
+ /* Save the gs base of the host (for nmi use) */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_host_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_host_gs_d(%rdi)
+
+ /* Save the host proc gs pointer */
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_host_proc_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_host_proc_gs_d(%rdi)
+
+ /* save the hosts page tables */
+ movq %cr3, %rax
+ movq %rax, LGUEST_VCPU_host_cr3(%rdi)
+
+ /*
+ * The NMI is a big PITA. There's no way to atomically load the
+ * TSS and IDT, so we can't just switch to the guest TSS without
+ * causing a race condition with the NMI.
+ * So we set up the host NMI stack in the guest TSS IST so that
+ * in case we take an NMI after loading our TR register
+ * but before we've updated the lidt, we still have a valid
+ * stack for the host nmi handler to use.
+ */
+ /* Load the guest gdt */
+ lgdt LGUEST_VCPU_gdt(%rdi)
+
+ /* Switch to guest's TSS (before loading the idt) */
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+
+ /* Set host's TSS to available (clear byte 5 bit 2). */
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax)
+
+ /* Now load the guest idt */
+ lidt LGUEST_VCPU_idt(%rdi)
+
+ /* Load the guest gs pointer */
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ movq LGUEST_VCPU_guest_gs_a(%rdi), %rax
+ movq LGUEST_VCPU_guest_gs_d(%rdi), %rdx
+ wrmsr
+
+ /* Flush the TLB */
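+	/* (toggling CR4.PGE, bit 7, also drops global TLB entries) */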
+ movq %cr4, %rax
+ movq %rax, %rbx
+ andb $~(1<<7), %al
+ movq %rax, %cr4
+ movq %rbx, %cr4
+
+ /* switch to the guests page tables */
+ popq %rax
+ movq %rax, %cr3
+
+ /* Now we swap gs to the guest gs base */
+ swapgs
+
+ /* restore guest registers */
+ RESTORE_REGS
+ /* skip trapnum and errorcode */
+ addq $0x10, %rsp;
+ iretq
+
+.macro print_trap VCPU REG
+ movq LGUEST_VCPU_trapnum(\VCPU), \REG
+ PRINT_QUAD(\REG)
+.endm
+
+#define SWITCH_TO_HOST \
+ SAVE_REGS; \
+ /* Save old pgdir */ \
+ movq %cr3, %rax; \
+ pushq %rax; \
+ /* Point rdi to the vcpu struct */ \
+ movq %rsp, %rdi; \
+ subq $LGUEST_VCPU_regs, %rdi; \
+ /* Load lguest ds segment for convenience. */ \
+ movq $(__HV_DS), %rax; \
+ movq %rax, %ds; \
+ /* Load the host page tables since that's where the gdt is */ \
+ movq LGUEST_VCPU_host_cr3(%rdi), %rax; \
+ movq %rax, %cr3; \
+ /* Switch to hosts gdt */ \
+ lgdt LGUEST_VCPU_host_gdt(%rdi); \
+ /* Set guest's TSS to available (clear byte 5 bit 2). */ \
+ movq LGUEST_VCPU_vcpu(%rdi), %rax; \
+ andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rax); \
+ /* Swap back to the host PDA */ \
+ swapgs; \
+ /* Put back the host process gs as well */ \
+ movl $MSR_KERNEL_GS_BASE,%ecx; \
+ movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ /* With PDA back now switch to host idt */ \
+ lidt LGUEST_VCPU_host_idt(%rdi); \
+ /* Switch to host's TSS. */ \
+ movl $(GDT_ENTRY_TSS*8), %eax; \
+ ltr %ax; \
+ /* put flag down. We're in the host again */ \
+ movq $0, %gs:pda_vcpu; \
+ movq LGUEST_VCPU_host_stack(%rdi), %rsp; \
+ RESTORE_REGS;
+
+/* Return to run_guest_once. */
+return_to_host:
+ SWITCH_TO_HOST
+ iretq
+
+deliver_to_host:
+ SWITCH_TO_HOST
+decode_idt_and_jmp:
+	/* Decode IDT and jump to the host's irq handler. When that does iret, it
+ * will return to run_guest_once. This is a feature. */
+ /* We told gcc we'd clobber rdi and rax... */
+ movq LGUEST_VCPU_trapnum(%rdi), %rdi
+ shl $1, %rdi
+ leaq (%rax,%rdi,8), %rdi
+ DECODE_IDT %rdi %di %rax
+ jmp *%rax
+
+#define NMI_SWITCH_TO_HOST \
+ /* Force switch to host, GDT, CR3, and both GS bases */ \
+ movl $MSR_GS_BASE, %ecx; \
+ movq LGUEST_VCPU_host_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ movl $MSR_KERNEL_GS_BASE, %ecx; \
+ movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \
+ movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \
+ wrmsr; \
+ movq LGUEST_VCPU_host_cr3(%rdi), %rax; \
+ movq %rax, %cr3; \
+ lgdt LGUEST_VCPU_host_gdt(%rdi);
+
+#if 0
+ /* Set host's TSS to available (clear byte 5 bit 2). */ \
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax; \
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax); \
+
+#endif
+
+/* Used by NMI only */
+/*
+ * The NMI is special because it uses its own stack, and needs to
+ * find the vcpu struct differently.
+ */
+nmi_trampoline:
+	/* the NMI has its own stack */
+ SAVE_REGS
+
+ /* save the cr3 */
+ movq %cr3, %rax
+ pushq %rax
+
+ /* get the vcpu struct */
+ movq %rsp, %rdi
+ subq $LGUEST_VCPU_nmi_stack_end, %rdi
+ addq $LGUEST_REGS_size, %rdi /* compensate for saved regs */
+
+ /* compensate if our end pointer is not 16 bytes aligned */
+ movq $LGUEST_VCPU_nmi_stack_end, %rax
+ andq $0xf, %rax;
+ addq %rax, %rdi;
+
+#if 0 /* in case we want to see where the nmi hit */
+ movq LGUEST_REGS_rip(%rsp), %r8
+ PRINT_L('R')
+ PRINT_QUAD(%r8)
+#endif
+
+ /*
+ * All guest descriptors are above the HV text code (here!)
+ * If we hit the suspected NMI race, our stack will be the host
+ * kernel stack, and that is in lower address space than the HV.
+ * So test to see if we are screwed. Don't do anything, but just
+ * report it!
+ */
+ call 1f
+1:
+ movq 0(%rsp), %rax /* put this RIP into rax */
+ /* If rsp >= rax; jmp */
+ cmpq %rax, %rsp
+ jge 1f
+
+	PRINT_L('H'); PRINT_L('i'); PRINT_L('t'); PRINT_L(' ');
+	PRINT_L('N'); PRINT_L('M'); PRINT_L('I'); PRINT_L(' ');
+	PRINT_L('r'); PRINT_L('a'); PRINT_L('c'); PRINT_L('e');
+ PRINT_L('\n'); PRINT_L('\r');
+
+1:
+ /* put back the stack from the previous call */
+ addq $8, %rsp
+
+ /*
+ * If we take another NMI while saving, we need to start over
+ * and try again. It's OK as long as we don't overwrite
+ * the saved material.
+ */
+ testq $1,LGUEST_VCPU_nmi_sw(%rdi)
+ jnz 1f
+
+ /* Copy the saved regs */
+ cld
+ movq %rdi, %rbx /* save off vcpu struct */
+ leaq LGUEST_VCPU_nmi_regs(%rdi), %rdi
+ leaq 0(%rsp), %rsi
+ movq $(LGUEST_REGS_size/8), %rcx
+ rep movsq
+
+ movq %rbx, %rdi /* put back vcpu struct */
+
+ /* save the gs base and shadow */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_nmi_gs_a(%rdi)
+ movq %rdx, LGUEST_VCPU_nmi_gs_d(%rdi)
+
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ rdmsr
+ movq %rax, LGUEST_VCPU_nmi_gs_shadow_a(%rdi)
+ movq %rdx, LGUEST_VCPU_nmi_gs_shadow_d(%rdi)
+
+ /* save the gdt */
+ sgdt LGUEST_VCPU_nmi_gdt(%rdi)
+
+ /* set the switch flag to prevent another nmi from saving over this */
+ movq $1, LGUEST_VCPU_nmi_sw(%rdi)
+
+1:
+
+#if 0
+ S_PRINT_L('N')
+ S_PRINT_L('M')
+ S_PRINT_L('I')
+ S_PRINT_L(' ')
+ S_PRINT_L('l')
+ S_PRINT_L('g')
+ S_PRINT_L('u')
+ S_PRINT_L('e')
+ S_PRINT_L('s')
+ S_PRINT_L('t')
+ S_PRINT_L('\n')
+ S_PRINT_L('\r')
+#endif
+ NMI_SWITCH_TO_HOST
+
+ /* we want to come back here on the iret */
+ pushq $__HV_DS
+ /* put the vcpu struct as our stack */
+ pushq %rdi
+ pushfq
+ pushq $__HV_CS
+
+ movq LGUEST_VCPU_host_idt_address(%rdi), %rax
+
+ /* Decode the location of the host NMI handler */
+ leaq 32(%rax), %rbx /* NMI IDT entry */
+ DECODE_IDT %rbx %bx %rax
+
+ callq *%rax
+
+ /*
+ * Back from NMI, stack points to vcpu, and we can take
+ * more NMIs at this point. That's OK, since we only
+ * want to get to the original NMI interruption. We
+ * just restart this restore process. Nested NMIs will
+ * not destroy this data while the nmi_sw flag is set.
+ */
+ movq %rsp, %rdi
+
+ /* restore the cr3 */
+ addq $(LGUEST_VCPU_nmi_regs), %rsp
+ popq %rax
+ movq %rax, %cr3
+
+ /* restore the gdt */
+ lgdt LGUEST_VCPU_nmi_gdt(%rdi)
+
+#if 0 /* print magic */
+ movq LGUEST_VCPU_magic(%rdi), %r8
+ movq $(6*8), %r9
+1: subq $8, %r9
+ movq %r9, %rcx
+ movq %r8, %rbx
+ shr %cl, %rbx
+ PRINT_OUT(%bl)
+ cmp $0, %r9
+ jne 1b
+#endif
+
+ /* make both host and guest TSS available */
+#if 1
+ movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax
+ andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax)
+
+ andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rdi)
+#endif
+
+#if 0
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+#endif
+
+ /* restore the gs base and shadow */
+ movl $MSR_GS_BASE, %ecx
+ movq LGUEST_VCPU_nmi_gs_a(%rdi), %rax
+ movq LGUEST_VCPU_nmi_gs_d(%rdi), %rdx
+ wrmsr
+
+ movl $MSR_KERNEL_GS_BASE, %ecx
+ movq LGUEST_VCPU_nmi_gs_shadow_a(%rdi), %rax
+ movq LGUEST_VCPU_nmi_gs_shadow_d(%rdi), %rdx
+ wrmsr
+
+#if 0
+ PRINT_L('O')
+ PRINT_L('U')
+ PRINT_L('T')
+ PRINT_L('\n')
+ PRINT_L('\r')
+#endif
+
+#if 1
+ /* Flush the TLB */
+ movq %cr4, %rax
+ movq %rax, %rbx
+ andb $~(1<<7), %al
+ movq %rax, %cr4
+ movq %rbx, %cr4
+#endif
+
+ RESTORE_REGS
+
+ /* skip trapnum and errcode */
+ addq $0x10, %rsp
+
+ /*
+ * Careful here, we can't modify any regs anymore
+ * but we now have to zero out the nmi switch flag.
+ * So all the work will be done by the stack pointer.
+ */
+
+#define SW_OFFSET (LGUEST_VCPU_nmi_sw - \
+ (LGUEST_VCPU_nmi_regs + LGUEST_REGS_rip))
+ movq $0, SW_OFFSET(%rsp)
+
+ /* use iret to get back to where we were. */
+ iretq;
+ /* Whoo, all done! */
+
+do_crash:
+ SAVE_REGS
+ movq %cr3, %rax;
+ pushq %rax;
+
+	PRINT_L('C');PRINT_L('r');PRINT_L('a');PRINT_L('s');
+	PRINT_L('h');PRINT_L('i');PRINT_L('n');PRINT_L('g');
+ PRINT_L('\n');PRINT_L('\r');
+
+ dump_stack_regs 'S'
+
+ addq $16, %rsp
+ sgdt 0(%rsp)
+
+	PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('L');PRINT_L(':');PRINT_L(' ');
+ xorq %r8, %r8
+ movw (%rsp), %r8
+ PRINT_QUAD(%r8)
+
+	PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('A');PRINT_L(':');PRINT_L(' ');
+ movq 2(%rsp), %r8
+ PRINT_QUAD(%r8)
+
+	PRINT_L('C');PRINT_L('S');PRINT_L(':');PRINT_L(' ');
+ movq %cs, %rbx
+ PRINT_QUAD(%rbx)
+ movq %cs, %rbx
+ andb $(~3), %bl
+ addq %rbx, %r8
+ movq 0(%r8), %r9
+
+	PRINT_L('S');PRINT_L('E');PRINT_L('G');PRINT_L(':');PRINT_L(' ');
+ PRINT_QUAD(%r9);
+ movq $1, %r8;
+ shl $47, %r8
+ andq %r9, %r8
+	PRINT_L('P');PRINT_L(' ');PRINT_L(':');PRINT_L(' ');
+ PRINT_QUAD(%r8);
+	PRINT_L('D');PRINT_L('P');PRINT_L(':');PRINT_L(' ');
+ movq $3, %r8;
+ shl $45, %r8
+ andq %r9, %r8
+ PRINT_QUAD(%r8);
+
+
+ /* just die! */
+2:
+ pause
+ jmp 2b
+
+
+/* Real hardware interrupts are delivered straight to the host. Others
+ cause us to return to run_guest_once so it can decide what to do. Note
+ that some of these are overridden by the guest to deliver directly, and
+ never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+ .data; .quad 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+/*	.if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) */
+ .if (\N < 10 || \N > 14) && (\N <> 17)
+ pushq $0
+ .endif
+ pushq $\N
+ jmp \TARGET
+ .align 8
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host. Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+.global _lguest_default_idt_entries
+_lguest_default_idt_entries:
+.text
+ IRQ_STUBS 0 1 return_to_host /* First two traps */
+ IRQ_STUB 2 nmi_trampoline /* NMI */
+ IRQ_STUBS 3 7 return_to_host /* Rest of traps */
+/*debug for now */
+ IRQ_STUB 8 do_crash /* Double fault! */
+#if 1
+ IRQ_STUBS 9 31 return_to_host /* Rest of traps */
+#else
+ IRQ_STUBS 9 12 return_to_host /* Rest of traps */
+ IRQ_STUB 13 do_crash /* GPF! */
+ IRQ_STUBS 14 31 return_to_host /* Rest of traps */
+#endif
+ IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
+ IRQ_STUB 128 return_to_host /* System call (overridden) */
+ IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
+
+ .align PAGE_SIZE
+.global end_hyper_text
+ .type end_hyper_text, @function
+end_hyper_text:
+ nop
Index: work-pv/arch/x86_64/lguest/interrupts_and_traps.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/interrupts_and_traps.c
@@ -0,0 +1,292 @@
+#include <linux/uaccess.h>
+#include <asm/lguest.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include "lguest.h"
+
+static void push_guest_stack(struct lguest_vcpu *vcpu,
+ u64 __user **gstack, u64 val)
+{
+ lhwrite_u64(vcpu, (u64)--(*gstack), val);
+}
+
+static u64 pop_guest_stack(struct lguest_vcpu *vcpu,
+ u64 __user **gstack)
+{
+ return lhread_u64(vcpu, (u64)(*gstack)++);
+}
+
+void guest_iret(struct lguest_vcpu *vcpu)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ u64 __user *gstack;
+ u64 cs;
+
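+	/* Unwind the frame reflect_trap() pushed: RIP, CS, RFLAGS, RSP, SS. */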
+ gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp);
+
+ regs->rip = pop_guest_stack(vcpu, &gstack);
+ cs = pop_guest_stack(vcpu, &gstack);
+
+ /* FIXME: determine if we are going back to userland */
+
+ regs->rflags = pop_guest_stack(vcpu, &gstack);
+ /* FIXME: check if this is correct */
+
+ if (regs->rflags & 512)
+ put_user(512, &vcpu->guest->lguest_data->irq_enabled);
+
+ /* make sure interrupts are enabled */
+ regs->rflags |= 512;
+
+ regs->rsp = pop_guest_stack(vcpu, &gstack);
+ regs->ss = pop_guest_stack(vcpu, &gstack);
+
+ /* restore the rax reg, since it was used by the guest to do the hcall */
+ regs->rax = vcpu->rax;
+
+ return;
+}
+
+int reflect_trap(struct lguest_vcpu *vcpu, int trap_num, int has_err)
+{
+ struct lguest_regs *regs = &vcpu->regs;
+ u64 __user *gstack;
+ u64 rflags, irq_enable;
+ u64 offset;
+
+ if (!vcpu->interrupt[trap_num]) {
+ printk("Not yet registered trap handler for %d\n",trap_num);
+ return 0;
+ }
+
+ /* save off the rax reg */
+ vcpu->rax = regs->rax;
+
+ /* FIXME: test for ring change and set up vcpu->tss.rsp2 ? */
+ gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp);
+ offset = regs->rsp - (u64)gstack;
+
+ /* We use IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when guest is running). */
+ get_user(irq_enable, &vcpu->guest->lguest_data->irq_enabled);
+ rflags = regs->rflags;
+ rflags |= (irq_enable & 512);
+
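+	/* Mimic what the hardware would push for a trap: SS, RSP, RFLAGS,
+	 * CS and RIP, plus the error code when there is one. */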
+ /* FIXME: Really? */
+ push_guest_stack(vcpu, &gstack, regs->ss);
+ push_guest_stack(vcpu, &gstack, regs->rsp);
+ push_guest_stack(vcpu, &gstack, rflags);
+ /* FIXME: determine if guest is in kernel or user mode */
+ push_guest_stack(vcpu, &gstack, __KERNEL_CS);
+ push_guest_stack(vcpu, &gstack, regs->rip);
+
+ if (has_err)
+ push_guest_stack(vcpu, &gstack, regs->errcode);
+
+ /* Change the real stack so hypervisor returns to trap handler */
+ regs->ss = __USER_DS;
+ regs->rsp = (u64)gstack + offset;
+ regs->cs = __USER_CS;
+ lgdebug_print("rip was at %p\n", (void*)regs->rip);
+ regs->rip = vcpu->interrupt[trap_num];
+
+ /* Disable interrupts for an interrupt gate. */
+ if (test_bit(trap_num, vcpu->interrupt_disabled))
+ put_user(0, &vcpu->guest->lguest_data->irq_enabled);
+ return 1;
+#if 0
+ /* Was ist da? */
+ /* GS will be neutered on way back to guest. */
+ put_user(0, &lg->lguest_data->gs_gpf_eip);
+#endif
+ return 0;
+}
+
+void maybe_do_interrupt(struct lguest_vcpu *vcpu)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+ if (!vcpu->guest->lguest_data)
+ return;
+
+ /* If timer has changed, set timer interrupt. */
+	if (vcpu->guest->timer_on && jiffies != vcpu->guest->last_timer)
+ set_bit(0, vcpu->irqs_pending);
+
+ /* Mask out any interrupts they have blocked. */
+ if (copy_from_user(&irqs, vcpu->guest->lguest_data->interrupts,
+ sizeof(irqs)))
+ return;
+
+ bitmap_andnot(irqs, vcpu->irqs_pending, irqs, LGUEST_IRQS);
+
+ irq = find_first_bit(irqs, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (vcpu->guest->halted) {
+ /* Re-enable interrupts. */
+ put_user(512, &vcpu->guest->lguest_data->irq_enabled);
+ vcpu->guest->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ get_user(irq_enabled, &vcpu->guest->lguest_data->irq_enabled);
+ if (!irq_enabled) {
+ lgdebug_print("Irqs are disabled\n");
+ return;
+ }
+ }
+
+ if (vcpu->interrupt[irq + FIRST_EXTERNAL_VECTOR] != 0) {
+ lgdebug_print("Reflect trap: %x\n",irq+FIRST_EXTERNAL_VECTOR);
+ clear_bit(irq, vcpu->irqs_pending);
+ reflect_trap(vcpu, irq+FIRST_EXTERNAL_VECTOR, 0);
+ }
+ else {
+ lgdebug_print("out without doing it!!\n");
+ }
+
+}
+
+void check_bug_kill(struct lguest_vcpu *vcpu)
+{
+/* FIXME: Use rostedt magic kallsyms */
+#if 0
+#ifdef CONFIG_BUG
+ u32 eip = lg->state->regs.rip - PAGE_OFFSET;
+ u16 insn;
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.rip < PAGE_OFFSET)
+ return;
+ lhread(lg, &insn, eip, sizeof(insn));
+ if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ u16 l;
+ u32 f;
+ char file[128];
+ lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+ lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+ lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+ file[sizeof(file)-1] = 0;
+ kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+ kill_guest(lg, "BUG() at %#x", eip);
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+ }
+#endif /* CONFIG_BUG */
+#endif
+}
+
+static void copy_trap(struct lguest_vcpu *vcpu,
+ unsigned int trap_num,
+ const struct gate_struct *desc)
+{
+
+ /* Not present? */
+ if (!desc->p) {
+ vcpu->interrupt[trap_num] = 0;
+ return;
+ }
+
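+	/* Type 0xE is an interrupt gate (IRQs disabled on entry), 0xF a trap gate. */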
+ switch (desc->type) {
+ case 0xE:
+ set_bit(trap_num,vcpu->interrupt_disabled);
+ break;
+ case 0xF:
+ clear_bit(trap_num,vcpu->interrupt_disabled);
+ break;
+ default:
+ kill_guest(vcpu->guest, "bad IDT type %i for irq %x",
+ desc->type,trap_num);
+ }
+
+ vcpu->interrupt[trap_num] = GATE_ADDRESS((*desc));
+}
+
+#if 0
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[]
+= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+ 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+ /* movl 0, %ss:lguest_data.gs_gpf_eip */
+ 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+ u32 addr, off;
+
+ off = sizeof(tramp)*i;
+ memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+ /* 0 is to be placed in lguest_data.gs_gpf_eip. */
+ addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+ memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+ /* Address is relative to where end of jmp will be. */
+ addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+ memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+ return (-4*1024*1024) + off;
+}
+
+#endif
+/* We bounce through the trap page, for two reasons: firstly, we need
+ the interrupt destination always mapped, to avoid double faults,
+ secondly we want to reload %gs to make it innocuous on entering kernel.
+ */
+/* The guest kernel will not be mapped, so we'd better use another scheme. */
+static void setup_idt(struct lguest_vcpu *vcpu,
+ unsigned int i,
+ const struct gate_struct *desc)
+{
+ u64 taddr;
+
+ /* Not present? */
+ if (!desc->p) {
+ /* FIXME: When we need this, we'll know... */
+ if (vcpu->idt_table[i].p)
+ kill_guest(vcpu->guest, "trying to remove irq line %i:"
+ "removing interrupts not supported",i);
+ return;
+ }
+
+#if 0
+ /* We could reflect and disable interrupts, but guest can do itself. */
+ if (desc->type != 0xF)
+ kill_guest(vcpu->guest, "bad direct IDT %i type 0x%x",
+ i, desc->type);
+#endif
+
+ /* FIXME: We may need to fix segment? */
+	_lguest_set_gate(&vcpu->idt_table[i], desc->type, GUEST_DPL, taddr, 0);
+#if 0
+	taddr = setup_trampoline(lg, i,
+				 (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+#endif
+}
+
+void load_guest_idt_entry(struct lguest_vcpu *vcpu, unsigned int i,
+ struct gate_struct *d)
+{
+ switch (i) {
+ /* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+ case 2:
+ case 8:
+ case 14:
+ case 15:
+ case LGUEST_TRAP_ENTRY:
+ /* FIXME: We should handle debug and int3 */
+ case 1:
+ case 3:
+ return;
+ default:
+ copy_trap(vcpu,i,d);
+ }
+}
+
Index: work-pv/arch/x86_64/lguest/lguest.c
===================================================================
--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest.c
@@ -0,0 +1,705 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@redhat.com>
+ * Steven Rostedt <srostedt@redhat.com>
+ * Red Hat Inc
+ * Standing on the shoulders of Rusty Russell.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/pfn.h>
+#include <asm/bootsetup.h>
+#include <asm/paravirt.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/asm-offsets.h>
+#include <asm/mce.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot =
+	(void *)__START_KERNEL_map;
+static struct lguest_text_ptr code_stack[2];
+extern int acpi_disabled;
+extern int acpi_ht;
+
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
+
+static DEFINE_SPINLOCK(hcall_print_lock);
+#define HCALL_BUFF_SIZ 1024
+static char hcall_buff[HCALL_BUFF_SIZ];
+
+/* Set to true when the lguest_init is called. */
+static int lguest_paravirt;
+
+struct lguest_print_ops {
+ void (*vprint)(const char *fmt, va_list ap);
+} *lguest_pops;
+
+void lguest_vprint(const char *fmt, va_list ap)
+{
+ if (lguest_pops)
+ lguest_pops->vprint(fmt, ap);
+}
+
+void lguest_print(const char *fmt, ...)
+{
+ va_list ap;
+
+ /* irq save? */
+ va_start(ap, fmt);
+ lguest_vprint(fmt, ap);
+ va_end(ap);
+}
+
+static void __lguest_vprint(const char *fmt, va_list ap)
+{
+ /* need to do this with interrupts disabled */
+// spin_lock(&hcall_print_lock);
+ vsnprintf(hcall_buff, HCALL_BUFF_SIZ-1, fmt, ap);
+
+ hcall(LHCALL_PRINT, __pa(hcall_buff), 0, 0);
+// spin_unlock(&hcall_print_lock);
+}
+
+struct lguest_print_ops local_pops = {__lguest_vprint };
+
+void lguest_set_debug(int d)
+{
+ if (lguest_paravirt)
+ hcall(LHCALL_DEBUG_ME, d, 0, 0);
+}
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* Note: This code assumes we're uniprocessor. */
+ static unsigned int next_call;
+ unsigned long flags;
+
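+	/* A slot whose hcall_status is 0xFF is free; writing 0 marks the
+	 * queued call as ready for the host to consume. */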
+ local_irq_save(flags);
+ if (lguest_data.hcall_status[next_call] != 0xFF) {
+ /* Table full, so do normal hcall which will flush table. */
+ hcall(call, arg1, arg2, arg3);
+ } else {
+ lguest_data.hcalls[next_call].eax = call;
+ lguest_data.hcalls[next_call].edx = arg1;
+ lguest_data.hcalls[next_call].ebx = arg2;
+ lguest_data.hcalls[next_call].ecx = arg3;
+ wmb();
+ lguest_data.hcall_status[next_call] = 0;
+ if (++next_call == LHCALL_RING_SIZE)
+ next_call = 0;
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE /* Not in 2.6.20. */
+static int lazy_mode;
+static void lguest_lazy_mode(int mode)
+{
+ lazy_mode = mode;
+ if (mode == PARAVIRT_LAZY_NONE)
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
+{
+ if (lazy_mode == PARAVIRT_LAZY_NONE)
+ hcall(call, arg1, arg2, arg3);
+ else
+ async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long save_fl(void)
+{
+ return lguest_data.irq_enabled;
+}
+
+static void restore_fl(unsigned long flags)
+{
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = flags;
+}
+
+static void irq_disable(void)
+{
+ lguest_data.irq_enabled = 0;
+}
+
+static void irq_enable(void)
+{
+ /* Linux i386 code expects bit 9 set. */
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = 512;
+}
+
+static void lguest_load_gdt(const struct desc_ptr *desc)
+{
+ /* Does nothing. HV should have done everything for us */
+}
+
+static void lguest_load_idt(const struct desc_ptr *desc)
+{
+ unsigned int i;
+ struct gate_struct *idt = (void *)desc->address;
+
+ for (i = 0; i < (desc->size+1)/16; i++) {
+ hcall(LHCALL_LOAD_IDT_ENTRY, i, __pa((u64)&idt[i]), 0);
+ }
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+ hcall(LHCALL_CRASH, __pa(p), 0, 0);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+ .notifier_call = lguest_panic
+};
+
+static void lguest_memory_setup(void)
+{
+ /* We do this here because lockcheck barfs if before start_kernel */
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+}
+
+static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int is_feature = (*eax == 1);
+
+ native_cpuid(eax, ebx, ecx, edx);
+ if (is_feature) {
+ unsigned long *excap = (unsigned long *)ecx,
+ *features = (unsigned long *)edx;
+ /* Hypervisor needs to know when we flush kernel pages. */
+ set_bit(X86_FEATURE_PGE, features);
+ /* We don't have any features! */
+ clear_bit(X86_FEATURE_VME, features);
+ clear_bit(X86_FEATURE_DE, features);
+ clear_bit(X86_FEATURE_PSE, features);
+ clear_bit(X86_FEATURE_PAE, features);
+ clear_bit(X86_FEATURE_SEP, features);
+ clear_bit(X86_FEATURE_APIC, features);
+ clear_bit(X86_FEATURE_MTRR, features);
+ /* No MWAIT, either */
+ clear_bit(3, excap);
+ }
+}
+
+static unsigned long current_cr3;
+static void lguest_write_cr3(unsigned long cr3)
+{
+ hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+ current_cr3 = cr3;
+}
+
+static u64 lguest_read_msr(unsigned int msr, int *err)
+{
+ unsigned long val;
+
+ *err = 0;
+ hcall(LHCALL_RDMSR, msr, __pa(&val), 0);
+ return val;
+}
+
+static int lguest_write_msr(unsigned int msr, u64 val)
+{
+ hcall(LHCALL_WRMSR, msr, (unsigned long)val, 0);
+ return val;
+}
+
+static u64 lguest_read_tsc(void)
+{
+ /* we don't use natives, otherwise they can recurse */
+ unsigned int a,b;
+ asm volatile("rdtsc" : "=a" (a), "=d" (b));
+ return a | (unsigned long)(b) << 32 ;
+}
+
+static void lguest_flush_tlb(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void lguest_flush_tlb_kernel(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void lguest_flush_tlb_single(u64 addr)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB_SIG, current_cr3, addr, 0);
+}
+
+static void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ hcall(LHCALL_SET_PTE, current_cr3, __pa(ptep), pte_val(pteval));
+}
+
+static void lguest_set_pte_at(struct mm_struct *mm, u64 addr, pte_t *ptep,
+			      pte_t pteval)
+{
+ *ptep = pteval;
+ lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), __pa(ptep), pte_val(pteval));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+ lazy_hcall(LHCALL_SET_PMD, current_cr3, __pa(pmdp)&PTE_MASK,
+ (__pa(pmdp)&(PAGE_SIZE-1))/8);
+}
+
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+ *pudp = pudval;
+ lazy_hcall(LHCALL_SET_PUD, current_cr3, __pa(pudp)&PTE_MASK,
+ (__pa(pudp)&(PAGE_SIZE-1))/8);
+}
+
+static void lguest_set_pgd(pgd_t *pgdp, pgd_t pgdval)
+{
+ *pgdp = pgdval;
+ lazy_hcall(LHCALL_SET_PGD, current_cr3, __pa(pgdp)&PTE_MASK,
+ (__pa(pgdp)&(PAGE_SIZE-1))/8);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void lguest_apic_write(unsigned long reg, unsigned int v)
+{
+}
+
+static unsigned int lguest_apic_read(unsigned long reg)
+{
+ return 0;
+}
+#endif
+
+#if 0
+/* We move eflags word to lguest_data.irq_enabled to restore interrupt
+ state. For page faults, gpfs and virtual interrupts, the
+ hypervisor has saved eflags manually, otherwise it was delivered
+ directly and so eflags reflects the real machine IF state,
+ ie. interrupts on. Since the kernel always dies if it takes such a
+ trap with interrupts disabled anyway, turning interrupts back on
+ unconditionally here is OK. */
+asm("lguest_iret:"
+ " pushq %rax;"
+ " movq 0x18(%rsp), %rax;"
+ "lguest_noirq_start:;"
+ " movq %rax,
lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+ " popq %rax;"
+ " iretq;"
+ "lguest_noirq_end:");
+extern char lguest_noirq_start[], lguest_noirq_end[];
+#endif
+
+extern void lguest_iret(void);
+asm("lguest_iret:"
+ " movq $" __stringify(LHCALL_IRET) ", %rax\n"
+ " int $" __stringify(LGUEST_TRAP_ENTRY) );
+
+
+static void lguest_load_rsp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ lazy_hcall(LHCALL_SET_STACK, thread->rsp0, THREAD_SIZE/PAGE_SIZE, 0);
+}
+
+static void lguest_load_tr_desc(void)
+{
+}
+
+static void lguest_set_ldt(const void *addr, unsigned entries)
+{
+ /* FIXME: Implement. */
+ BUG_ON(entries);
+}
+
+static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static void lguest_set_debugreg(int regno, unsigned long value)
+{
+ /* FIXME: Implement */
+}
+
+static unsigned int lguest_cr0;
+static void lguest_clts(void)
+{
+ lazy_hcall(LHCALL_TS, 0, 0, 0);
+ lguest_cr0 &= ~8U;
+}
+
+static unsigned long lguest_read_cr0(void)
+{
+ return lguest_cr0;
+}
+
+static void lguest_write_cr0(unsigned long val)
+{
+ hcall(LHCALL_TS, val & 8, 0, 0);
+ lguest_cr0 = val;
+}
+
+static unsigned long lguest_read_cr2(void)
+{
+ return lguest_data.cr2;
+}
+
+static unsigned long lguest_read_cr3(void)
+{
+ return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static unsigned long lguest_read_cr4(void)
+{
+ return 0;
+}
+
+static void lguest_write_cr4(unsigned long val)
+{
+}
+
+static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+ do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+ update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+ set_bit(irq, lguest_data.interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+ clear_bit(irq, lguest_data.interrupts);
+ /* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+ .name = "lguest",
+ .mask = disable_lguest_irq,
+ .mask_ack = disable_lguest_irq,
+ .unmask = enable_lguest_irq,
+};
+
+static void lguest_time_init(void)
+{
+ set_irq_handler(0, lguest_time_irq);
+ hcall(LHCALL_TIMER_START,HZ,0,0);
+}
+
+static void lguest_ebda_info(unsigned *addr, unsigned *size)
+{
+ *addr = *size = 0;
+}
+
+/* From i8259.c */
+extern void (*interrupt[])(void);
+static void __init lguest_init_IRQ(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_IRQS; i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ /* FIXTHEM: We should be doing it in a lot of other places */
+ if (vector != IA32_SYSCALL_VECTOR) {
+ printk("Setting vector %x as %p\n",vector, &interrupt[i]);
+ set_intr_gate(vector, interrupt[i]);
+ set_irq_chip_and_handler(i, &lguest_irq_controller,
+ handle_level_irq);
+ hcall(LHCALL_LOAD_IDT_ENTRY, vector, __pa((u64)&idt_table[vector]), 0);
+ }
+ }
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low,
+					 u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
+static void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ /* FIXME: Allow this. */
+ BUG();
+}
+
+static void lguest_write_gdt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static void lguest_write_idt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_CRASH, 0, 0, 0);
+ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name
":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+ const char *start, *end;
+} lguest_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+ [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+ [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+ [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+ [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+ return len;
+
+ insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+ /* Similarly if we can't fit the replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(insns, lguest_insns[type].start, insn_len);
+ if (type == PARAVIRT_INTERRUPT_RETURN) {
+ /* Jumps are relative. */
+ u64 off = (u64)lguest_iret - ((u64)insns + insn_len);
+ memcpy(insns+1, &off, sizeof(off));
+ }
+ return insn_len;
+}
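+
+#if 0
+/*
+ * Illustrative sketch only, not part of this patch: roughly how a
+ * generic patcher is expected to drive lguest_patch() above.  The
+ * call site pointer, its length and the nop padding are hypothetical;
+ * the real caller is the paravirt patching code.
+ */
+static void example_patch_site(u8 type, void *site, unsigned len)
+{
+	/* Copy in the inline replacement (e.g. start_cli..end_cli for
+	 * PARAVIRT_IRQ_DISABLE) if it fits, else keep the indirect call. */
+	unsigned used = lguest_patch(type, 0, site, len);
+
+	/* Pad the remainder so execution falls through cleanly. */
+	memset((u8 *)site + used, 0x90, len - used);	/* 0x90 == nop */
+}
+#endif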
+
+static void lguest_safe_halt(void)
+{
+ hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+ return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+ hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static void lguest_syscall_init(void)
+{
+ /* FIXME: Will have to implement it later */
+}
+
+static __attribute_used__ __init void lguest_init(void)
+{
+ int i;
+
+ current_cr3 = __pa(&boot_level4_pgt);
+ paravirt_ops.name = "lguest";
+ paravirt_ops.mem_type = "LGUEST";
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.syscall_init = lguest_syscall_init;
+
+ paravirt_ops.save_fl = save_fl;
+ paravirt_ops.restore_fl = restore_fl;
+ paravirt_ops.irq_disable = irq_disable;
+ paravirt_ops.irq_enable = irq_enable;
+ paravirt_ops.load_gdt = lguest_load_gdt;
+ paravirt_ops.memory_setup = lguest_memory_setup;
+ paravirt_ops.cpuid = lguest_cpuid;
+ paravirt_ops.write_cr3 = lguest_write_cr3;
+ paravirt_ops.read_msr = lguest_read_msr;
+ paravirt_ops.write_msr = lguest_write_msr;
+ paravirt_ops.read_tsc = lguest_read_tsc;
+ paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+ paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+ paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+ paravirt_ops.set_pte = lguest_set_pte;
+ paravirt_ops.set_pte_at = lguest_set_pte_at;
+ paravirt_ops.set_pmd = lguest_set_pmd;
+ paravirt_ops.set_pud = lguest_set_pud;
+ paravirt_ops.set_pgd = lguest_set_pgd;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.apic_write = lguest_apic_write;
+ paravirt_ops.apic_read = lguest_apic_read;
+#endif
+ paravirt_ops.load_idt = lguest_load_idt;
+ paravirt_ops.iret = lguest_iret;
+ paravirt_ops.load_rsp0 = lguest_load_rsp0;
+ paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+ paravirt_ops.set_ldt = lguest_set_ldt;
+ paravirt_ops.load_tls = lguest_load_tls;
+ paravirt_ops.set_debugreg = lguest_set_debugreg;
+ paravirt_ops.clts = lguest_clts;
+ paravirt_ops.read_cr0 = lguest_read_cr0;
+ paravirt_ops.write_cr0 = lguest_write_cr0;
+ paravirt_ops.init_IRQ = lguest_init_IRQ;
+ paravirt_ops.read_cr2 = lguest_read_cr2;
+ paravirt_ops.read_cr3 = lguest_read_cr3;
+ paravirt_ops.read_cr4 = lguest_read_cr4;
+ paravirt_ops.write_cr4 = lguest_write_cr4;
+ paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+ paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+ paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+ paravirt_ops.patch = lguest_patch;
+ paravirt_ops.safe_halt = lguest_safe_halt;
+ paravirt_ops.get_wallclock = lguest_get_wallclock;
+ paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+ paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+ paravirt_ops.ebda_info = lguest_ebda_info;
+
+ memset(lguest_data.hcall_status, 0xFF, sizeof(lguest_data.hcall_status));
+#if 0
+ lguest_data.noirq_start = (u64)lguest_noirq_start;
+ lguest_data.noirq_end = (u64)lguest_noirq_end;
+#endif
+ lguest_data.start_kernel_map = __START_KERNEL_map; /* current page offset */
+ lguest_data.page_offset = PAGE_OFFSET;
+
+ code_stack[0].next = __pa(&code_stack[1]);
+ code_stack[0].start = (unsigned long)_stext;
+ code_stack[0].end = (unsigned long)_etext;
+ code_stack[1].next = 0;
+ code_stack[1].start = (unsigned long)_sinittext;
+ code_stack[1].end = (unsigned long)_einittext;
+
+ lguest_data.text = __pa(&code_stack[0]);
+
+ lguest_data.kallsyms_addresses = __pa(&kallsyms_addresses);
+ lguest_data.kallsyms_num_syms = kallsyms_num_syms;
+ lguest_data.kallsyms_names = __pa(&kallsyms_names);
+ lguest_data.kallsyms_token_table = __pa(&kallsyms_token_table);
+ lguest_data.kallsyms_token_index = __pa(&kallsyms_token_index);
+ lguest_data.kallsyms_markers = __pa(&kallsyms_markers);
+
+ hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+
+ lguest_pops = &local_pops;
+ lguest_paravirt = 1;
+
+ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+ lguest_write_cr3(__pa_symbol(&init_level4_pgt));
+
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_pda(i) = &boot_cpu_pda[i];
+
+ pda_init(0);
+// copy_bootdata(real_mode_data);
+#ifdef CONFIG_SMP
+ cpu_set(0, cpu_online_map);
+#endif
+
+// strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+ /* We use top of mem for initial pagetables. */
+// init_pg_tables_end = __pa(pg0);
+
+// reserve_top_address(lguest_data.reserve_mem);
+
+ /* FIXME: Better way? */
+ /* Suppress vgacon startup code */
+ SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+ add_preferred_console("hvc", 0, NULL);
+/*
+#ifdef CONFIG_X86_MCE
+ mcheck_disable(NULL);
+#endif
+*/
+#ifdef CONFIG_ACPI
+ acpi_disabled = 1;
+ acpi_ht = 0;
+#endif
+ if (boot->initrd_size) {
+ /* We stash this at top of memory. */
+ INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+ INITRD_SIZE = boot->initrd_size;
+ LOADER_TYPE = 0xFF;
+ }
+ pm_power_off = lguest_power_off;
+
+ start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R13)", %r13\n"
+ " jne 1f\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R14)", %r14\n"
+ " jne 1f\n"
+ " cmpq $"__stringify(LGUEST_MAGIC_R15)", %r15\n"
+ " je lguest_init\n"
+ "1: ret");
+
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
Index: work-pv/arch/x86_64/lguest/lguest.h
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest.h
@@ -0,0 +1,161 @@
+#ifndef _LGUEST_GUEST_H_
+#define _LGUEST_GUEST_H_
+
+#define GUEST_DPL 0x3
+
+#define gdt_index(x) ((x) >> 3)
+
+/*
+ * Must be less than fixmap!
+ *
+ * To keep the hypervisor from needing any data sections,
+ * we need to hard code the difference between what the hypervisor
+ * may put into the GS base, and what we let the guest put in.
+ * We allow the guest to put in "Kernel addresses" to simplify
+ * the guest PDA code.
+ */
+#define LGUEST_HV_OFFSET_HIGH 0xffffffff
+#define LGUEST_HV_OFFSET_LOW 0xff000000
+
+#define LGUEST_NMI_IST 7
+
+#define LGUEST_MAGIC 0x6c6775657374 /* "lguest" */
+
+#ifndef __ASSEMBLY__
+#include <asm/lguest.h>
+
+extern void switch_to_guest(struct lguest_vcpu *);
+extern unsigned long hcall_teste;
+extern unsigned long host_syscall;
+extern unsigned long _lguest_default_idt_entries[];
+extern unsigned long lguest_hv_addr;
+extern unsigned long lguest_hv_offset;
+extern int lguest_hv_pages;
+extern int lguest_vcpu_pages;
+extern int lguest_vcpu_order;
+extern struct mutex lguest_lock;
+
+/* FIXME: Those would live better in some main kernel header */
+/* Page fault error code bits */
+#define PF_PROT (1<<0) /* or no page found */
+#define PF_WRITE (1<<1)
+#define PF_USER (1<<2)
+#define PF_RSVD (1<<3)
+#define PF_INSTR (1<<4)
+
+#define kill_guest(guest, fmt...) \
+do { \
+ if (!(guest)->dead) { \
+ (guest)->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(guest)->dead) \
+ (guest)->dead = (void *)-1; \
+ } \
+} while (0)
+
+#define kill_guest_dump(vcpu, fmt...) \
+do { \
+ kill_guest((vcpu)->guest, fmt); \
+ lguest_dump_vcpu_regs(vcpu); \
+} while(0)
+
+static inline void _lguest_set_gate(struct gate_struct *s, unsigned type,
+ unsigned long func, unsigned dpl, unsigned ist)
+{
+ s->offset_low = PTR_LOW(func);
+ s->segment = __HV_CS;
+ s->ist = ist;
+ s->p = 1;
+ s->dpl = dpl;
+ s->zero0 = 0;
+ s->zero1 = 0;
+ s->type = type;
+ s->offset_middle = PTR_MIDDLE(func);
+ s->offset_high = PTR_HIGH(func);
+}
+
+static inline unsigned long guest_pa(struct lguest_guest_info *linfo, u64 addr)
+{
+ return (addr >= linfo->start_kernel_map) ?
+ (addr - linfo->start_kernel_map) :
+ (addr - linfo->page_offset);
+}
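+
+#if 0
+/*
+ * Illustrative only (made-up offset): guest_pa() handles the two ways
+ * a guest kernel address is formed.  A kernel-text style address is
+ * reduced by start_kernel_map, a direct-mapping address by
+ * page_offset; both examples below resolve to guest-physical 0x201000.
+ */
+static inline void guest_pa_example(struct lguest_guest_info *linfo)
+{
+	u64 text_pa = guest_pa(linfo, linfo->start_kernel_map + 0x201000);
+	u64 dmap_pa = guest_pa(linfo, linfo->page_offset + 0x201000);
+
+	(void)text_pa;	/* == 0x201000 */
+	(void)dmap_pa;	/* == 0x201000 */
+}
+#endif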
+
+int lguest_address_ok(const struct lguest_guest_info *, u64);
+
+int demand_page(struct lguest_vcpu *, u64, int);
+/* FIXME: put this in hv_vm.h */
+unsigned long hvvm_get_actual_phys(void *addr, pgprot_t *prot);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+
+/* page_tables.h */
+int lguest_map_hv_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages,
+ pgprot_t *prot);
+int lguest_map_guest_page(struct lguest_guest_info *lguest,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot);
+void lguest_unmap_guest_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages);
+void lguest_free_guest_pages(struct lguest_guest_info *lguest);
+
+void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 vaddr);
+
+void guest_set_pte(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx);
+void guest_set_pmd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_set_pud(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_set_pgd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long val);
+void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr);
+void guest_pagetable_clear_all(struct lguest_vcpu *vcpu);
+void guest_pagetable_flush_user(struct lguest_vcpu *vcpu);
+void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable);
+
+int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable);
+int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu);
+
+int hypercall(struct lguest_vcpu *vcpu);
+
+/* core.c */
+u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr);
+u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr);
+u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr);
+void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val);
+
+void lhread(struct lguest_guest_info *, void *, u64, unsigned);
+void lhwrite(struct lguest_guest_info *, u64, const void *, unsigned);
+
+/* io.c */
+u32 bind_dma(struct lguest_guest_info *, unsigned long, unsigned long,
+ u16, u8);
+int send_dma(struct lguest_guest_info *, unsigned long, unsigned long);
+
+/* interrupts_and_traps.c */
+
+void load_guest_idt_entry(struct lguest_vcpu *, unsigned int,
+ struct gate_struct *);
+void maybe_do_interrupt(struct lguest_vcpu *);
+void guest_iret(struct lguest_vcpu *vcpu);
+int reflect_trap(struct lguest_vcpu *, int, int);
+
+/* lguest_debug.c */
+extern int lguest_debug;
+void lgdebug_print(const char *fmt, ...);
+void lgdebug_vprint(const char *fmt, va_list ap);
+void lguest_dump_vcpu_regs(struct lguest_vcpu *vcpu);
+void lguest_dump_trace(struct lguest_vcpu *vcpu, struct lguest_regs *regs);
+void lguest_print_address(struct lguest_vcpu *vcpu, unsigned long address);
+void lguest_print_page_tables(u64 *cr3);
+void lguest_print_guest_page_tables(struct lguest_vcpu *vcpu, u64 cr3);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
Index: work-pv/arch/x86_64/lguest/lguest_user.c
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/lguest_user.c
@@ -0,0 +1,436 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <asm/lguest_user.h>
+#include <asm/hv_vm.h>
+#include "lguest.h"
+
+static int next_guest_id;
+
+#if 0
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+ unsigned long addr, udma, irq;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ udma = get_dma_buffer(lg, addr, &irq);
+ if (!udma)
+ return -ENOENT;
+
+ /* We put irq number in udma->used_len. */
+ lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+ return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+ u32 irq;
+
+ if (get_user(irq, input) != 0)
+ return -EFAULT;
+ if (irq >= LGUEST_IRQS)
+ return -EINVAL;
+ set_bit(irq, lg->irqs_pending);
+ return 0;
+}
+#endif
+
+static ssize_t read(struct file *file, char __user *user, size_t size, loff_t *o)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ struct lguest_guest_info *linfo;
+ int ret;
+
+ if (!vcpu)
+ return -EINVAL;
+ linfo = vcpu->guest;
+
+ if (linfo->dead) {
+ size_t len;
+
+ if (linfo->dead == (void *)-1)
+ return -ENOMEM;
+
+ len = min(size, strlen(linfo->dead)+1);
+ if (copy_to_user(user, linfo->dead, len) != 0)
+ return -EFAULT;
+ return len;
+ }
+
+#if 0
+ if (lg->dma_is_pending)
+ lg->dma_is_pending = 0;
+#endif
+
+ ret = run_guest(vcpu, user);
+ if (ret != -EINTR)
+ ret = -ENOENT;
+ return ret;
+}
+
+struct lguest_vcpu *allocate_vcpu(struct lguest_guest_info *linfo)
+{
+ struct lguest_vcpu *vcpu;
+ unsigned long hv_vcpu;
+ int ret;
+
+ vcpu = (void*)__get_free_pages(GFP_KERNEL, lguest_vcpu_order);
+ if (!vcpu)
+ return NULL;
+ memset(vcpu, 0, sizeof(*vcpu));
+
+ ret = hvvm_map_pages(vcpu, lguest_vcpu_pages, &hv_vcpu);
+ if (ret < 0)
+ goto out;
+
+ ret = lguest_map_hv_pages(linfo, hv_vcpu, lguest_vcpu_pages, NULL);
+ if (ret < 0)
+ goto out2;
+
+ vcpu->host_page = (unsigned long)vcpu;
+
+ return (struct lguest_vcpu*)hv_vcpu;
+
+out2:
+ hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages);
+out:
+ free_pages((unsigned long)vcpu, lguest_vcpu_order);
+
+ return NULL;
+}
+
+void free_vcpu(struct lguest_guest_info *linfo, struct lguest_vcpu *vcpu)
+{
+ unsigned long hv_vcpu = (unsigned long)vcpu;
+ free_pages(vcpu->host_page, lguest_vcpu_order);
+ lguest_unmap_guest_pages(linfo, hv_vcpu, lguest_vcpu_pages);
+ hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages);
+ lguest_free_guest_pages(linfo);
+}
+
+#if 0
+static void print_tss(struct ldttss_desc *tss)
+{
+ u64 base;
+ u64 limit;
+ int i;
+ u16 iobp = 0x64;
+
+ base = (tss->base0) + ((u64)tss->base1 << 16) +
+ ((u64)tss->base2 << 24) + ((u64)tss->base3 << 32);
+ limit = (tss->limit0) + ((u64)tss->limit1 << 16);
+ if (tss->g)
+ limit <<= 12;
+ printk(" base: %016llx\n", base);
+ printk(" limit: %llx\n", limit);
+ printk(" type: %x\n", tss->type);
+ printk(" dpl: %d\n", tss->dpl);
+ printk(" p: %d\n", tss->p);
+ printk(" g: %d\n", tss->g);
+
+ for (i=0; i < limit; i += 4) {
+ printk(" %8x: %08x\n", i, *(u32*)(base+i));
+ if (i == 0x64) {
+ iobp = (u16)((*(u32*)(base+i))>>16);
+ }
+ if (i >= iobp && *(s32*)(base+i) == -1L)
+ break;
+ }
+}
+#endif
+
+/* should be in some other file ? */
+int vcpu_start(int cpu, struct lguest_guest_info *linfo,
+ unsigned long entry_point,
+ void *pgd)
+{
+ struct lguest_vcpu *vcpu;
+ struct desc_struct *gdt_table;
+ struct lguest_regs *regs;
+ struct ldttss_desc *tss;
+ struct lguest_tss_struct *tss_ptr;
+ u64 target;
+ u64 limit;
+ u64 base;
+ int i;
+
+ if (cpu >= LGUEST_MAX_VCPUS)
+ return -EINVAL;
+
+ vcpu = allocate_vcpu(linfo);
+ if (!vcpu)
+ return -ENOMEM;
+
+ printk("vcpu: %p\n", vcpu);
+
+ /*
+ * Point back to itself to make it easier to read from gs:base in
+ * hypervisor.S
+ */
+ vcpu->vcpu = vcpu;
+ vcpu->magic = LGUEST_MAGIC;
+ gdt_table = cpu_gdt(get_cpu());
+ put_cpu();
+
+ /* Our gdt is basically host's, except for the privilege level */
+ for (i = 0; i < GDT_ENTRIES; i++) {
+ vcpu->gdt_table[i] = gdt_table[i];
+
+ if (!gdt_table[i].type)
+ continue;
+
+ switch (i) {
+ /* Keep TSS, and HV, and Host KERNEL segments the same */
+ case GDT_ENTRY_TSS:
+ /* The TSS will be modified below */
+ case GDT_ENTRY_HV_CS:
+ case GDT_ENTRY_HV_DS:
+ case __KERNEL_CS >> 3:
+ case __KERNEL_DS >> 3:
+ break;
+ default:
+ vcpu->gdt_table[i].dpl = GUEST_DPL;
+ }
+ }
+
+ for (i = 0; i < IDT_ENTRIES; i++) {
+ unsigned dpl = i == LGUEST_TRAP_ENTRY ? GUEST_DPL : 0;
+ /* NMI gets its own stack */
+ int ist = (i == 2) ? LGUEST_NMI_IST :
+ /* temp debug for now */
+ (i == 8) ? 6 : /* Double Fault */
+// (i == 13) ? 5 : /* GPF */
+ 0;
+
+ _lguest_set_gate(&vcpu->idt_table[i], 0xe,
+ _lguest_default_idt_entries[i] +
+ lguest_hv_offset, dpl, ist);
+ }
+
+ vcpu->gdt.size = 8 * GDT_ENTRIES - 1;
+ vcpu->gdt.address = (unsigned long)&vcpu->gdt_table;
+
+ vcpu->idt.size = 16 * IDT_ENTRIES -1;
+ vcpu->idt.address = (unsigned long)vcpu->idt_table;
+ rdmsrl(MSR_LSTAR, vcpu->host_syscall);
+
+ vcpu->id = cpu;
+ vcpu->guest = linfo;
+ linfo->vcpu[cpu] = vcpu;
+
+ lguest_init_vcpu_pagetable(vcpu);
+
+ /* setup the tss */
+ tss = (struct ldttss_desc*)&vcpu->gdt_table[GDT_ENTRY_TSS];
+ limit = sizeof(struct lguest_tss_struct);
+ base = (u64)&vcpu->tss;
+ tss->limit0 = (u16)limit;
+ tss->base0 = (u16)base;
+ tss->base1 = (u8)(base>>16);
+ tss->base2 = (u8)(base>>24);
+ tss->base3 = (u32)(base>>32);
+ tss->type = 0x9;
+ tss->g = 0; /* small tss */
+
+ vcpu->tss.rsp0 = (unsigned long)(&vcpu->regs.size);
+
+ /* NMI can happen at any time, so give it its own stack */
+ vcpu->tss.ist[LGUEST_NMI_IST-1] = (unsigned long)(&vcpu->nmi_stack_end);
+ printk("nmi stack at: %llx\n", vcpu->tss.ist[LGUEST_NMI_IST-1]);
+
+ /* temp debug stuff */
+ vcpu->tss.ist[5-1] = (unsigned long)(&vcpu->gpf_stack_end);
+ vcpu->tss.ist[6-1] = (unsigned long)(&vcpu->df_stack_end);
+ /*
+ * Load the host nmi stack into the guest tss. This prevents races
+ * in loading the TR and IDT.
+ */
+ tss = (struct ldttss_desc *)&gdt_table[GDT_ENTRY_TSS];
+ target = (u64)tss->base0 |
+ ((u64)tss->base1 << 16) |
+ ((u64)tss->base2 << 24) |
+ ((u64)tss->base3 << 32);
+
+ tss_ptr = (struct lguest_tss_struct*)target;
+
+ vcpu->tss.ist[NMI_STACK-1] = tss_ptr->ist[NMI_STACK-1];
+
+ /*
+ * The rsp0 had better be 16-byte aligned, or the interrupt
+ * will put the stack at an undesirable location.
+ */
+ /* Don't remove this test!!! */
+ if (unlikely(vcpu->tss.rsp0 & 0xf)) {
+ printk("HV ALIGNMENT BUG! don't put stack here!!\n");
+ printk(" tss.rsp0 stack was set to %llx\n",
+ vcpu->tss.rsp0);
+ goto out;
+ }
+
+ vcpu->tss.io_bitmap_base = 0x68;
+ vcpu->tss.io_bitmap[0] = -1UL;
+
+ regs = &vcpu->regs;
+ regs->cr3 = __pa(vcpu->pgdir->pgdir);
+ regs->rax = regs->rbx = regs->rcx = regs->rdx = regs->r8 =
+ regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->rdi =
+ regs->rsi = regs->rbp = 0;
+ regs->r13 = LGUEST_MAGIC_R13;
+ regs->r14 = LGUEST_MAGIC_R14;
+ regs->r15 = LGUEST_MAGIC_R15;
+ regs->fs = 0;
+ regs->trapnum = 0;
+ regs->errcode = 0;
+ regs->rip = entry_point;
+// regs->rip = 0x1000100;
+ regs->cs = __USER_CS;
+ regs->rflags = 0x202; /* Interrupts enabled. */
+ regs->rsp = 0;
+ regs->ss = __USER_DS;
+
+ return 0;
+out:
+ free_vcpu(linfo, vcpu);
+ return -EINVAL;
+}
+
+static int initialize_guest(struct file *file, const u64 __user *input)
+{
+ struct lguest_guest_info *linfo;
+ int err;
+ u64 args[4];
+ int i;
+
+ if (file->private_data)
+ return -EBUSY;
+
+ if (copy_from_user(args, input, sizeof(args)) != 0)
+ return -EFAULT;
+
+ linfo = kzalloc(sizeof(*linfo), GFP_KERNEL);
+ if (!linfo)
+ return -ENOMEM;
+
+ /* FIXME: protect the guest_id counter */
+ linfo->guest_id = ++next_guest_id;
+
+ linfo->pfn_limit = args[0];
+ linfo->page_offset = args[3];
+ linfo->start_kernel_map = args[3];
+
+ mutex_init(&linfo->page_lock);
+ INIT_LIST_HEAD(&linfo->pgd_list);
+
+ for (i=0; i < PUD_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pud_hash[i]);
+
+ for (i=0; i < PMD_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pmd_hash[i]);
+
+ for (i=0; i < PTE_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&linfo->pte_hash[i]);
+
+ err = init_guest_pagetable(linfo, args[1]);
+ if (err)
+ return -ENOMEM; /* what else to return ?? */
+#if 0
+
+ lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir, args[2]);
+ if (!lg->state) {
+ err = -ENOEXEC;
+ goto release_pgtable;
+ }
+#endif
+ err = vcpu_start(0, linfo, args[2], __va(read_cr3()));
+ if (err < 0)
+ return err;
+
+ file->private_data = linfo->vcpu[0];
+
+ return sizeof(args);
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+ size_t size, loff_t *off)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ u64 req;
+
+ if (get_user(req, input) != 0)
+ return -EFAULT;
+ input += sizeof(req);
+
+ if (req != LHREQ_INITIALIZE && !vcpu)
+ return -EINVAL;
+#if 0
+ if (lg && lg->dead)
+ return -ENOENT;
+#endif
+
+ switch (req) {
+ case LHREQ_INITIALIZE:
+ return initialize_guest(file, (const u64 __user *)input);
+#if 0
+ case LHREQ_GETDMA:
+ return user_get_dma(lg, (const u32 __user *)input);
+ case LHREQ_IRQ:
+ return user_send_irq(lg, (const u32 __user *)input);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
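+
+#if 0
+/*
+ * Illustrative sketch only (hypothetical launcher snippet, userspace):
+ * the buffer layout the write() handler above expects for
+ * LHREQ_INITIALIZE -- the request code followed by the four u64
+ * arguments initialize_guest() unpacks: pfn limit, guest top-level
+ * page table, entry point and page offset.
+ */
+	u64 req[5] = { LHREQ_INITIALIZE, pfn_limit, guest_pgtable,
+		       entry_point, page_offset };
+
+	if (write(lguest_fd, req, sizeof(req)) < 0)
+		/* guest could not be initialized */;
+#endif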
+
+static int close(struct inode *inode, struct file *file)
+{
+ struct lguest_vcpu *vcpu = file->private_data;
+ struct lguest_guest_info *linfo;
+
+ if (!vcpu)
+ return -EBADFD;
+
+ linfo = vcpu->guest;
+ /* FIXME: need to handle multiple vcpus */
+ free_vcpu(linfo, vcpu);
+ kfree(linfo);
+#if 0
+ mutex_lock(&lguest_lock);
+ release_all_dma(lg);
+ free_page((long)lg->trap_page);
+ free_guest_pagetable(lg);
+ mmput(lg->mm);
+ if (lg->dead != (void *)1)
+ kfree(lg->dead);
+ memset(lg->state, 0, sizeof(*lg->state));
+ memset(lg, 0, sizeof(*lg));
+ mutex_unlock(&lguest_lock);
+#endif
+ return 0;
+}
+
+static struct file_operations lguest_fops = {
+ .owner = THIS_MODULE,
+ .release = close,
+ .write = write,
+ .read = read,
+};
+static struct miscdevice lguest_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "lguest",
+ .fops = &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+ return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+ misc_deregister(&lguest_dev);
+}
Index: work-pv/arch/x86_64/lguest/page_tables.c
==================================================================--- /dev/null
+++ work-pv/arch/x86_64/lguest/page_tables.c
@@ -0,0 +1,1285 @@
+/* Shadow page table operations.
+ * Copyright (C) Steven Rostedt, Red Hat Inc, 2007
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include <asm/hv_vm.h>
+#include "lguest.h"
+
+/* move this to hv_vm.h */
+#define HVVM_END (HVVM_START + HV_VIRT_SIZE)
+
+#define HASH_PUD(x) (((u64)(x)>>PAGE_SHIFT) & (PUD_HASH_SIZE-1))
+#define HASH_PMD(x) (((u64)(x)>>PAGE_SHIFT) & (PMD_HASH_SIZE-1))
+#define HASH_PTE(x) (((u64)(x)>>PAGE_SHIFT) & (PTE_HASH_SIZE-1))
+
+/* guest and host share the same offset into the page tables */
+/* 9 bits at 8 byte increments */
+#define guest_host_idx(vaddr) ((vaddr) & (0x1ff<<3))
+
+
+/* These access the guest versions. */
+static u64 gtoplev(struct lguest_vcpu *vcpu, unsigned long vaddr)
+{
+ unsigned index = pgd_index(vaddr);
+
+ return vcpu->pgdir->cr3 + index * sizeof(u64);
+}
+
+
+#if 0
+
+/* FIXME: we need to put these in and make it more secure! */
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+ if ((entry & (_PAGE_PWT|_PAGE_PSE))
+ || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+ kill_guest(lg, "bad page table entry");
+ return entry & ~_PAGE_GLOBAL;
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+ unsigned int i;
+ u32 stack = lg->state->tss.esp1;
+
+ for (i = 0; i < lg->stack_pages; i++)
+ if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+ kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+ unsigned int i;
+
+ release_all_pagetables(lg);
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+void map_trap_page(struct lguest *lg)
+{
+ int cpu = smp_processor_id();
+
+ hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+ /* Since the hypervisor is less than 4MB, we simply mug the top pte page. */
+ lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+ (__pa(hypervisor_pte_page(cpu)) | __PAGE_KERNEL);
+}
+
+#endif
+
+static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t pprot);
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned long ret = -1UL;
+
+ down_read(&current->mm->mmap_sem);
+ if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+ 1, write, 1, &page, &vma) == 1)
+ ret = page_to_pfn(page);
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+static int is_hv_page(int pgd_idx, int pud_idx, int pmd_idx, int pte_idx)
+{
+ /* Never release the hv pages */
+ u64 addr = (u64)pgd_idx << PGDIR_SHIFT |
+ (u64)pud_idx << PUD_SHIFT |
+ (u64)pmd_idx << PMD_SHIFT |
+ (u64)pte_idx << PAGE_SHIFT;
+ /* sign extend */
+ if (pgd_idx & (1<<8))
+ addr |= 0xffffULL << 48;
+ return (addr >= HVVM_START) &&
+ (addr < (HVVM_START + HV_VIRT_SIZE));
+}
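+
+/*
+ * Worked example (illustrative values): pgd index 0x1ff with pud, pmd
+ * and pte indices of 0 reconstructs to 0x0000ff8000000000; since bit 8
+ * of the pgd index (bit 47 of the address) is set, the sign extension
+ * above yields the canonical 0xffffff8000000000, which is then tested
+ * against the HVVM_START..HVVM_START+HV_VIRT_SIZE window.
+ */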
+
+static void release_pte(u64 pte)
+{
+ if (pte & _PAGE_PRESENT)
+ put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+static int release_pmd(int pgd_idx, int pud_idx, u64 *pmd, int idx)
+{
+ int save = 0;
+ if (pmd[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *ptepage = __va(pmd[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PMD; i++)
+ if (is_hv_page(pgd_idx, pud_idx, idx, i))
+ save = 1;
+ else
+ release_pte(ptepage[i]);
+ /* never free the HV pmds */
+ if (!save) {
+ free_page((unsigned long)ptepage);
+ pmd[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static int release_pud(int pgd_idx, u64 *pud, int idx)
+{
+ int save = 0;
+ if (pud[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *pmdpage = __va(pud[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PUD; i++)
+ if (release_pmd(pgd_idx, idx, pmdpage, i))
+ save = 1;
+ /* never free the HV puds */
+ if (!save) {
+ free_page((unsigned long)pmdpage);
+ pud[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static int release_pgd(u64 *pgd, int idx)
+{
+ int save = 0;
+
+ if (pgd[idx] & _PAGE_PRESENT) {
+ int i;
+ u64 *pudpage = __va(pgd[idx] & PTE_MASK);
+ for (i=0; i < PTRS_PER_PGD; i++) {
+ if (release_pud(idx, pudpage, i))
+ save = 1;
+ }
+ /* never free the HV pgd */
+ if (!save) {
+ free_page((unsigned long)pudpage);
+ pgd[idx] = 0;
+ }
+ }
+ return save;
+}
+
+static struct lguest_pgd *find_pgd(struct lguest_guest_info *linfo, u64 cr3)
+{
+ struct lguest_pgd *pgdir;
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list)
+ if (!(pgdir->flags & LGUEST_PGD_MASTER_FL) && pgdir->cr3 == cr3)
+ break;
+
+ if (pgdir == list_entry(&linfo->pgd_list, struct lguest_pgd, list))
+ return NULL;
+
+ return pgdir;
+}
+
+static struct lguest_pud *find_pud(struct lguest_guest_info *linfo, u64 gpud)
+{
+ unsigned idx = HASH_PUD(gpud);
+ struct lguest_pud *pudir;
+
+ list_for_each_entry(pudir, &linfo->pud_hash[idx], list)
+ if (pudir->gpud == gpud)
+ break;
+
+ if (pudir == list_entry(&linfo->pud_hash[idx], struct lguest_pud, list))
+ return NULL;
+
+ return pudir;
+}
+
+static struct lguest_pmd *find_pmd(struct lguest_guest_info *linfo, u64 gpmd)
+{
+ unsigned idx = HASH_PMD(gpmd);
+ struct lguest_pmd *pmdir;
+
+ list_for_each_entry(pmdir, &linfo->pmd_hash[idx], list)
+ if (pmdir->gpmd == gpmd)
+ break;
+
+ if (pmdir == list_entry(&linfo->pmd_hash[idx], struct lguest_pmd, list))
+ return NULL;
+
+ return pmdir;
+}
+
+static struct lguest_pte *find_pte(struct lguest_guest_info *linfo, u64 gpte)
+{
+ unsigned idx = HASH_PTE(gpte);
+ struct lguest_pte *pte;
+
+ list_for_each_entry(pte, &linfo->pte_hash[idx], list)
+ if (pte->gpte == gpte)
+ break;
+
+ if (pte == list_entry(&linfo->pte_hash[idx], struct lguest_pte, list))
+ return NULL;
+
+ return pte;
+}
+
+static void __release_pte_hash(struct lguest_vcpu *vcpu, struct lguest_pte *pte)
+{
+ list_del(&pte->list);
+ kfree(pte);
+}
+
+static void __release_pmd_hash(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pte *pte;
+ int i;
+
+ list_del(&pmdir->list);
+
+ for (i=0; i < PTRS_PER_PMD; i++) {
+ u64 gpte;
+
+ gpte = lhread_u64(vcpu, pmdir->gpmd+i*sizeof(u64));
+ if (!gpte)
+ continue;
+ pte = find_pte(linfo, gpte & PTE_MASK);
+ if (!pte)
+ continue;
+ __release_pte_hash(vcpu, pte);
+ }
+
+ kfree(pmdir);
+}
+
+static void __release_pud_hash(struct lguest_vcpu *vcpu, struct lguest_pud *pudir)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pmd *pmdir;
+ int i;
+
+ list_del(&pudir->list);
+
+ for (i=0; i < PTRS_PER_PUD; i++) {
+ u64 gpmd;
+
+ gpmd = lhread_u64(vcpu, pudir->gpud+i*sizeof(u64));
+ if (!gpmd)
+ continue;
+ pmdir = find_pmd(linfo, gpmd & PTE_MASK);
+ if (!pmdir)
+ continue;
+ __release_pmd_hash(vcpu, pmdir);
+ }
+
+ kfree(pudir);
+}
+
+static struct lguest_pud *hash_pud(struct lguest_vcpu *vcpu, u64 gpud, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pudir = find_pud(linfo, gpud);
+ if (!pudir) {
+ /* FIXME: make this a slab? */
+ pudir = kzalloc(sizeof(*pudir), GFP_KERNEL);
+ if (!pudir)
+ goto out;
+ h = HASH_PUD(gpud);
+ list_add(&pudir->list, &linfo->pud_hash[h]);
+ pudir->pgdir = vcpu->pgdir;
+ pudir->gpud = gpud;
+ pudir->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pudir;
+}
+
+static struct lguest_pmd *hash_pmd(struct lguest_vcpu *vcpu, struct lguest_pud *pudir,
+ u64 gpmd, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pmd *pmdir;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pmdir = find_pmd(linfo, gpmd);
+ if (!pmdir) {
+ /* FIXME: make this a slab? */
+ pmdir = kzalloc(sizeof(*pmdir), GFP_KERNEL);
+ if (!pmdir)
+ goto out;
+ h = HASH_PMD(gpmd);
+ list_add(&pmdir->list, &linfo->pmd_hash[h]);
+ pmdir->pudir = pudir;
+ pmdir->gpmd = gpmd;
+ pmdir->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pmdir;
+}
+
+static struct lguest_pte *hash_pte(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir,
+ u64 gpte, unsigned idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pte *pte;
+ unsigned h;
+
+ mutex_lock(&linfo->page_lock);
+ pte = find_pte(linfo, gpte);
+ if (!pte) {
+ /* FIXME: make this a slab? */
+ pte = kzalloc(sizeof(*pte), GFP_KERNEL);
+ if (!pte)
+ goto out;
+ h = HASH_PTE(gpte);
+ list_add(&pte->list, &linfo->pte_hash[h]);
+ pte->pmdir = pmdir;
+ pte->gpte = gpte;
+ pte->idx = idx;
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+
+ return pte;
+}
+
+void guest_set_pte(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long vaddr,
+ unsigned long value)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ struct lguest_pte *ptedir;
+ unsigned long idx = (vaddr & (PAGE_SIZE-1)) / 8;
+ u64 base = vaddr & PTE_MASK;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 pte;
+ u64 *pudpage;
+ u64 *pmdpage;
+ u64 *ptepage;
+
+ mutex_lock(&linfo->page_lock);
+
+ ptedir = find_pte(linfo, base);
+ if (!ptedir)
+ goto out;
+
+ pmdir = ptedir->pmdir;
+ pudir = pmdir->pudir;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pmdir->idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[ptedir->idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ ptepage = __va(pmd & PTE_MASK);
+ pte = ptepage[idx];
+
+ if (!(pte & _PAGE_PRESENT))
+ goto out;
+
+ /* If the guest is trying to touch HV area, kill it! */
+ if (is_hv_page(pudir->idx, pmdir->idx, ptedir->idx, idx)) {
+ kill_guest_dump(vcpu, "guest trying to write to HV area\n");
+ goto out;
+ }
+
+ /* FIXME: perhaps we could set the pte now ? */
+
+ release_pte(ptepage[idx]);
+ __release_pte_hash(vcpu, ptedir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pmd(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 *pudpage;
+ u64 *pmdpage;
+ int save;
+
+ if (idx >= PTRS_PER_PMD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pmdir = find_pmd(linfo, base);
+ if (!pmdir)
+ goto out;
+
+ pudir = pmdir->pudir;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pmdir->idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ save = release_pmd(pudir->idx, pmdir->idx, pmdpage, idx);
+ if (!save)
+ __release_pmd_hash(vcpu, pmdir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pud(struct lguest_vcpu *vcpu,
+ unsigned long cr3, unsigned long base,
+ unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ u64 pgd;
+ u64 pud;
+ u64 *pudpage;
+ int save;
+
+ if (idx >= PTRS_PER_PUD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pudir = find_pud(linfo, base);
+ if (!pudir)
+ goto out;
+
+ pgd = vcpu->pgdir->pgdir[pudir->idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ save = release_pud(pudir->idx, pudpage, idx);
+ if (!save)
+ __release_pud_hash(vcpu, pudir);
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_set_pgd(struct lguest_vcpu *vcpu, unsigned long cr3,
+ unsigned long base, unsigned long idx)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *pgdir;
+ struct lguest_pud *pudir;
+ u64 gpud;
+ u64 pgd;
+ u64 pud;
+ int save;
+
+ pgdir = vcpu->pgdir;
+
+ if (idx >= PTRS_PER_PGD) {
+ kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx);
+ return;
+ }
+
+ mutex_lock(&linfo->page_lock);
+
+ pgd = pgdir->pgdir[idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pud = pgd & PTE_MASK;
+
+ gpud = lhread_u64(vcpu, base + idx * sizeof(u64));
+ pudir = find_pud(linfo, gpud & PTE_MASK);
+ if (pudir)
+ __release_pud_hash(vcpu, pudir);
+ save = release_pgd(pgdir->pgdir, idx);
+
+ if (!save && idx >= guest_host_idx(linfo->page_offset >> (PGDIR_SHIFT-3))) {
+ /* All guest processes share the same kernel PML4Es */
+ /*
+ * So we only free the tree once, but then reset
+ * all the others.
+ */
+ list_for_each_entry(pgdir, &linfo->pgd_list, list) {
+ pgd = pgdir->pgdir[idx];
+ if (!(pgd & _PAGE_PRESENT))
+ continue;
+ BUG_ON((pgd & PTE_MASK) != pud);
+ pgdir->pgdir[idx] = 0;
+ }
+ }
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *pgdir;
+ unsigned long pgd_idx;
+ unsigned long pud_idx;
+ unsigned long pmd_idx;
+ unsigned long idx;
+ u64 pgd;
+ u64 pud;
+ u64 pmd;
+ u64 pte;
+ u64 *pudpage;
+ u64 *pmdpage;
+ u64 *ptepage;
+
+ mutex_lock(&linfo->page_lock);
+
+ if (vaddr > linfo->page_offset)
+ pgdir = &linfo->kpgdir;
+ else
+ pgdir = find_pgd(linfo, cr3);
+
+ pgd_idx = pgd_index(vaddr);
+ pgd = pgdir->pgdir[pgd_idx];
+ if (!(pgd & _PAGE_PRESENT))
+ goto out;
+
+ pud_idx = pud_index(vaddr);
+ pudpage = __va(pgd & PTE_MASK);
+ pud = pudpage[pud_idx];
+
+ if (!(pud & _PAGE_PRESENT))
+ goto out;
+
+ pmd_idx = pmd_index(vaddr);
+ pmdpage = __va(pud & PTE_MASK);
+ pmd = pmdpage[pmd_idx];
+
+ if (!(pmd & _PAGE_PRESENT))
+ goto out;
+
+ idx = pte_index(vaddr);
+ ptepage = __va(pmd & PTE_MASK);
+ pte = ptepage[idx];
+
+ if (!(pte & _PAGE_PRESENT))
+ goto out;
+
+ /* If the guest is trying to touch HV area, kill it! */
+ if (is_hv_page(pgd_idx, pud_idx, pmd_idx, idx)) {
+ kill_guest_dump(vcpu, "guest trying to write to HV area\n");
+ goto out;
+ }
+
+ release_pte(ptepage[idx]);
+ /* FIXME: what about the hash?? */
+
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+static void flush_user_mappings(struct lguest_guest_info *linfo, struct lguest_pgd *pgdir)
+{
+ unsigned int i;
+ for (i = 0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(pgdir->pgdir, i);
+}
+
+static struct lguest_pgd *new_pgdir(struct lguest_guest_info *linfo, u64 cr3)
+{
+ unsigned int next;
+ unsigned int i;
+
+ next = random32() % LGUEST_PGDIRS;
+ for (i=(next+1) % LGUEST_PGDIRS; i != next; i = (i+1) % LGUEST_PGDIRS) {
+ if (linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL)
+ continue;
+ break;
+ }
+ BUG_ON(linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL);
+
+ next = i;
+
+ linfo->pgdirs[next].cr3 = cr3;
+ if (!linfo->pgdirs[next].pgdir) {
+ linfo->pgdirs[next].pgdir = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!linfo->pgdirs[next].pgdir)
+ return NULL;
+ /* all kernel pages are the same */
+ for (i=pgd_index(linfo->page_offset); i < PTRS_PER_PGD; i++)
+ linfo->pgdirs[next].pgdir[i] = linfo->kpgdir.pgdir[i];
+ } else {
+ BUG_ON(!(linfo->pgdirs[next].flags & LGUEST_PGD_LINK_FL));
+ /* Release all the non-kernel mappings. */
+ flush_user_mappings(linfo, &linfo->pgdirs[next]);
+ }
+
+ return &linfo->pgdirs[next];
+}
+
+void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pgd *newpgdir;
+
+ mutex_lock(&linfo->page_lock);
+ newpgdir = find_pgd(linfo, pgtable);
+ if (vcpu->pgdir) {
+ if (!(--vcpu->pgdir->count))
+ vcpu->pgdir->flags &= ~(LGUEST_PGD_BUSY_FL);
+ }
+ if (!newpgdir)
+ newpgdir = new_pgdir(linfo, pgtable);
+ if (!newpgdir) {
+ kill_guest_dump(vcpu, "no more pgd's available!\n");
+ goto out;
+ }
+ vcpu->pgdir = newpgdir;
+ if (!vcpu->pgdir->count++)
+ vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL;
+ vcpu->regs.cr3 = __pa(vcpu->pgdir->pgdir);
+ if (!(vcpu->pgdir->flags & LGUEST_PGD_LINK_FL)) {
+ list_add(&vcpu->pgdir->list, &linfo->pgd_list);
+ vcpu->pgdir->flags |= LGUEST_PGD_LINK_FL;
+ }
+// pin_stack_pages(lg);
+out:
+ mutex_unlock(&linfo->page_lock);
+}
+
+static void release_all_pagetables(struct lguest_guest_info *linfo)
+{
+ struct lguest_pgd *pgdir, *next;
+ int i;
+
+ /* We share the kernel pages, so do them once */
+ for (i=0; i < PTRS_PER_PGD; i++)
+ release_pgd(linfo->kpgdir.pgdir, i);
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list) {
+ if (pgdir->pgdir)
+ for (i=0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(pgdir->pgdir, i);
+ }
+ /* now release any pgdirs that are not busy */
+ list_for_each_entry_safe(pgdir, next, &linfo->pgd_list, list) {
+ if (!(pgdir->flags & LGUEST_PGD_BUSY_FL)) {
+ BUG_ON(pgdir->count);
+ pgdir->flags &= ~LGUEST_PGD_LINK_FL;
+ list_del(&pgdir->list);
+ free_page((u64)pgdir->pgdir);
+ pgdir->cr3 = 0;
+ pgdir->pgdir = NULL;
+ }
+ }
+}
+
+void guest_pagetable_clear_all(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+
+ mutex_lock(&linfo->page_lock);
+ release_all_pagetables(linfo);
+// pin_stack_pages(lg);
+ mutex_unlock(&linfo->page_lock);
+}
+
+void guest_pagetable_flush_user(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ unsigned int i;
+
+ for (i = 0; i < pgd_index(linfo->page_offset); i++)
+ release_pgd(vcpu->pgdir->pgdir, i);
+}
+
+/* FIXME: We hold references to pages, which prevents them from being
+ swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return 0 if failed, 1 if good */
+static int page_in(struct lguest_vcpu *vcpu, u64 vaddr, pgprot_t prot)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ struct lguest_pud *pudir;
+ struct lguest_pmd *pmdir;
+ struct lguest_pte *ptedir;
+ u64 val;
+ u64 paddr;
+ u64 gpgd, gpud, gpmd, gpte;
+ u64 flags = pgprot_val(prot);
+ int write;
+ int ret;
+
+ gpgd = gtoplev(vcpu, vaddr);
+ val = lhread_u64(vcpu, gpgd);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pgd not present pgd:%llx vaddr:%llx val:%llx\n", gpgd,
vaddr, val);
+ return 0;
+ }
+
+ gpud = val & PTE_MASK;
+
+ pudir = hash_pud(vcpu, gpud, pgd_index(vaddr));
+ if (!pudir)
+ return 0; /* -ENOMEM */
+
+ if (vaddr >= linfo->page_offset)
+ pudir->flags |= LGUEST_PUD_KERNEL_FL;
+
+ gpud += pud_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpud);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pud not present?\n");
+ return 0;
+ }
+
+ gpmd = val & PTE_MASK;
+
+ pmdir = hash_pmd(vcpu, pudir, gpmd, pud_index(vaddr));
+ if (!pmdir)
+ return 0; /* -ENOMEM */
+
+ if (vaddr >= linfo->page_offset)
+ pmdir->flags |= LGUEST_PMD_KERNEL_FL;
+
+ gpmd += pmd_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpmd);
+ if (!(val & _PAGE_PRESENT)) {
+ printk("pmd not present?\n");
+ return 0;
+ }
+
+ /* The guest might have set up a 2M page */
+ if (val & (1<<7)) {
+ /* 2M pages */
+ /*
+ * Although the guest may have mapped this into 2M pages,
+ * we haven't and won't.  So we still need to find the 4K
+ * page position.
+ */
+ paddr = val & ~((1<<20)-1);
+ paddr += pte_index(vaddr) << PAGE_SHIFT;
+ paddr &= PTE_MASK; /* can still have the NX bit set */
+ } else {
+ /* 4K pages */
+ gpte = val & PTE_MASK;
+
+ ptedir = hash_pte(vcpu, pmdir, gpte, pmd_index(vaddr));
+ if (!ptedir)
+ return 0; /* -ENOMEM */
+
+ gpte += pte_index(vaddr) * sizeof(u64);
+ val = lhread_u64(vcpu, gpte);
+ if (!(val & _PAGE_PRESENT) || ((flags & _PAGE_DIRTY) && !(val & _PAGE_RW))) {
+ printk("pte not present or dirty?\n");
+ return 0;
+ }
+ /* this is the guest's paddr */
+ paddr = val & PTE_MASK;
+
+ }
+
+ /* FIXME: check these values */
+
+ /*
+ * FIXME: if this isn't write, we lose the lguest_data when we do
+ * a put_user in the hypercall init.
+ */
+ write = 1; // val & _PAGE_DIRTY ? 1 : 0;
+
+ val = get_pfn(paddr >> PAGE_SHIFT, write);
+ if (val == (unsigned long)-1UL) {
+ printk("bad 1\n");
+ kill_guest_dump(vcpu, "page %llx not mapped", paddr);
+ return 0;
+ }
+
+ /* now we have the actual paddr */
+ val <<= PAGE_SHIFT;
+
+ ret = __lguest_map_guest_page(vcpu->guest, vcpu->pgdir->pgdir,
+ vaddr, val, __pgprot(flags));
+ if (ret < 0) {
+ printk("bad 2\n");
+ kill_guest_dump(vcpu, "can't map page");
+ return 0;
+ }
+ return 1;
+}
+
+int demand_page(struct lguest_vcpu *vcpu, u64 vaddr, int write)
+{
+ return page_in(vcpu, vaddr, (write ? PAGE_SHARED_EXEC : PAGE_COPY_EXEC));
+}
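+
+#if 0
+/*
+ * Illustrative sketch only (hypothetical caller): the host-side fault
+ * path is expected to use demand_page() roughly like this -- try to
+ * map the faulting guest address into the shadow page tables, and if
+ * that fails reflect the fault back into the guest.  The real caller
+ * lives in the trap handling code; the reflect_trap() arguments
+ * (trap number, has error code) are assumed here.
+ */
+static void example_guest_page_fault(struct lguest_vcpu *vcpu,
+				     u64 cr2, int errcode)
+{
+	if (!demand_page(vcpu, cr2, errcode & PF_WRITE))
+		reflect_trap(vcpu, 14, 1);	/* deliver #PF to the guest */
+}
+#endif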
+
+
+static pud_t *pud_from_index(unsigned long addr, unsigned index)
+{
+ pud_t *pud = (pud_t*)addr;
+
+ return &pud[index];
+}
+
+static pmd_t *pmd_from_index(unsigned long addr, unsigned index)
+{
+ pmd_t *pmd = (pmd_t*)addr;
+
+ return &pmd[index];
+}
+
+static pte_t *pte_from_index(unsigned long addr, unsigned index)
+{
+ pte_t *pte = (pte_t*)addr;
+
+ return &pte[index];
+}
+
+static int __lguest_map_guest_pte(pmd_t *pmd, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pte_t *pte;
+ unsigned index;
+
+ page = pmd_page_vaddr(*pmd);
+
+ index = pte_index(vaddr);
+ pte = pte_from_index(page, index);
+ if (pte_val(*pte) & _PAGE_PRESENT &&
+ pte_val(*pte) == pte_val(pfn_pte(paddr>>PAGE_SHIFT, prot)) ) {
+ printk("stange page faulting!\n");
+ printk("paddr=%lx (paddr)=%lx\n", paddr, *(unsigned long
*)__va(paddr));
+ printk("vaddr: %lx pte %x val: %lx\n", vaddr, index,
pte_val(*pte));
+ }
+
+ set_pte(pte, mk_pte(pfn_to_page(paddr >> PAGE_SHIFT), prot));
+
+ return 0;
+}
+
+static int __lguest_map_guest_pmd(pud_t *pud, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pmd_t *pmd;
+ unsigned index;
+
+ page = pud_page_vaddr(*pud);
+
+ index = pmd_index(vaddr);
+ pmd = pmd_from_index(page, index);
+ if (!pmd_val(*pmd)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pte(pmd, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_pud(pgd_t *pgd, unsigned long vaddr,
+ unsigned long paddr, pgprot_t prot)
+{
+ unsigned long page;
+ pud_t *pud;
+ unsigned index;
+
+ page = pgd_page_vaddr(*pgd);
+
+ index = pud_index(vaddr);
+ pud = pud_from_index(page, index);
+ if (!pud_val(*pud)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pmd(pud, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_pgd(u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot)
+{
+ unsigned long page;
+ unsigned index;
+ pgd_t *pgd;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ if (!pgd_val(*pgd)) {
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page)));
+ }
+
+ return __lguest_map_guest_pud(pgd, vaddr, paddr, prot);
+}
+
+static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3,
+ unsigned long vaddr, unsigned long paddr,
+ pgprot_t prot)
+{
+ int ret;
+
+ ret = __lguest_map_guest_pgd(cr3, vaddr, paddr, prot);
+ if (ret < 0)
+ return ret;
+
+ /* All guest kernel pages are the same */
+ if (vaddr >= linfo->page_offset) {
+ struct lguest_pgd *pgdir;
+ unsigned index;
+ pgd_t *pgd;
+ u64 val;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ val = pgd_val(*pgd);
+
+ list_for_each_entry(pgdir, &linfo->pgd_list, list)
+ pgdir->pgdir[index] = val;
+ }
+ return ret;
+}
+
+static void __lguest_unmap_page_pmd(pmd_t *pmd, unsigned long vaddr)
+{
+ pte_t *pte;
+ unsigned index;
+ unsigned long page;
+
+ page = pmd_page_vaddr(*pmd);
+
+ index = pte_index(vaddr);
+ pte = pte_from_index(page, index);
+ if (pte_val(*pte) & 1)
+ set_pte(pte, __pte(0));
+}
+
+static void __lguest_unmap_page_pud(pud_t *pud, unsigned long vaddr)
+{
+ pmd_t *pmd;
+ unsigned index;
+ unsigned long page;
+
+ page = pud_page_vaddr(*pud);
+
+ index = pmd_index(vaddr);
+ pmd = pmd_from_index(page, index);
+ if (pmd_val(*pmd) & 1)
+ __lguest_unmap_page_pmd(pmd, vaddr);
+}
+
+static void __lguest_unmap_page_pgd(pgd_t *pgd, unsigned long vaddr)
+{
+ pud_t *pud;
+ unsigned index;
+ unsigned long page;
+
+ page = pgd_page_vaddr(*pgd);
+
+ index = pud_index(vaddr);
+ pud = pud_from_index(page, index);
+ if (pud_val(*pud) & 1)
+ __lguest_unmap_page_pud(pud, vaddr);
+}
+
+static void __lguest_unmap_guest_page(struct lguest_guest_info *linfo,
+ unsigned long vaddr)
+{
+ pgd_t *pgd;
+ unsigned index;
+ u64 *cr3 = linfo->kpgdir.pgdir;
+
+ if (!cr3)
+ return;
+
+ index = pgd_index(vaddr);
+ pgd = (pgd_t*)&cr3[index];
+ if (!(pgd_val(*pgd)&1))
+ return;
+
+ __lguest_unmap_page_pgd(pgd, vaddr);
+}
+
+int lguest_map_hv_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages,
+ pgprot_t *pprot)
+{
+ unsigned long page;
+ int i;
+ int ret;
+ pgprot_t prot;
+
+ ret = -ENOMEM;
+ for (i=0; i < pages; i++) {
+ /* now add the page we want */
+ page = hvvm_get_actual_phys((void*)vaddr+PAGE_SIZE*i, &prot);
+ if (!page)
+ goto failed;
+
+ if (pprot)
+ prot = *pprot;
+ ret = __lguest_map_guest_page(lguest, lguest->kpgdir.pgdir,
+ vaddr+PAGE_SIZE*i, page, prot);
+ if (ret < 0)
+ goto failed;
+ }
+ return 0;
+failed:
+ for (--i; i >= 0; i--)
+ __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i);
+ return ret;
+}
+
+/**
+ * lguest_mem_addr - retrieve page that's mapped from guest.
+ * @vcpu: lguest vcpu descriptor.
+ * @addr: address to get from the guest's address space.
+ *
+ * ONLY USE WHEN ALL ELSE FAILS!
+ */
+void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 addr)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+ u64 *cr3 = linfo->kpgdir.pgdir;
+ unsigned long page;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned index = pgd_index(addr);
+
+ pgd = (pgd_t*)&cr3[index];
+ if (!(pgd_val(*pgd) & 1))
+ return NULL;
+
+ page = pgd_page_vaddr(*pgd);
+ index = pud_index(addr);
+ pud = pud_from_index(page, index);
+ if (!(pud_val(*pud) & 1))
+ return NULL;
+
+ page = pud_page_vaddr(*pud);
+ index = pmd_index(addr);
+ pmd = pmd_from_index(page, index);
+ if (!(pmd_val(*pmd) & 1))
+ return NULL;
+
+ page = pmd_page_vaddr(*pmd);
+ index = pte_index(addr);
+ pte = pte_from_index(page, index);
+ if (!(pte_val(*pte) & 1))
+ return NULL;
+
+ page = ((pte_val(*pte) & PAGE_MASK) + (addr & (PAGE_SIZE-1)));
+
+ return (void *)(page + PAGE_OFFSET);
+}
+
+void __lguest_free_guest_pmd(pmd_t *pmd)
+{
+ pte_t *pte;
+ unsigned long page;
+ int i;
+
+ page = pmd_page_vaddr(*pmd);
+
+ for (i=0; i < PTRS_PER_PTE; i++) {
+ pte = pte_from_index(page, i);
+ if (!(pte_val(*pte) & 1))
+ continue;
+ /* FIXME: do some checks here??? */
+ }
+ set_pmd(pmd, __pmd(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pud(pud_t *pud)
+{
+ pmd_t *pmd;
+ unsigned long page;
+ int i;
+
+ page = pud_page_vaddr(*pud);
+
+ for (i=0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_from_index(page, i);
+ if (!(pmd_val(*pmd) & 1))
+ continue;
+ __lguest_free_guest_pmd(pmd);
+ }
+ set_pud(pud, __pud(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pgd(pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long page;
+ int i;
+
+ page = pgd_page_vaddr(*pgd);
+
+ for (i=0; i < PTRS_PER_PUD; i++) {
+ pud = pud_from_index(page, i);
+ if (!(pud_val(*pud) & 1))
+ continue;
+ __lguest_free_guest_pud(pud);
+ }
+ set_pgd(pgd, __pgd(0));
+ free_page(page);
+}
+
+void __lguest_free_guest_pages(u64 *cr3)
+{
+ pgd_t *pgd;
+ int i;
+
+ if (!cr3)
+ return;
+
+ for (i=0; i < PTRS_PER_PGD; i++) {
+ pgd = (pgd_t*)&cr3[i];
+ if (!(pgd_val(*pgd) & 1))
+ continue;
+ __lguest_free_guest_pgd(pgd);
+ }
+ free_page((u64)cr3);
+}
+
+void __lguest_free_guest_upages(struct lguest_guest_info *linfo, u64 *cr3)
+{
+ pgd_t *pgd;
+ int i;
+
+ if (!cr3)
+ return;
+
+ for (i=0; i < pgd_index(linfo->page_offset); i++) {
+ pgd = (pgd_t*)&cr3[i];
+ if (!(pgd_val(*pgd) & 1))
+ continue;
+ __lguest_free_guest_pgd(pgd);
+ }
+ free_page((u64)cr3);
+}
+
+void lguest_free_guest_pages(struct lguest_guest_info *linfo)
+{
+ int i;
+
+ /* This frees all the guest kernel pages */
+ __lguest_free_guest_pages(linfo->kpgdir.pgdir);
+
+ for (i=0; i < LGUEST_PGDIRS; i++)
+ __lguest_free_guest_upages(linfo, linfo->pgdirs[i].pgdir);
+}
+
+void lguest_unmap_guest_pages(struct lguest_guest_info *lguest,
+ unsigned long vaddr, int pages)
+{
+ int i;
+
+ for (i=0; i < pages; i++)
+ __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i);
+}
+
+int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu)
+{
+ struct lguest_guest_info *linfo = vcpu->guest;
+
+ mutex_lock(&linfo->page_lock);
+ vcpu->pgdir = new_pgdir(linfo, linfo->kpgdir.cr3);
+ BUG_ON(!vcpu->pgdir);
+ if (!vcpu->pgdir->count++)
+ vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL;
+ list_add(&vcpu->pgdir->list, &linfo->pgd_list);
+ mutex_unlock(&linfo->page_lock);
+
+ return 0;
+}
+
+int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable)
+{
+ int ret = -ENOMEM;
+
+ linfo->kpgdir.cr3 = pgtable;
+ linfo->kpgdir.pgdir = (u64*)get_zeroed_page(GFP_KERNEL);
+ if (!linfo->kpgdir.pgdir)
+ return -ENOMEM;
+ linfo->kpgdir.flags |= LGUEST_PGD_BUSY_FL | LGUEST_PGD_MASTER_FL;
+ linfo->kpgdir.count = -1;
+
+ /*
+ * The list is used to update all the kernel page tables,
+ * so that they all have the same mappings.
+ */
+ list_add(&linfo->kpgdir.list, &linfo->pgd_list);
+
+ ret = lguest_map_hv_pages(linfo, lguest_hv_addr,
+ lguest_hv_pages, NULL);
+ if (ret < 0)
+ goto out;
+
+ return 0;
+ out:
+ free_page((u64)linfo->kpgdir.pgdir);
+
+ return ret;
+}
+
Index: work-pv/arch/x86_64/Makefile
==================================================================--- work-pv.orig/arch/x86_64/Makefile
+++ work-pv/arch/x86_64/Makefile
@@ -84,6 +84,7 @@ core-y += arch/x86_64/kernel/ \
core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
drivers-$(CONFIG_PCI) += arch/x86_64/pci/
drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
+drivers-$(CONFIG_LGUEST_GUEST) += arch/x86_64/lguest/
boot := arch/x86_64/boot
Index: work-pv/include/asm-x86_64/lguest.h
==================================================================--- /dev/null
+++ work-pv/include/asm-x86_64/lguest.h
@@ -0,0 +1,350 @@
+#ifndef _LGUEST_H_
+#define _LGUEST_H_
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include <linux/futex.h>
+#include <asm/lguest_user.h>
+
+/* XXX: Come up with better magic later on */
+#define LGUEST_MAGIC_R13 0x1
+#define LGUEST_MAGIC_R14 0x2
+#define LGUEST_MAGIC_R15 0x3
+
+#define LGUEST_MAX_VCPUS 64
+
+#define LGUEST_PGDS_PER_VCPU 8
+#define LGUEST_PGDIRS (LGUEST_MAX_VCPUS * LGUEST_PGDS_PER_VCPU)
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_CRASH 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_TIMER_READ 9
+#define LHCALL_TIMER_START 10
+#define LHCALL_HALT 11
+#define LHCALL_GET_WALLCLOCK 12
+#define LHCALL_BIND_DMA 13
+#define LHCALL_SEND_DMA 14
+#define LHCALL_FLUSH_TLB_SIG 15
+#define LHCALL_SET_PTE 16
+#define LHCALL_SET_PMD 17
+#define LHCALL_SET_PUD 18
+#define LHCALL_SET_PGD 19
+#define LHCALL_CLEAR_PTE 20
+#define LHCALL_CLEAR_PMD 21
+#define LHCALL_CLEAR_PUD 22
+#define LHCALL_CLEAR_PGD 23
+#define LHCALL_LOAD_TLS 24
+#define LHCALL_RDMSR 25
+#define LHCALL_WRMSR 26
+#define LHCALL_IRET 27
+
+#define LHCALL_PRINT 60
+#define LHCALL_DEBUG_ME 99
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ : "=a"(call)
+ : "a"(call), "d"(arg1), "b"(arg2),
"c"(arg3)
+ : "memory");
+ return call;
+}
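+
+/*
+ * Example: a synchronous hypercall, e.g. the timer start the guest
+ * issues from lguest_time_init():
+ *
+ *	hcall(LHCALL_TIMER_START, HZ, 0, 0);
+ *
+ * The call number travels in %rax (which also carries the return
+ * value); the three arguments go in %rdx, %rbx and %rcx, matching the
+ * constraints above.
+ */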
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+struct lguest_vcpu;
+
+struct lguest_dma_info
+{
+ struct list_head list;
+ union futex_key key;
+ unsigned long dmas;
+ u16 next_dma;
+ u16 num_dmas;
+ u32 guest_id;
+ u8 interrupt; /* 0 when not registered */
+};
+
+
+/* these must be powers of two */
+#define PUD_HASH_SIZE 256
+#define PMD_HASH_SIZE 256
+#define PTE_HASH_SIZE 256
+
+#define LGUEST_PGD_BUSY_FL (1<<0)
+#define LGUEST_PGD_MASTER_FL (1<<1)
+#define LGUEST_PGD_LINK_FL (1<<2)
+
+#define LGUEST_PUD_KERNEL_FL (1<<1)
+#define LGUEST_PMD_KERNEL_FL (1<<1)
+#define LGUEST_PTE_KERNEL_FL (1<<1)
+
+struct lguest_pgd {
+ struct list_head list;
+ u64 cr3;
+ u64 *pgdir;
+ u64 *user_pgdir;
+ unsigned count;
+ unsigned flags;
+};
+
+struct lguest_pud {
+ struct list_head list;
+ struct lguest_pgd *pgdir;
+ u64 gpud; /* guest pud */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_pmd {
+ struct list_head list;
+ struct lguest_pud *pudir;
+ u64 gpmd; /* guest pmd */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_pte {
+ struct list_head list;
+ struct lguest_pmd *pmdir;
+ u64 gpte; /* guest pte */
+ unsigned flags;
+ unsigned idx;
+};
+
+struct lguest_guest_info {
+ struct lguest_data __user *lguest_data;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ u32 guest_id;
+ u64 pfn_limit;
+ u64 start_kernel_map;
+ u64 page_offset;
+
+ int halted;
+ /* does it really belong here? */
+ char *dead;
+#if 0
+ unsigned long noirq_start, noirq_end;
+#endif
+ int dma_is_pending;
+ unsigned long pending_dma; /* struct lguest_dma */
+ unsigned long pending_addr; /* address they're sending to */
+
+ struct lguest_pgd kpgdir;
+ struct lguest_pgd pgdirs[LGUEST_PGDIRS];
+ struct list_head pgd_list;
+ struct list_head pud_hash[PUD_HASH_SIZE];
+ struct list_head pmd_hash[PMD_HASH_SIZE];
+ struct list_head pte_hash[PTE_HASH_SIZE];
+ struct mutex page_lock;
+
+ int timer_on;
+ int last_timer;
+
+ /* Cached wakeup: we hold a reference to this task. */
+ struct task_struct *wake;
+
+ struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+ struct lguest_vcpu *vcpu[LGUEST_MAX_VCPUS];
+};
+
+/* copied from old lguest code. Not sure if it's the best layout for us */
+struct lguest_regs
+{
+ u64 cr3; /* 0 ( 0x0) */
+ /* Manually saved part. */
+ u64 rbx, rcx, rdx; /* 8 ( 0x8) */
+ u64 rsi, rdi, rbp; /* 32 (0x20) */
+ u64 r8, r9, r10, r11; /* 56 (0x38) */
+ u64 r12, r13, r14, r15; /* 88 (0x58) */
+ u64 rax; /* 120 (0x78) */
+ u64 fs; /* ds; */ /* 128 (0x80) */
+ u64 trapnum, errcode; /* 136 (0x88) */
+ /* Trap pushed part */
+ u64 rip; /* 152 (0x98) */
+ u64 cs; /* 160 (0xa0) */
+ u64 rflags; /* 168 (0xa8) */
+ u64 rsp; /* 176 (0xb0) */
+ u64 ss; /* Crappy Segment! */ /* 184 (0xb8) */
+ /* size = 192 (0xc0) */
+ char size[0];
+};
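+
+/*
+ * Illustrative only: assembly entry code presumably relies on the
+ * offsets noted in the comments above, so a compile-time check (e.g.
+ * in core.c) could assert the layout, assuming those offsets are the
+ * intended ABI:
+ *
+ *	BUILD_BUG_ON(offsetof(struct lguest_regs, rip) != 0x98);
+ *	BUILD_BUG_ON(sizeof(struct lguest_regs) != 0xc0);
+ */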
+
+struct lguest_tss_struct {
+ u32 reserved1;
+ u64 rsp0;
+ u64 rsp1;
+ u64 rsp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+ /* we don't let the guest have io privileges (yet) */
+ unsigned long io_bitmap[1];
+} __attribute__((packed)) ____cacheline_aligned;
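+
+/*
+ * Illustrative only: this mirrors the hardware TSS layout, so a sanity
+ * check along these lines could catch padding mistakes (0x66 is the
+ * architectural offset of the I/O map base):
+ *
+ *	BUILD_BUG_ON(offsetof(struct lguest_tss_struct, io_bitmap_base) != 0x66);
+ */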
+
+struct lguest_vcpu {
+ unsigned long host_syscall;
+ unsigned long guest_syscall;
+
+	/* regs + sizeof(regs) must be 16-byte aligned */
+ struct lguest_regs regs;
+
+ struct lguest_vcpu *vcpu; /* pointer to itself */
+ unsigned long debug;
+ unsigned long magic;
+ unsigned int id;
+ unsigned long host_stack;
+ unsigned long guest_stack;
+ unsigned long host_cr3;
+ unsigned long host_page;
+ struct desc_ptr host_gdt;
+ u16 host_gdt_buff[3];
+ struct desc_ptr host_idt;
+ u16 host_idt_buff[3];
+ unsigned long host_gdt_ptr;
+ /* Save rax on interrupts, it's used for iret hcall */
+ unsigned long rax;
+
+	/* Saved host gs base pointer */
+ unsigned long host_gs_a;
+ unsigned long host_gs_d;
+
+ /* save host process gs base pointer */
+ unsigned long host_proc_gs_a;
+ unsigned long host_proc_gs_d;
+
+ /* save guest gs base pointer */
+ unsigned long guest_gs_a;
+ unsigned long guest_gs_d;
+
+ /* used for guest calling swapgs */
+ unsigned long guest_gs_shadow_a;
+ unsigned long guest_gs_shadow_d;
+
+ struct lguest_pgd *pgdir;
+
+ struct desc_ptr gdt; /* address of the GDT at this vcpu */
+ u16 gdt_buff[3];
+ struct desc_struct gdt_table[GDT_ENTRIES];
+
+ struct desc_ptr idt; /* address of the IDT at this vcpu */
+ u16 idt_buff[3];
+ struct gate_struct idt_table[IDT_ENTRIES];
+
+ struct lguest_guest_info *guest;
+
+ struct lguest_tss_struct tss;
+
+ unsigned long ts;
+
+ /* host ist 7 - we use it to prevent the NMI race */
+ unsigned long host_ist;
+
+ /* only for those above FIRST_EXTERNAL_VECTOR */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+	/* These are general: we catch every possible interrupt */
+ DECLARE_BITMAP(interrupt_disabled, LGUEST_IRQS + FIRST_EXTERNAL_VECTOR);
+ unsigned long interrupt[LGUEST_IRQS + FIRST_EXTERNAL_VECTOR];
+
+ /* nmi trampoline storage */
+
+ struct lguest_regs nmi_regs;
+ unsigned long nmi_gs_a;
+ unsigned long nmi_gs_d;
+ unsigned long nmi_gs_shadow_a;
+ unsigned long nmi_gs_shadow_d;
+ struct desc_ptr nmi_gdt;
+ u16 nmi_gdt_buff[3];
+
+ /* set when we take an nmi */
+ unsigned long nmi_sw;
+
+ /* is this enough? */
+ char nmi_stack[1048];
+ char nmi_stack_end[0];
+ char gpf_stack[1048];
+ char gpf_stack_end[0];
+ char df_stack[1048];
+ char df_stack_end[0];
+};
+
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+ u32 eax, edx, ebx, ecx;
+};
+
+struct lguest_text_ptr {
+ unsigned long next; /* guest pa address of next pointer */
+ unsigned long start;
+ unsigned long end;
+};
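+
+/*
+ * Illustrative sketch only: given the guest-physical address of the
+ * first entry (the guest publishes it in lguest_data.text below), the
+ * host could walk the chain with lhread(), assuming the list ends with
+ * a zero next pointer ("first_gpa" and "linfo" are placeholder names):
+ *
+ *	struct lguest_text_ptr ent;
+ *	u64 gpa = first_gpa;
+ *
+ *	while (gpa) {
+ *		lhread(linfo, &ent, gpa, sizeof(ent));
+ *		(ent.start .. ent.end is one guest text range)
+ *		gpa = ent.next;
+ *	}
+ */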
+
+struct lguest_data
+{
+/* Fields which change during running: */
+ /* 512 == enabled (same as eflags) */
+ unsigned int irq_enabled;
+ /* Blocked interrupts. */
+ DECLARE_BITMAP(interrupts, LGUEST_IRQS);
+
+ /* Last (userspace) address we got a GPF & reloaded gs. */
+ unsigned int gs_gpf_eip;
+
+ /* Virtual address of page fault. */
+ unsigned long cr2;
+
+ /* Async hypercall ring. 0xFF == done, 0 == pending. */
+ u8 hcall_status[LHCALL_RING_SIZE];
+ struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+ /* Memory not to try to access */
+ unsigned long reserve_mem;
+ /* ID of this guest (used by network driver to set ethernet address) */
+ u32 guest_id;
+
+/* Fields initialized by the guest at boot: */
+ /* Instruction range to suppress interrupts even if enabled */
+#if 0
+ unsigned long noirq_start, noirq_end;
+#endif
+ unsigned long start_kernel_map;
+ unsigned long page_offset;
+ unsigned long text; /* pa address of lguest_text_ptr addresses */
+
+/* If the kernel has kallsyms, we can use it to do backtraces of a guest */
+ unsigned long kallsyms_addresses;
+ unsigned long kallsyms_num_syms;
+ unsigned long kallsyms_names;
+ unsigned long kallsyms_token_table;
+ unsigned long kallsyms_token_index;
+ unsigned long kallsyms_markers;
+
+ unsigned long return_address;
+};
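+
+/*
+ * Illustrative sketch only (the real guest-side async_hcall() is not in
+ * this header): given the ring fields above and a guest-side slot index
+ * "next_call" (a placeholder name), queueing an asynchronous hypercall
+ * might look roughly like this; if the slot is not yet marked 0xFF
+ * (done), the guest would first drain the ring with a synchronous
+ * hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0):
+ *
+ *	lguest_data.hcalls[next_call].eax = call;
+ *	lguest_data.hcalls[next_call].edx = arg1;
+ *	lguest_data.hcalls[next_call].ebx = arg2;
+ *	lguest_data.hcalls[next_call].ecx = arg3;
+ *	wmb();
+ *	lguest_data.hcall_status[next_call] = 0;
+ *	next_call = (next_call + 1) % LHCALL_RING_SIZE;
+ */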
+
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+int run_guest(struct lguest_vcpu *vcpu, char *__user user);
+
+#endif
--