Index: 2006-10-04/config/x86_64.mk ==================================================================--- 2006-10-04.orig/config/x86_64.mk 2006-07-17 08:27:43.000000000 +0200 +++ 2006-10-04/config/x86_64.mk 2006-10-04 15:03:07.000000000 +0200 @@ -1,4 +1,5 @@ CONFIG_X86 := y +CONFIG_COMPAT := y CONFIG_HVM := y CONFIG_MIGRATE := y CONFIG_XCUTILS := y Index: 2006-10-04/xen/arch/x86/boot/x86_64.S ==================================================================--- 2006-10-04.orig/xen/arch/x86/boot/x86_64.S 2006-09-21 11:05:00.000000000 +0200 +++ 2006-10-04/xen/arch/x86/boot/x86_64.S 2006-10-04 15:03:07.000000000 +0200 @@ -223,15 +223,34 @@ high_start: .align PAGE_SIZE, 0 ENTRY(gdt_table) .quad 0x0000000000000000 /* unused */ - .quad 0x00cf9a000000ffff /* 0xe008 ring 0 code, compatibility */ - .quad 0x00af9a000000ffff /* 0xe010 ring 0 code, 64-bit mode */ - .quad 0x00cf92000000ffff /* 0xe018 ring 0 data */ + .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ + .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ + .quad 0x0000000000000000 /* reserved */ .quad 0x00cffa000000ffff /* 0xe023 ring 3 code, compatibility */ .quad 0x00cff2000000ffff /* 0xe02b ring 3 data */ .quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */ - .quad 0x0000000000000000 /* unused */ + .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ + .org gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8 .fill 4*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ +#ifdef CONFIG_COMPAT + .align PAGE_SIZE, 0 +/* NB. Even rings != 0 get access to the full 4Gb, as only the */ +/* (compatibility) machine->physical mapping table lives there. 
*/ +ENTRY(compat_gdt_table) + .quad 0x0000000000000000 /* unused */ + .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ + .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ + .quad 0x00cfba000000ffff /* 0xe019 ring 1 code, compatibility */ + .quad 0x00cfb2000000ffff /* 0xe021 ring 1 data */ + .quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */ + .quad 0x00cff2000000ffff /* 0xe033 ring 3 data */ + .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ + .org compat_gdt_table - FIRST_RESERVED_GDT_BYTE + __TSS(0) * 8 + .fill 4*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ +# undef LIMIT +#endif + /* Initial PML4 -- level-4 page table. */ .align PAGE_SIZE, 0 ENTRY(idle_pg_table) Index: 2006-10-04/xen/arch/x86/domain.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/domain.c 2006-10-04 09:27:29.000000000 +0200 +++ 2006-10-04/xen/arch/x86/domain.c 2006-10-04 15:03:07.000000000 +0200 @@ -263,17 +263,18 @@ int arch_set_info_guest( if ( !(c->flags & VGCF_HVM_GUEST) ) { - fixup_guest_stack_selector(c->user_regs.ss); - fixup_guest_stack_selector(c->kernel_ss); - fixup_guest_code_selector(c->user_regs.cs); - -#ifdef __i386__ - fixup_guest_code_selector(c->event_callback_cs); - fixup_guest_code_selector(c->failsafe_callback_cs); -#endif + fixup_guest_stack_selector(d, c->user_regs.ss); + fixup_guest_stack_selector(d, c->kernel_ss); + fixup_guest_code_selector(d, c->user_regs.cs); + + if ( CONFIG_PAGING_LEVELS < 4 || IS_COMPAT(d) ) + { + fixup_guest_code_selector(d, c->event_callback_cs); + fixup_guest_code_selector(d, c->failsafe_callback_cs); + } for ( i = 0; i < 256; i++ ) - fixup_guest_code_selector(c->trap_ctxt[i].cs); + fixup_guest_code_selector(d, c->trap_ctxt[i].cs); } else if ( !hvm_enabled ) return -EINVAL; @@ -422,9 +423,11 @@ void new_thread(struct vcpu *d, * ESI = start_info * [EAX,EBX,ECX,EDX,EDI,EBP are zero] */ - regs->ds = regs->es = regs->fs = regs->gs = FLAT_KERNEL_DS; - 
regs->ss = FLAT_KERNEL_SS; - regs->cs = FLAT_KERNEL_CS; + regs->ds = regs->es = regs->fs = regs->gs = !IS_COMPAT(d->domain) + ? FLAT_KERNEL_DS + : FLAT_COMPAT_KERNEL_DS; + regs->ss = !IS_COMPAT(d->domain) ? FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS; + regs->cs = !IS_COMPAT(d->domain) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS; regs->eip = start_pc; regs->esp = start_stack; regs->esi = start_info; @@ -503,27 +506,30 @@ static void load_segments(struct vcpu *n all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs); } - /* This can only be non-zero if selector is NULL. */ - if ( nctxt->fs_base ) - wrmsr(MSR_FS_BASE, - nctxt->fs_base, - nctxt->fs_base>>32); - - /* Most kernels have non-zero GS base, so don't bother testing. */ - /* (This is also a serialising instruction, avoiding AMD erratum #88.) */ - wrmsr(MSR_SHADOW_GS_BASE, - nctxt->gs_base_kernel, - nctxt->gs_base_kernel>>32); - - /* This can only be non-zero if selector is NULL. */ - if ( nctxt->gs_base_user ) - wrmsr(MSR_GS_BASE, - nctxt->gs_base_user, - nctxt->gs_base_user>>32); - - /* If in kernel mode then switch the GS bases around. */ - if ( n->arch.flags & TF_kernel_mode ) - __asm__ __volatile__ ( "swapgs" ); + if ( !IS_COMPAT(n->domain) ) + { + /* This can only be non-zero if selector is NULL. */ + if ( nctxt->fs_base ) + wrmsr(MSR_FS_BASE, + nctxt->fs_base, + nctxt->fs_base>>32); + + /* Most kernels have non-zero GS base, so don't bother testing. */ + /* (This is also a serialising instruction, avoiding AMD erratum #88.) */ + wrmsr(MSR_SHADOW_GS_BASE, + nctxt->gs_base_kernel, + nctxt->gs_base_kernel>>32); + + /* This can only be non-zero if selector is NULL. */ + if ( nctxt->gs_base_user ) + wrmsr(MSR_GS_BASE, + nctxt->gs_base_user, + nctxt->gs_base_user>>32); + + /* If in kernel mode then switch the GS bases around. 
*/ + if ( (n->arch.flags & TF_kernel_mode) ) + __asm__ __volatile__ ( "swapgs" ); + } if ( unlikely(!all_segs_okay) ) { @@ -534,6 +540,54 @@ static void load_segments(struct vcpu *n (unsigned long *)nctxt->kernel_sp; unsigned long cs_and_mask, rflags; + if ( IS_COMPAT(n->domain) ) + { + unsigned int *esp = ring_1(regs) ? + (unsigned int *)regs->rsp : + (unsigned int *)nctxt->kernel_sp; + unsigned int cs_and_mask, eflags; + int ret = 0; + + /* CS longword also contains full evtchn_upcall_mask. */ + cs_and_mask = (unsigned short)regs->cs | + ((unsigned int)n->vcpu_info->evtchn_upcall_mask << 16); + /* Fold upcall mask into RFLAGS.IF. */ + eflags = regs->_eflags & ~X86_EFLAGS_IF; + eflags |= !n->vcpu_info->evtchn_upcall_mask << 9; + + if ( !ring_1(regs) ) + { + ret = put_user(regs->ss, esp-1); + ret |= put_user(regs->_esp, esp-2); + esp -= 2; + } + + if ( ret | + put_user(eflags, esp-1) | + put_user(cs_and_mask, esp-2) | + put_user(regs->_eip, esp-3) | + put_user(nctxt->user_regs.gs, esp-4) | + put_user(nctxt->user_regs.fs, esp-5) | + put_user(nctxt->user_regs.es, esp-6) | + put_user(nctxt->user_regs.ds, esp-7) ) + { + DPRINTK("Error while creating failsafe callback frame.\n"); + domain_crash(n->domain); + } + + if ( test_bit(_VGCF_failsafe_disables_events, + &n->arch.guest_context.flags) ) + n->vcpu_info->evtchn_upcall_mask = 1; + + regs->entry_vector = TRAP_syscall; + regs->_eflags &= 0xFFFCBEFFUL; + regs->ss = FLAT_COMPAT_KERNEL_SS; + regs->_esp = (unsigned long)(esp-7); + regs->cs = FLAT_COMPAT_KERNEL_CS; + regs->_eip = nctxt->failsafe_callback_eip; + return; + } + if ( !(n->arch.flags & TF_kernel_mode) ) toggle_guest_mode(n); else @@ -594,7 +648,7 @@ static void save_segments(struct vcpu *v if ( regs->es ) dirty_segment_mask |= DIRTY_ES; - if ( regs->fs ) + if ( regs->fs || IS_COMPAT(v->domain) ) { dirty_segment_mask |= DIRTY_FS; ctxt->fs_base = 0; /* != 0 selector kills fs_base */ @@ -604,7 +658,7 @@ static void save_segments(struct vcpu *v dirty_segment_mask |= 
DIRTY_FS_BASE; } - if ( regs->gs ) + if ( regs->gs || IS_COMPAT(v->domain) ) { dirty_segment_mask |= DIRTY_GS; ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */ @@ -736,6 +790,21 @@ void context_switch(struct vcpu *prev, s { __context_switch(); + if ( IS_COMPAT(prev->domain) != IS_COMPAT(next->domain) ) + { + uint32_t efer_lo, efer_hi; + + local_flush_tlb_one(GDT_VIRT_START(next) + + FIRST_RESERVED_GDT_BYTE); + + rdmsr(MSR_EFER, efer_lo, efer_hi); + if ( !IS_COMPAT(next->domain) == !(efer_lo & EFER_SCE) ) + { + efer_lo ^= EFER_SCE; + wrmsr(MSR_EFER, efer_lo, efer_hi); + } + } + /* Re-enable interrupts before restoring state which may fault. */ local_irq_enable(); @@ -948,6 +1017,10 @@ void domain_relinquish_resources(struct put_page(mfn_to_page(pfn)); else put_page_and_type(mfn_to_page(pfn)); +#ifdef __x86_64__ + if ( pfn == pagetable_get_pfn(v->arch.guest_table_user) ) + v->arch.guest_table_user = pagetable_null(); +#endif v->arch.guest_table = pagetable_null(); } Index: 2006-10-04/xen/arch/x86/domain_build.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/domain_build.c 2006-09-21 10:56:11.000000000 +0200 +++ 2006-10-04/xen/arch/x86/domain_build.c 2006-10-04 15:03:07.000000000 +0200 @@ -316,11 +316,39 @@ int construct_dom0(struct domain *d, else nr_pages = dom0_nrpages; - if ( (rc = parseelfimage(&dsi)) != 0 ) + rc = parseelfimage(&dsi); +#ifdef CONFIG_COMPAT + if ( rc == -ENOSYS + && (rc = parseelf32image(&dsi)) == 0 ) + { + l1_pgentry_t gdt_l1e; + + set_bit(_DOMF_compat, &d->domain_flags); + + if ( nr_pages != (unsigned int)nr_pages ) + nr_pages = UINT_MAX; + + /* + * Map compatibility Xen segments into every VCPU's GDT. See + * arch_domain_create() for further comments. 
+ */ + gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), + PAGE_HYPERVISOR); + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) + + FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; + local_flush_tlb_one(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE); + } +#endif + if ( rc != 0) + { + if ( rc == -ENOSYS ) + printk("DOM0 image is not a Xen-compatible Elf image.\n"); return rc; + } dom0_pae = (dsi.pae_kernel != PAEKERN_no); - xen_pae = (CONFIG_PAGING_LEVELS == 3); + xen_pae = (CONFIG_PAGING_LEVELS == 3) || IS_COMPAT(d); if ( dom0_pae != xen_pae ) { printk("PAE mode mismatch between Xen and DOM0 (xen=%s, dom0=%s)\n", @@ -331,7 +359,13 @@ int construct_dom0(struct domain *d, if ( xen_pae && dsi.pae_kernel == PAEKERN_extended_cr3 ) set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist); - if ( (p = xen_elfnote_string(&dsi, XEN_ELFNOTE_FEATURES)) != NULL ) +#ifdef CONFIG_COMPAT + if ( IS_COMPAT(d) ) + p = xen_elf32note_string(&dsi, XEN_ELFNOTE_FEATURES); + else +#endif + p = xen_elfnote_string(&dsi, XEN_ELFNOTE_FEATURES); + if ( p != NULL ) { parse_features(p, dom0_features_supported, @@ -444,9 +478,9 @@ int construct_dom0(struct domain *d, * We're basically forcing default RPLs to 1, so that our "what privilege * level are we returning to?" logic works. */ - v->arch.guest_context.kernel_ss = FLAT_KERNEL_SS; + v->arch.guest_context.kernel_ss = !IS_COMPAT(d) ? FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS; for ( i = 0; i < 256; i++ ) - v->arch.guest_context.trap_ctxt[i].cs = FLAT_KERNEL_CS; + v->arch.guest_context.trap_ctxt[i].cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS; #if defined(__i386__) @@ -595,6 +629,12 @@ int construct_dom0(struct domain *d, return -EINVAL; } + if ( IS_COMPAT(d) ) + { + v->arch.guest_context.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS; + v->arch.guest_context.event_callback_cs = FLAT_COMPAT_KERNEL_CS; + } + /* WARNING: The new domain must have its 'processor' field filled in! 
*/ maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table; l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; @@ -604,6 +644,8 @@ int construct_dom0(struct domain *d, l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); + if ( IS_COMPAT(d) ) + v->arch.guest_table_user = v->arch.guest_table; l4tab += l4_table_offset(dsi.v_start); mfn = alloc_spfn; @@ -714,8 +756,19 @@ int construct_dom0(struct domain *d, write_ptbase(v); /* Copy the OS image and free temporary buffer. */ +#ifdef CONFIG_COMPAT + if ( IS_COMPAT(d) ) + (void)loadelf32image(&dsi); + else +#endif (void)loadelfimage(&dsi); +#ifdef CONFIG_COMPAT + if ( IS_COMPAT(d) ) + hypercall_page = + xen_elf32note_numeric(&dsi, XEN_ELFNOTE_HYPERCALL_PAGE, &hypercall_page_defined); + else +#endif hypercall_page = xen_elfnote_numeric(&dsi, XEN_ELFNOTE_HYPERCALL_PAGE, &hypercall_page_defined); if ( hypercall_page_defined ) @@ -750,7 +803,7 @@ int construct_dom0(struct domain *d, si->mfn_list = vphysmap_start; sprintf(si->magic, "xen-%i.%i-x86_%d%s", xen_major_version(), xen_minor_version(), - BITS_PER_LONG, xen_pae ? "p" : ""); + !IS_COMPAT(d) ? BITS_PER_LONG : 32, xen_pae ? "p" : ""); /* Write the phys->machine and machine->phys table entries. 
*/ for ( pfn = 0; pfn < d->tot_pages; pfn++ ) @@ -914,13 +967,28 @@ int elf_sanity_check(Elf_Ehdr *ehdr) (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) || (ehdr->e_type != ET_EXEC) ) { - printk("DOM0 image is not a Xen-compatible Elf image.\n"); return 0; } return 1; } +#ifdef CONFIG_COMPAT +int elf32_sanity_check(Elf32_Ehdr *ehdr) +{ + if ( !IS_ELF(*ehdr) || + (ehdr->e_ident[EI_CLASS] != ELFCLASS32) || + (ehdr->e_machine != EM_386) || + (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) || + (ehdr->e_type != ET_EXEC) ) + { + return 0; + } + + return 1; +} +#endif + /* * Local variables: * mode: C Index: 2006-10-04/xen/arch/x86/mm.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/mm.c 2006-10-04 09:27:53.000000000 +0200 +++ 2006-10-04/xen/arch/x86/mm.c 2006-10-04 15:03:07.000000000 +0200 @@ -407,7 +407,7 @@ static int alloc_segdesc_page(struct pag descs = map_domain_page(page_to_mfn(page)); for ( i = 0; i < 512; i++ ) - if ( unlikely(!check_descriptor(&descs[i])) ) + if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) ) goto fail; unmap_domain_page(descs); @@ -2848,7 +2848,7 @@ long do_update_descriptor(u64 pa, u64 de if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) || (((unsigned int)pa % sizeof(struct desc_struct)) != 0) || !mfn_valid(mfn) || - !check_descriptor(&d) ) + !check_descriptor(dom, &d) ) { UNLOCK_BIGLOCK(dom); return -EINVAL; Index: 2006-10-04/xen/arch/x86/traps.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/traps.c 2006-10-04 09:28:00.000000000 +0200 +++ 2006-10-04/xen/arch/x86/traps.c 2006-10-04 15:03:07.000000000 +0200 @@ -1812,6 +1812,13 @@ void set_tss_desc(unsigned int n, void * (unsigned long)addr, offsetof(struct tss_struct, __cacheline_filler) - 1, 9); +#ifdef CONFIG_COMPAT + _set_tssldt_desc( + compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY, + (unsigned long)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, + 11); +#endif } 
void __init trap_init(void) @@ -1886,7 +1893,7 @@ long do_set_trap_table(XEN_GUEST_HANDLE( if ( cur.address == 0 ) break; - fixup_guest_code_selector(cur.cs); + fixup_guest_code_selector(current->domain, cur.cs); memcpy(&dst[cur.vector], &cur, sizeof(cur)); Index: 2006-10-04/xen/arch/x86/x86_32/mm.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/x86_32/mm.c 2006-08-23 11:24:59.000000000 +0200 +++ 2006-10-04/xen/arch/x86/x86_32/mm.c 2006-10-04 15:03:07.000000000 +0200 @@ -227,7 +227,7 @@ long do_stack_switch(unsigned long ss, u int nr = smp_processor_id(); struct tss_struct *t = &init_tss[nr]; - fixup_guest_stack_selector(ss); + fixup_guest_stack_selector(current->domain, ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; @@ -238,7 +238,7 @@ long do_stack_switch(unsigned long ss, u } /* Returns TRUE if given descriptor is valid for GDT or LDT. */ -int check_descriptor(struct desc_struct *d) +int check_descriptor(const struct domain *dom, struct desc_struct *d) { unsigned long base, limit; u32 a = d->a, b = d->b; @@ -258,8 +258,8 @@ int check_descriptor(struct desc_struct * gates (consider a call gate pointing at another kernel descriptor with * DPL 0 -- this would get the OS ring-0 privileges). */ - if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL << 13) ) - d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL << 13); + if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) ) + d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13); if ( !(b & _SEGMENT_S) ) { @@ -281,8 +281,8 @@ int check_descriptor(struct desc_struct /* Validate and fix up the target code selector. 
*/ cs = a >> 16; - fixup_guest_code_selector(cs); - if ( !guest_gate_selector_okay(cs) ) + fixup_guest_code_selector(dom, cs); + if ( !guest_gate_selector_okay(dom, cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); Index: 2006-10-04/xen/arch/x86/x86_32/traps.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/x86_32/traps.c 2006-09-21 10:56:11.000000000 +0200 +++ 2006-10-04/xen/arch/x86/x86_32/traps.c 2006-10-04 15:03:07.000000000 +0200 @@ -323,7 +323,7 @@ void init_int80_direct_trap(struct vcpu * switch to the Xen stack and we need to swap back to the guest * kernel stack before passing control to the system call entry point. */ - if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) || + if ( TI_GET_IF(ti) || !guest_gate_selector_okay(v->domain, ti->cs) || supervisor_mode_kernel ) { v->arch.int80_desc.a = v->arch.int80_desc.b = 0; @@ -353,7 +353,7 @@ static long register_guest_callback(stru long ret = 0; struct vcpu *v = current; - fixup_guest_code_selector(reg->address.cs); + fixup_guest_code_selector(v->domain, reg->address.cs); switch ( reg->type ) { Index: 2006-10-04/xen/arch/x86/x86_64/asm-offsets.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/x86_64/asm-offsets.c 2006-09-25 14:59:15.000000000 +0200 +++ 2006-10-04/xen/arch/x86/x86_64/asm-offsets.c 2006-10-04 15:03:07.000000000 +0200 @@ -58,12 +58,16 @@ void __dummy__(void) OFFSET(VCPU_thread_flags, struct vcpu, arch.flags); OFFSET(VCPU_event_addr, struct vcpu, arch.guest_context.event_callback_eip); + OFFSET(VCPU_event_sel, struct vcpu, + arch.guest_context.event_callback_cs); OFFSET(VCPU_failsafe_addr, struct vcpu, arch.guest_context.failsafe_callback_eip); + OFFSET(VCPU_failsafe_sel, struct vcpu, + arch.guest_context.failsafe_callback_cs); OFFSET(VCPU_syscall_addr, struct vcpu, arch.guest_context.syscall_callback_eip); - OFFSET(VCPU_kernel_sp, struct vcpu, - arch.guest_context.kernel_sp); + 
OFFSET(VCPU_kernel_sp, struct vcpu, arch.guest_context.kernel_sp); + OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags); OFFSET(VCPU_arch_guest_fpu_ctxt, struct vcpu, arch.guest_context.fpu_ctxt); OFFSET(VCPU_flags, struct vcpu, vcpu_flags); Index: 2006-10-04/xen/arch/x86/x86_64/mm.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/x86_64/mm.c 2006-09-21 13:23:19.000000000 +0200 +++ 2006-10-04/xen/arch/x86/x86_64/mm.c 2006-10-04 15:03:07.000000000 +0200 @@ -224,7 +224,7 @@ long subarch_memory_op(int op, XEN_GUEST long do_stack_switch(unsigned long ss, unsigned long esp) { - fixup_guest_stack_selector(ss); + fixup_guest_stack_selector(current->domain, ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; return 0; @@ -284,7 +284,7 @@ long do_set_segment_base(unsigned int wh /* Returns TRUE if given descriptor is valid for GDT or LDT. */ -int check_descriptor(struct desc_struct *d) +int check_descriptor(const struct domain *dom, struct desc_struct *d) { u32 a = d->a, b = d->b; u16 cs; @@ -294,8 +294,8 @@ int check_descriptor(struct desc_struct goto good; /* Check and fix up the DPL. */ - if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL << 13) ) - d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL << 13); + if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) ) + d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13); /* All code and data segments are okay. No base/limit checking. */ if ( (b & _SEGMENT_S) ) @@ -311,8 +311,8 @@ int check_descriptor(struct desc_struct /* Validate and fix up the target code selector. 
*/ cs = a >> 16; - fixup_guest_code_selector(cs); - if ( !guest_gate_selector_okay(cs) ) + fixup_guest_code_selector(dom, cs); + if ( !guest_gate_selector_okay(dom, cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); Index: 2006-10-04/xen/arch/x86/x86_64/traps.c ==================================================================--- 2006-10-04.orig/xen/arch/x86/x86_64/traps.c 2006-10-04 09:17:44.000000000 +0200 +++ 2006-10-04/xen/arch/x86/x86_64/traps.c 2006-10-04 15:03:07.000000000 +0200 @@ -187,6 +187,8 @@ asmlinkage void do_double_fault(struct c void toggle_guest_mode(struct vcpu *v) { + if ( IS_COMPAT(v->domain) ) + return; v->arch.flags ^= TF_kernel_mode; __asm__ __volatile__ ( "swapgs" ); update_cr3(v); Index: 2006-10-04/xen/common/Makefile ==================================================================--- 2006-10-04.orig/xen/common/Makefile 2006-08-28 08:32:38.000000000 +0200 +++ 2006-10-04/xen/common/Makefile 2006-10-04 15:03:07.000000000 +0200 @@ -3,6 +3,7 @@ obj-y += bitmap.o obj-y += domctl.o obj-y += domain.o obj-y += elf.o +obj-$(CONFIG_COMPAT) += elf32.o obj-y += event_channel.o obj-y += grant_table.o obj-y += kernel.o Index: 2006-10-04/xen/common/elf.c ==================================================================--- 2006-10-04.orig/xen/common/elf.c 2006-08-28 08:32:38.000000000 +0200 +++ 2006-10-04/xen/common/elf.c 2006-10-04 15:03:07.000000000 +0200 @@ -202,7 +202,7 @@ int parseelfimage(struct domain_setup_in int h, virt_base_defined, elf_pa_off_defined, virt_entry_defined; if ( !elf_sanity_check(ehdr) ) - return -EINVAL; + return -ENOSYS; if ( (ehdr->e_phoff + (ehdr->e_phnum*ehdr->e_phentsize)) > image_len ) { Index: 2006-10-04/xen/common/elf32.c ==================================================================--- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2006-10-04/xen/common/elf32.c 2006-10-04 15:03:07.000000000 +0200 @@ -0,0 +1,19 @@ +/****************************************************************************** + * elf32.c 
+ * + * Stub to support 32-bit ELF images on 64-bit platforms. + */ + +#include <xen/config.h> +#undef ELFSIZE +#define ELFSIZE 32 +#include <xen/types.h> +#include <xen/elf.h> + +#define xen_elfnote_string xen_elf32note_string +#define xen_elfnote_numeric xen_elf32note_numeric +#define parseelfimage parseelf32image +#define loadelfimage loadelf32image +#define elf_sanity_check elf32_sanity_check + +#include "elf.c" Index: 2006-10-04/xen/include/asm-x86/config.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/config.h 2006-10-04 09:17:44.000000000 +0200 +++ 2006-10-04/xen/include/asm-x86/config.h 2006-10-04 15:03:07.000000000 +0200 @@ -93,6 +93,7 @@ static inline void FORCE_CRASH(void) #if defined(__x86_64__) #define CONFIG_X86_64 1 +#define CONFIG_COMPAT 1 #define asmlinkage @@ -187,13 +188,21 @@ static inline void FORCE_CRASH(void) #define DIRECTMAP_VIRT_START (PML4_ADDR(262)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2) +#define __HYPERVISOR_COMPAT_VIRT_START 0xF5800000 +#define __MACH2PHYS_COMPAT_VIRT_START 0xF5800000 +#define __MACH2PHYS_COMPAT_VIRT_END 0xF6800000 /* XXX bump this ? */ +#define HYPERVISOR_COMPAT_VIRT_START mk_unsigned_long(__HYPERVISOR_COMPAT_VIRT_START) +#define MACH2PHYS_COMPAT_VIRT_START mk_unsigned_long(__MACH2PHYS_COMPAT_VIRT_START) +#define MACH2PHYS_COMPAT_VIRT_END mk_unsigned_long(__MACH2PHYS_COMPAT_VIRT_END) +#define MACH2PHYS_COMPAT_NR_ENTRIES ((MACH2PHYS_COMPAT_VIRT_END-MACH2PHYS_COMPAT_VIRT_START)>>2) + #define PGT_base_page_table PGT_l4_page_table -#define __HYPERVISOR_CS64 0xe010 -#define __HYPERVISOR_CS32 0xe008 +#define __HYPERVISOR_CS64 0xe008 +#define __HYPERVISOR_CS32 0xe038 #define __HYPERVISOR_CS __HYPERVISOR_CS64 #define __HYPERVISOR_DS64 0x0000 -#define __HYPERVISOR_DS32 0xe018 +#define __HYPERVISOR_DS32 0xe010 #define __HYPERVISOR_DS __HYPERVISOR_DS64 /* For generic assembly code: use macros to define operation/operand sizes. 
*/ Index: 2006-10-04/xen/include/asm-x86/desc.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/desc.h 2006-09-21 11:04:30.000000000 +0200 +++ 2006-10-04/xen/include/asm-x86/desc.h 2006-10-04 15:03:07.000000000 +0200 @@ -18,31 +18,76 @@ #define LDT_ENTRY_SIZE 8 +#if defined(__x86_64__) + +#define FLAT_COMPAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_COMPAT_RING1_DS 0xe021 /* GDT index 260 */ +#define FLAT_COMPAT_RING1_SS 0xe021 /* GDT index 260 */ +#define FLAT_COMPAT_RING3_CS 0xe02b /* GDT index 261 */ +#define FLAT_COMPAT_RING3_DS 0xe033 /* GDT index 262 */ +#define FLAT_COMPAT_RING3_SS 0xe033 /* GDT index 262 */ + +#define FLAT_COMPAT_KERNEL_DS FLAT_COMPAT_RING1_DS +#define FLAT_COMPAT_KERNEL_CS FLAT_COMPAT_RING1_CS +#define FLAT_COMPAT_KERNEL_SS FLAT_COMPAT_RING1_SS +#define FLAT_COMPAT_USER_DS FLAT_COMPAT_RING3_DS +#define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS +#define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS + +#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) +#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2) + +#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) +#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) + +#elif defined(__i386__) + +#define FLAT_COMPAT_KERNEL_CS FLAT_KERNEL_CS +#define FLAT_COMPAT_KERNEL_DS FLAT_KERNEL_DS +#define FLAT_COMPAT_KERNEL_SS FLAT_KERNEL_SS +#define FLAT_COMPAT_USER_CS FLAT_USER_CS +#define FLAT_COMPAT_USER_DS FLAT_USER_DS +#define FLAT_COMPAT_USER_SS FLAT_USER_SS + +#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY + +#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) +#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1) + +#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY) +#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY) + +#endif + +#ifndef __ASSEMBLY__ + #define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : : "a" (__TSS(n)<<3) ) #if defined(__x86_64__) -#define GUEST_KERNEL_RPL 3 +#define GUEST_KERNEL_RPL(d) (!IS_COMPAT(d) ? 
3 : 1) #elif defined(__i386__) -#define GUEST_KERNEL_RPL 1 +#define GUEST_KERNEL_RPL(d) ((void)(d), 1) #endif /* Fix up the RPL of a guest segment selector. */ -#define __fixup_guest_selector(sel) \ - ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) : \ - (((sel) & ~3) | GUEST_KERNEL_RPL)) +#define __fixup_guest_selector(d, sel) \ +({ \ + uint16_t _rpl = GUEST_KERNEL_RPL(d); \ + (sel) = (((sel) & 3) >= _rpl) ? (sel) : (((sel) & ~3) | _rpl); \ +}) /* Stack selectors don't need fixing up if the kernel runs in ring 0. */ #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL -#define fixup_guest_stack_selector(ss) ((void)0) +#define fixup_guest_stack_selector(d, ss) ((void)0) #else -#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss) +#define fixup_guest_stack_selector(d, ss) __fixup_guest_selector(d, ss) #endif /* * Code selectors are always fixed up. It allows the Xen exit stub to detect * return to guest context, even when the guest kernel runs in ring 0. */ -#define fixup_guest_code_selector(cs) __fixup_guest_selector(cs) +#define fixup_guest_code_selector(d, cs) __fixup_guest_selector(d, cs) /* * We need this function because enforcing the correct guest kernel RPL is @@ -57,11 +102,15 @@ * DPL < CPL then they'll be cleared automatically. If SS RPL or DPL differs * from CS RPL then we'll #GP. */ -#define guest_gate_selector_okay(sel) \ +#define guest_gate_selector_okay(d, sel) \ ((((sel)>>3) < FIRST_RESERVED_GDT_ENTRY) || /* Guest seg? */ \ - ((sel) == FLAT_KERNEL_CS) || /* Xen default seg? */ \ + ((sel) == (!IS_COMPAT(d) ? \ + FLAT_KERNEL_CS : /* Xen default seg? */ \ + FLAT_COMPAT_KERNEL_CS)) || /* Xen default compat seg? */ \ ((sel) & 4)) /* LDT seg? */ +#endif /* __ASSEMBLY__ */ + /* These are bitmasks for the high 32 bits of a descriptor table entry. 
*/ #define _SEGMENT_TYPE (15<< 8) #define _SEGMENT_EC ( 1<<10) /* Expand-down or Conforming segment */ @@ -81,12 +130,6 @@ struct desc_struct { #if defined(__x86_64__) -#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) -#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2) - -#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) -#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) - typedef struct { u64 a, b; } idt_entry_t; @@ -118,14 +161,6 @@ do { #elif defined(__i386__) -#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY - -#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) -#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1) - -#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY) -#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY) - typedef struct desc_struct idt_entry_t; #define _set_gate(gate_addr,type,dpl,addr) \ @@ -155,6 +190,11 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\ #endif extern struct desc_struct gdt_table[]; +#ifdef CONFIG_COMPAT +extern struct desc_struct compat_gdt_table[]; +#else +# define compat_gdt_table gdt_table +#endif struct Xgt_desc_struct { unsigned short size; Index: 2006-10-04/xen/include/asm-x86/ldt.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/ldt.h 2005-11-17 15:51:06.000000000 +0100 +++ 2006-10-04/xen/include/asm-x86/ldt.h 2006-10-04 15:03:07.000000000 +0200 @@ -17,7 +17,8 @@ static inline void load_LDT(struct vcpu else { cpu = smp_processor_id(); - desc = gdt_table + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY; + desc = (!IS_COMPAT(v->domain) ? 
gdt_table : compat_gdt_table) + + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY; _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2); __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) ); } Index: 2006-10-04/xen/include/asm-x86/mm.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/mm.h 2006-10-04 08:49:31.000000000 +0200 +++ 2006-10-04/xen/include/asm-x86/mm.h 2006-10-04 15:03:07.000000000 +0200 @@ -280,7 +280,7 @@ unsigned long pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab); #endif /* CONFIG_PAGING_LEVELS == 3 */ -int check_descriptor(struct desc_struct *d); +int check_descriptor(const struct domain *, struct desc_struct *d); /* * The MPT (machine->physical mapping table) is an array of word-sized Index: 2006-10-04/xen/include/asm-x86/regs.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/regs.h 2006-03-09 13:13:42.000000000 +0100 +++ 2006-10-04/xen/include/asm-x86/regs.h 2006-10-04 15:03:07.000000000 +0200 @@ -38,7 +38,8 @@ enum EFLAGS { ASSERT(diff < STACK_SIZE); \ /* If a guest frame, it must be have guest privs (unless HVM guest). */ \ /* We permit CS==0 which can come from an uninitialised trap entry. */ \ - ASSERT((diff != 0) || vm86_mode(r) || ((r->cs&3) >= GUEST_KERNEL_RPL) || \ + ASSERT((diff != 0) || vm86_mode(r) || \ + ((r->cs&3) >= GUEST_KERNEL_RPL(current->domain)) || \ (r->cs == 0) || hvm_guest(current)); \ /* If not a guest frame, it must be a hypervisor frame. 
*/ \ ASSERT((diff == 0) || (!vm86_mode(r) && (r->cs == __HYPERVISOR_CS))); \ Index: 2006-10-04/xen/include/asm-x86/x86_64/regs.h ==================================================================--- 2006-10-04.orig/xen/include/asm-x86/x86_64/regs.h 2006-03-09 13:13:42.000000000 +0100 +++ 2006-10-04/xen/include/asm-x86/x86_64/regs.h 2006-10-04 15:03:07.000000000 +0200 @@ -11,7 +11,9 @@ #define ring_3(r) (((r)->cs & 3) == 3) #define guest_kernel_mode(v, r) \ - (ring_3(r) && ((v)->arch.flags & TF_kernel_mode)) + (!IS_COMPAT(v->domain) ? \ + ring_3(r) && ((v)->arch.flags & TF_kernel_mode) : \ + ring_1(r)) #define permit_softint(dpl, v, r) \ ((dpl) >= (guest_kernel_mode(v, r) ? 1 : 3)) Index: 2006-10-04/xen/include/public/arch-x86_64.h ==================================================================--- 2006-10-04.orig/xen/include/public/arch-x86_64.h 2006-09-11 09:06:11.000000000 +0200 +++ 2006-10-04/xen/include/public/arch-x86_64.h 2006-10-04 15:03:07.000000000 +0200 @@ -192,7 +192,10 @@ DEFINE_XEN_GUEST_HANDLE(trap_info_t); #ifdef __GNUC__ /* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */ -#define __DECL_REG(name) union { uint64_t r ## name, e ## name; } +#define __DECL_REG(name) union { \ + uint64_t r ## name, e ## name; \ + uint32_t _e ## name; \ +} #else /* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */ #define __DECL_REG(name) uint64_t r ## name @@ -265,7 +268,17 @@ struct vcpu_guest_context { unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ unsigned long event_callback_eip; unsigned long failsafe_callback_eip; +#ifdef __GNUC__ + union { + unsigned long syscall_callback_eip; + struct { + unsigned int event_callback_cs; /* compat CS of event cb */ + unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */ + }; + }; +#else unsigned long syscall_callback_eip; +#endif unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ /* Segment base addresses. 
*/ uint64_t fs_base; Index: 2006-10-04/xen/include/xen/elf.h ==================================================================--- 2006-10-04.orig/xen/include/xen/elf.h 2006-08-25 15:36:10.000000000 +0200 +++ 2006-10-04/xen/include/xen/elf.h 2006-10-04 15:03:07.000000000 +0200 @@ -533,6 +533,15 @@ extern unsigned long long xen_elfnote_nu int type, int *defined); extern const char *xen_elfnote_string(struct domain_setup_info *dsi, int type); +#ifdef CONFIG_COMPAT +extern int elf32_sanity_check(Elf32_Ehdr *ehdr); +extern int loadelf32image(struct domain_setup_info *); +extern int parseelf32image(struct domain_setup_info *); +extern unsigned long long xen_elf32note_numeric(struct domain_setup_info *, + int type, int *defined); +extern const char *xen_elf32note_string(struct domain_setup_info *, int type); +#endif + #ifdef Elf_Ehdr extern int elf_sanity_check(Elf_Ehdr *ehdr); #endif Index: 2006-10-04/xen/include/xen/sched.h ==================================================================--- 2006-10-04.orig/xen/include/xen/sched.h 2006-09-21 11:09:00.000000000 +0200 +++ 2006-10-04/xen/include/xen/sched.h 2006-10-04 15:03:07.000000000 +0200 @@ -417,6 +417,9 @@ extern struct domain *domain_list; /* Domain is paused by the hypervisor? */ #define _DOMF_paused 6 #define DOMF_paused (1UL<<_DOMF_paused) + /* Domain is a compatibility one? */ +#define _DOMF_compat 7 +#define DOMF_compat (1UL<<_DOMF_compat) static inline int vcpu_runnable(struct vcpu *v) { @@ -453,6 +456,13 @@ static inline void vcpu_unblock(struct v #define IS_PRIV(_d) \ (test_bit(_DOMF_privileged, &(_d)->domain_flags)) +#ifdef CONFIG_COMPAT +#define IS_COMPAT(_d) \ + (test_bit(_DOMF_compat, &(_d)->domain_flags)) +#else +#define IS_COMPAT(_d) 0 +#endif + #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist)) #endif /* __SCHED_H__ */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel