The major issue with supporting a significantly larger number of physical
CPUs appears to be the use of per-CPU GDT entries: at present, x86-64 can
support only up to 126 CPUs (with code changes to also use the top-most
GDT page, that would be 254). Instead of taking incremental steps here,
this patch converts the GDT itself to be per-CPU, so that the limitation
goes away entirely.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2008-09-19/xen/arch/x86/boot/wakeup.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/wakeup.S	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/wakeup.S	2008-09-19 13:56:36.000000000 +0200
@@ -168,7 +168,7 @@ wakeup_32:
         .word   0,0,0
 lgdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
 wakeup_64:
         lgdt    lgdt_descr(%rip)

Index: 2008-09-19/xen/arch/x86/boot/x86_32.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/x86_32.S	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/x86_32.S	2008-09-19 13:56:36.000000000 +0200
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
         .align PAGE_SIZE,0
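The new final descriptor is the linchpin of the series: 0x0000910000000000
encodes a present, DPL-0 system descriptor with zero base and limit, and
do_boot_cpu() (further down) patches its 16-bit limit field to hold the CPU
number. Any code can then recover its CPU number with a single lsll,
regardless of the current stack or page tables. A minimal sketch of the
idea (the helper name is illustrative, not part of the patch):

    static inline unsigned int percpu_gdt_cpu(void)
    {
        unsigned int cpu;
        /* LSL yields the descriptor's limit for a valid selector; for
         * the per-CPU entry the limit *is* the CPU number. */
        asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
        return cpu;
    }

This is exactly the pattern the do_double_fault() hunks below rely on.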
Index: 2008-09-19/xen/arch/x86/boot/x86_64.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/boot/x86_64.S	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/boot/x86_64.S	2008-09-19 13:56:36.000000000 +0200
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack
 
         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /* (compatibility) machine->physical mapping table lives there.           */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
         .align PAGE_SIZE, 0

Index: 2008-09-19/xen/arch/x86/cpu/common.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/cpu/common.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/cpu/common.c	2008-09-19 13:56:36.000000000 +0200
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
 	if (cpu_has_pat)
 		wrmsrl(MSR_IA32_CR_PAT, host_pat);
 
+	/* Install correct page table. */
+	write_ptbase(current);
+
 	*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
 	*(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
 	asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
 	CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-	/* Install correct page table. */
-	write_ptbase(current);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
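Note why write_ptbase() moves to the top of cpu_init(): the GDT is now
loaded through its per-domain virtual alias (GDT_VIRT_START(current)), so
the page tables providing that alias must be live before the lgdt rather
than after it. Schematically (a sketch using the struct desc_ptr added by
this patch, not the literal function body):

    struct desc_ptr gdt_desc = {
        .limit = LAST_RESERVED_GDT_BYTE,
        .base  = GDT_VIRT_START(current),
    };
    write_ptbase(current);                         /* map the GDT alias first */
    asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); /* only then load it */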
Index: 2008-09-19/xen/arch/x86/domain.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/domain.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/domain.c	2008-09-19 13:57:28.000000000 +0200
@@ -211,7 +211,6 @@ static inline int may_switch_mode(struct
 
 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -223,12 +222,8 @@ int switch_native(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
         if (d->vcpu[vcpuid])
             release_compat_l4(d->vcpu[vcpuid]);
     }
@@ -238,7 +233,6 @@ int switch_native(struct domain *d)
 
 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -250,15 +244,11 @@ int switch_compat(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
 
     domain_set_alloc_bitsize(d);
@@ -267,13 +257,10 @@ int switch_compat(struct domain *d)
 
  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -322,7 +309,12 @@ int vcpu_initialise(struct vcpu *v)
     if ( is_idle_domain(d) )
     {
         v->arch.schedule_tail = continue_idle_domain;
-        v->arch.cr3           = __pa(idle_pg_table);
+        if ( v->vcpu_id )
+            v->arch.cr3 = d->vcpu[0]->arch.cr3;
+        else if ( !*idle_vcpu )
+            v->arch.cr3 = __pa(idle_pg_table);
+        else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+            return -ENOMEM;
     }
 
     v->arch.guest_context.ctrlreg[4]
@@ -349,8 +341,7 @@ int arch_domain_create(struct domain *d,
 #ifdef __x86_64__
     struct page_info *pg;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int i, vcpuid, pdpt_order, paging_initialised = 0;
+    int i, pdpt_order, paging_initialised = 0;
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled
@@ -369,18 +360,6 @@ int arch_domain_create(struct domain *d,
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
 
-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)
 
     mapcache_domain_init(d);
@@ -1193,9 +1172,12 @@ static void paravirt_ctxt_switch_to(stru
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int          cpu = smp_processor_id();
+    unsigned int          i, cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct page_info     *page;
+    struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1221,14 +1203,30 @@ static void __context_switch(void)
     cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);
 
+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    page = virt_to_page(gdt);
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+    {
+        l1e_write(n->domain->arch.mm_perdomain_pt +
+                  (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                  FIRST_RESERVED_GDT_PAGE + i,
+                  l1e_from_page(page + i, __PAGE_HYPERVISOR));
+    }
+
+    if ( p->vcpu_id != n->vcpu_id )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    }
+
     write_ptbase(n);
 
     if ( p->vcpu_id != n->vcpu_id )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.base = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }
 
     if ( p->domain != n->domain )
@@ -1279,8 +1277,6 @@ void context_switch(struct vcpu *prev, s
         uint64_t efer = read_efer();
         if ( !(efer & EFER_SCE) )
             write_efer(efer | EFER_SCE);
-        flush_tlb_one_local(GDT_VIRT_START(next) +
-                            FIRST_RESERVED_GDT_BYTE);
     }
 #endif
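The double lgdt in __context_switch() is deliberate. When the vCPU slot
changes, the first load uses the Xen-linear address of the new vCPU's GDT,
which stays valid across the CR3 switch; the second re-loads the same
table at its per-domain alias GDT_VIRT_START(n) once write_ptbase(n) has
made that alias available. In outline (names as in the hunk above):

    /* 1: linear alias -- valid no matter which page tables are live. */
    gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
    gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );

    write_ptbase(n);                /* 2: switch page tables */

    /* 3: per-domain alias, which guest descriptor updates expect. */
    gdt_desc.base = GDT_VIRT_START(n);
    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );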
Index: 2008-09-19/xen/arch/x86/domain_build.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/domain_build.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/domain_build.c	2008-09-19 13:56:36.000000000 +0200
@@ -314,24 +314,11 @@ int __init construct_dom0(
 #if defined(__x86_64__)
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
 
         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif

Index: 2008-09-19/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/hvm/vmx/vmcs.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/hvm/vmx/vmcs.c	2008-09-19 13:56:36.000000000 +0200
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
 
     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
 
-    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
     __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
 
     __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
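A pleasant consequence, visible in the vmcs.c hunk: with one GDT per CPU,
the TSS and LDT no longer need CPU-scaled slot numbers in a shared table,
so the old __TSS(n)/__LDT(n) macros collapse into fixed indices (see the
desc.h hunk below). Roughly:

    /* Before: slots scaled by CPU number within the single shared GDT. */
    #define __TSS(n) (((n) << 2) + __FIRST_TSS_ENTRY)
    /* After: every CPU's own GDT uses the same fixed slot. */
    #define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)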
Index: 2008-09-19/xen/arch/x86/setup.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/setup.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/setup.c	2008-09-19 13:56:36.000000000 +0200
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);
 
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];
 
 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
 static void __init init_idle_domain(void)
 {
     struct domain *idle_domain;
+    unsigned int i;
 
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+        idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+            l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+                          __PAGE_HYPERVISOR);
+
 }
 
 static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
     parse_video_info();
 
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
-    idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
     if ( cpu_has_efer )
         rdmsrl(MSR_EFER, this_cpu(efer));
Index: 2008-09-19/xen/arch/x86/smpboot.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/smpboot.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/smpboot.c	2008-09-19 13:57:46.000000000 +0200
@@ -836,10 +836,15 @@ static int __devinit do_boot_cpu(int api
  */
 {
 	unsigned long boot_error;
+	unsigned int i;
 	int timeout;
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 	struct vcpu *v;
+	struct desc_struct *gdt;
+#ifdef __x86_64__
+	struct page_info *page;
+#endif
 
 	/*
 	 * Save current MTRR state in case it was changed since early boot
@@ -865,6 +870,37 @@ static int __devinit do_boot_cpu(int api
 	/* Debug build: detect stack overflow by setting up a guard page. */
 	memguard_guard_stack(stack_start.esp);
 
+	gdt = per_cpu(gdt_table, cpu);
+	if (gdt == boot_cpu_gdt_table) {
+		i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+		page = alloc_domheap_pages(NULL, i,
+					   MEMF_node(cpu_to_node(cpu)));
+		per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+		memcpy(gdt, boot_cpu_compat_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+		page = alloc_domheap_pages(NULL, i,
+					   MEMF_node(cpu_to_node(cpu)));
+		per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+		per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+		memcpy(gdt, boot_cpu_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		BUILD_BUG_ON(NR_CPUS > 0x10000);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+	}
+
+	for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+		v->domain->arch.mm_perdomain_pt
+			[(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+			 FIRST_RESERVED_GDT_PAGE + i]
+			= l1e_from_page(virt_to_page(gdt) + i,
+					__PAGE_HYPERVISOR);
+
 	/*
 	 * This grunge runs the startup process for
 	 * the targeted processor.
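The BUILD_BUG_ON(NR_CPUS > 0x10000) in do_boot_cpu() guards the encoding:
the CPU number is stored in the per-CPU descriptor's 16-bit limit field,
so at most 65536 CPUs are expressible. Note also that the per-CPU tables
are allocated NUMA-locally via MEMF_node(cpu_to_node(cpu)). The patched-in
entry, schematically:

    /* Illustrative: desc_struct.a is the descriptor's low 32 bits, and
     * bits 0-15 of it are the segment limit, so this stores the CPU
     * number where lsll will later find it. */
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;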
Index: 2008-09-19/xen/arch/x86/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/traps.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/traps.c	2008-09-19 13:58:14.000000000 +0200
@@ -2965,13 +2965,13 @@ void set_intr_gate(unsigned int n, void
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);

Index: 2008-09-19/xen/arch/x86/x86_32/mm.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/mm.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/mm.c	2008-09-19 13:56:36.000000000 +0200
@@ -132,6 +132,30 @@ void __init setup_idle_pagetable(void)
                                  __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    unsigned int i;
+    struct domain *d = v->domain;
+    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+    l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+    if ( !l2_table )
+        return 0;
+
+    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+    copy_page(l2_table, idle_pg_table_l2 +
+              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+                          __PAGE_HYPERVISOR);
+
+    return __pa(l3_table);
+}
+
 void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
 {
     int i;
@@ -186,7 +210,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }

Index: 2008-09-19/xen/arch/x86/x86_32/supervisor_mode_kernel.S
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/supervisor_mode_kernel.S	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/supervisor_mode_kernel.S	2008-09-19 13:56:36.000000000 +0200
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occured.
 
-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                    # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx  # %ecx = 2*cpu.
-        shll  $6,%ecx                    # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx             # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                    # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx
 
         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
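The assembly above replaces the old str-based reversal of __TSS() with the
per-CPU entry: lsll on selector PER_CPU_GDT_ENTRY*8 yields the CPU number
directly, and init_tss entries are 0x80 bytes apart, hence the shll $7. A
C rendering of the same computation, for illustration only:

    unsigned int cpu;
    struct tss_struct *tss;

    /* Recover the CPU number from the per-CPU descriptor's limit. */
    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
    tss = &init_tss[cpu];    /* == init_tss + cpu * 0x80 */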
Index: 2008-09-19/xen/arch/x86/x86_32/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_32/traps.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_32/traps.c	2008-09-19 13:56:36.000000000 +0200
@@ -194,13 +194,15 @@ static unsigned char doublefault_stack[D
 
 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -325,7 +327,7 @@ void __devinit subarch_percpu_traps_init
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);
 
     set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);

Index: 2008-09-19/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_64/mm.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_64/mm.c	2008-09-19 13:56:36.000000000 +0200
@@ -21,6 +21,7 @@
 #include <xen/lib.h>
 #include <xen/init.h>
 #include <xen/mm.h>
+#include <xen/numa.h>
 #include <xen/sched.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -206,6 +207,24 @@ void __init setup_idle_pagetable(void)
                                  __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *page = alloc_domheap_page(NULL,
+                                                MEMF_node(vcpu_to_node(v)));
+    l4_pgentry_t *l4_table = page_to_virt(page);
+
+    if ( !page )
+        return 0;
+
+    copy_page(l4_table, idle_pg_table);
+    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+                      __PAGE_HYPERVISOR);
+
+    return __pa(l4_table);
+}
+
 void __init zap_low_mappings(void)
 {
     BUG_ON(num_online_cpus() != 1);
Index: 2008-09-19/xen/arch/x86/x86_64/traps.c
===================================================================
--- 2008-09-19.orig/xen/arch/x86/x86_64/traps.c	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/arch/x86/x86_64/traps.c	2008-09-19 13:56:36.000000000 +0200
@@ -213,15 +213,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm volatile ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();

Index: 2008-09-19/xen/include/asm-x86/desc.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/desc.h	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/desc.h	2008-09-19 13:56:36.000000000 +0200
@@ -34,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS   FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS   FLAT_COMPAT_RING3_SS
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
 
 #elif defined(__i386__)
 
@@ -51,17 +49,15 @@
 
 #define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
 
 #endif
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
 
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {
 
 #endif
 
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+    unsigned short limit;
+    unsigned long base;
+} __attribute__((__packed__));
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif
 
 extern void set_intr_gate(unsigned int irq, void * addr);
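The new struct desc_ptr mirrors the lgdt/lidt pseudo-descriptor format: a
16-bit limit immediately followed by the base address. The packed
attribute is what makes this work; without it the base would be aligned
up past the limit and lgdt would read the wrong bytes. Usage sketch
(values taken from the boot-time gdt_descr above):

    struct desc_ptr gdtr = {
        .limit = LAST_RESERVED_GDT_BYTE,
        .base  = (unsigned long)boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE,
    };
    asm volatile ( "lgdt %0" : : "m" (gdtr) );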
Index: 2008-09-19/xen/include/asm-x86/ldt.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/ldt.h	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/ldt.h	2008-09-19 13:56:36.000000000 +0200
@@ -6,7 +6,6 @@
 
 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
    struct desc_struct *desc;
    unsigned long ents;
 
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-            + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+            + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }

Index: 2008-09-19/xen/include/asm-x86/page.h
===================================================================
--- 2008-09-19.orig/xen/include/asm-x86/page.h	2008-09-19 13:56:32.000000000 +0200
+++ 2008-09-19/xen/include/asm-x86/page.h	2008-09-19 13:56:36.000000000 +0200
@@ -278,6 +278,7 @@ extern unsigned int m2p_compat_vstart;
 #endif
 void paging_init(void);
 void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
 
 #endif /* !defined(__ASSEMBLY__) */
 
 #define _PAGE_PRESENT  0x001U

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel