The major issue with supporting a significantly larger number of physical
CPUs appears to be the use of per-CPU GDT entries - at present, x86-64
could support only up to 126 CPUs (with code changes to also use the
top-most GDT page, that would be 254). Rather than going with
incremental steps here, this patch converts the GDT itself to be per-CPU,
so that limitations in that respect go away entirely.
There's one particular part of it that I'm not very happy with, but have
had no better idea so far: In the general case, it is now necessary to
reload the GDT twice during context switch. Hence I'd appreciate ideas
on how to avoid this and stay with a single reload.
The patch has several debug items in it (which are marked as such), so it
is by no means intended to go in as-is.
Jan
Index: 2008-09-01/xen/arch/x86/boot/wakeup.S
==================================================================---
2008-09-01.orig/xen/arch/x86/boot/wakeup.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/wakeup.S 2008-09-09 10:44:30.000000000 +0200
@@ -168,7 +168,7 @@ wakeup_32:
.word 0,0,0
lgdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .quad gdt_table - FIRST_RESERVED_GDT_BYTE
+ .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
wakeup_64:
lgdt lgdt_descr(%rip)
Index: 2008-09-01/xen/arch/x86/boot/x86_32.S
==================================================================---
2008-09-01.orig/xen/arch/x86/boot/x86_32.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_32.S 2008-09-09 14:45:58.000000000 +0200
@@ -78,7 +78,7 @@ idt_descr:
.word 0
gdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .long gdt_table - FIRST_RESERVED_GDT_BYTE
+ .long boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
.align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
#define GUEST_DESC(d) \
.long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,
\
((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) |
(d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00cf9a000000ffff /* 0xe008 ring 0 4.00GB code at 0x0 */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
GUEST_DESC(0x00c0b200) /* 0xe021 ring 1 3.xxGB data at 0x0 */
GUEST_DESC(0x00c0fa00) /* 0xe02b ring 3 3.xxGB code at 0x0 */
GUEST_DESC(0x00c0f200) /* 0xe033 ring 3 3.xxGB data at 0x0 */
+ .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE,0
Index: 2008-09-01/xen/arch/x86/boot/x86_64.S
==================================================================---
2008-09-01.orig/xen/arch/x86/boot/x86_64.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_64.S 2008-09-09 14:45:08.000000000 +0200
@@ -85,7 +85,7 @@ multiboot_ptr:
.word 0
gdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .quad gdt_table - FIRST_RESERVED_GDT_BYTE
+ .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
.word 0,0,0
idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
.quad cpu0_stack
.align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 data */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
.quad 0x00cff2000000ffff /* 0xe02b ring 3 data */
.quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */
.quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */
+ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE, 0
/* NB. Even rings != 0 get access to the full 4Gb, as only the */
/* (compatibility) machine->physical mapping table lives there. */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 data */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
.quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */
.quad 0x00cff2000000ffff /* 0xe033 ring 3 data */
.quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */
+ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE, 0
Index: 2008-09-01/xen/arch/x86/cpu/common.c
==================================================================---
2008-09-01.orig/xen/arch/x86/cpu/common.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/cpu/common.c 2008-09-10 16:09:18.000000000 +0200
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
if (cpu_has_pat)
wrmsrl(MSR_IA32_CR_PAT, host_pat);
+ /* Install correct page table. */
+ write_ptbase(current);
+
*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
*(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current);
asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
#define CD(register) asm volatile ( "mov %0,%%db" #register : :
"r"(0UL) );
CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
#undef CD
-
- /* Install correct page table. */
- write_ptbase(current);
}
#ifdef CONFIG_HOTPLUG_CPU
Index: 2008-09-01/xen/arch/x86/domain.c
==================================================================---
2008-09-01.orig/xen/arch/x86/domain.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain.c 2008-09-10 15:10:05.000000000 +0200
@@ -205,7 +205,6 @@ static inline int may_switch_mode(struct
int switch_native(struct domain *d)
{
- l1_pgentry_t gdt_l1e;
unsigned int vcpuid;
if ( d == NULL )
@@ -217,12 +216,8 @@ int switch_native(struct domain *d)
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- /* switch gdt */
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
{
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
if (d->vcpu[vcpuid])
release_compat_l4(d->vcpu[vcpuid]);
}
@@ -232,7 +227,6 @@ int switch_native(struct domain *d)
int switch_compat(struct domain *d)
{
- l1_pgentry_t gdt_l1e;
unsigned int vcpuid;
if ( d == NULL )
@@ -244,15 +238,11 @@ int switch_compat(struct domain *d)
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
- /* switch gdt */
- gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
{
if ( (d->vcpu[vcpuid] != NULL) &&
(setup_compat_l4(d->vcpu[vcpuid]) != 0) )
goto undo_and_fail;
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
}
domain_set_alloc_bitsize(d);
@@ -261,13 +251,10 @@ int switch_compat(struct domain *d)
undo_and_fail:
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
while ( vcpuid-- != 0 )
{
if ( d->vcpu[vcpuid] != NULL )
release_compat_l4(d->vcpu[vcpuid]);
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
}
return -ENOMEM;
}
@@ -315,7 +302,13 @@ int vcpu_initialise(struct vcpu *v)
if ( is_idle_domain(d) )
{
v->arch.schedule_tail = continue_idle_domain;
- v->arch.cr3 = __pa(idle_pg_table);
+ if ( v->vcpu_id )
+ v->arch.cr3 = d->vcpu[0]->arch.cr3;
+ else if ( !*idle_vcpu )
+ v->arch.cr3 = __pa(idle_pg_table);
+ else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+ return -ENOMEM;
+else printk("new idle domain: CR3=%lx\n", v->arch.cr3);//temp
}
v->arch.guest_context.ctrlreg[4] =
@@ -342,8 +335,7 @@ int arch_domain_create(struct domain *d,
#ifdef __x86_64__
struct page_info *pg;
#endif
- l1_pgentry_t gdt_l1e;
- int i, vcpuid, pdpt_order, paging_initialised = 0;
+ int i, pdpt_order, paging_initialised = 0;
int rc = -ENOMEM;
d->arch.hvm_domain.hap_enabled =
@@ -362,18 +354,6 @@ int arch_domain_create(struct domain *d,
goto fail;
memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
- /*
- * Map Xen segments into every VCPU's GDT, irrespective of whether every
- * VCPU will actually be used. This avoids an NMI race during context
- * switch: if we take an interrupt after switching CR3 but before switching
- * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
- * try to load CS from an invalid table.
- */
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
- for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
#if defined(__i386__)
mapcache_domain_init(d);
@@ -1183,12 +1163,26 @@ static void paravirt_ctxt_switch_to(stru
}
}
+/*
+ * Temporary debugging aid: verify that the GDT currently loaded on this
+ * processor is the one set up for 'cpu'.  The limit of the per-CPU GDT
+ * entry (which holds the CPU number) is read with LSL and compared against
+ * 'cpu'.  On a mismatch the GDT base obtained via SGDT, the value found and
+ * the caller-supplied 'line' are logged, followed by a page walk of the
+ * address of the reserved GDT area.
+ */
+static void check_cpu(unsigned int cpu, int line) {//temp
+    unsigned int _cpu;
+
+    /*
+     * volatile: the result depends on which GDT is loaded, which the
+     * compiler cannot see.  Without it, identical asm() statements at
+     * different call sites could be merged or moved across the lgdt
+     * instructions this helper is meant to check.
+     */
+    asm volatile ("lsll %1, %0" : "=r" (_cpu) : "rm" (PER_CPU_GDT_ENTRY << 3));
+    if(_cpu != cpu) {
+        struct desc_ptr gdt_desc;
+
+        asm volatile ("sgdt %0" : "=m" (gdt_desc));
+        printk("CPU#%u: wrong GDT (%lx->%u) at #%d\n", cpu, gdt_desc.base, _cpu, line);
+        show_page_walk(gdt_desc.base + FIRST_RESERVED_GDT_BYTE);
+    }
+}
+
static void __context_switch(void)
{
struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
- unsigned int cpu = smp_processor_id();
+ unsigned int i, cpu = smp_processor_id();
struct vcpu *p = per_cpu(curr_vcpu, cpu);
struct vcpu *n = current;
+ struct desc_struct *gdt;
+ struct page_info *page;
+ struct desc_ptr gdt_desc;
ASSERT(p != n);
ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1214,14 +1208,35 @@ static void __context_switch(void)
cpu_set(cpu, n->domain->domain_dirty_cpumask);
cpu_set(cpu, n->vcpu_dirty_cpumask);
+check_cpu(cpu, __LINE__);//temp
+ gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+ per_cpu(compat_gdt_table, cpu);
+ page = virt_to_page(gdt);
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ {
+ n->domain->arch.mm_perdomain_pt
+ [(n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE + i]
+ = l1e_from_page(page + i, __PAGE_HYPERVISOR);
+ }
+
+check_cpu(cpu, __LINE__);//temp
+ if ( p->vcpu_id != n->vcpu_id )
+ {
+ gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+ gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
+ }
+
write_ptbase(n);
+check_cpu(cpu, __LINE__);//temp
if ( p->vcpu_id != n->vcpu_id )
{
- char gdt_load[10];
- *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
- *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
- asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+ gdt_desc.base = GDT_VIRT_START(n);
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
}
if ( p->domain != n->domain )
@@ -1257,6 +1272,7 @@ void context_switch(struct vcpu *prev, s
if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
{
+check_cpu(cpu, __LINE__);//temp
local_irq_enable();
}
else
@@ -1272,8 +1288,6 @@ void context_switch(struct vcpu *prev, s
uint64_t efer = read_efer();
if ( !(efer & EFER_SCE) )
write_efer(efer | EFER_SCE);
- flush_tlb_one_local(GDT_VIRT_START(next) +
- FIRST_RESERVED_GDT_BYTE);
}
#endif
Index: 2008-09-01/xen/arch/x86/domain_build.c
==================================================================---
2008-09-01.orig/xen/arch/x86/domain_build.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain_build.c 2008-09-09 11:01:01.000000000 +0200
@@ -313,24 +313,11 @@ int __init construct_dom0(
#if defined(__x86_64__)
if ( compat32 )
{
- l1_pgentry_t gdt_l1e;
-
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
v->vcpu_info = (void
*)&d->shared_info->compat.vcpu_info[0];
if ( nr_pages != (unsigned int)nr_pages )
nr_pages = UINT_MAX;
-
- /*
- * Map compatibility Xen segments into every VCPU's GDT. See
- * arch_domain_create() for further comments.
- */
- gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
- PAGE_HYPERVISOR);
- for ( i = 0; i < MAX_VIRT_CPUS; i++ )
- d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
- flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
}
#endif
Index: 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c
==================================================================---
2008-09-01.orig/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-09 14:09:36.000000000 +0200
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
__vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
- __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+ __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
__vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
__vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
Index: 2008-09-01/xen/arch/x86/setup.c
==================================================================---
2008-09-01.orig/xen/arch/x86/setup.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/setup.c 2008-09-10 15:23:25.000000000 +0200
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
extern void vesa_init(void);
extern void vesa_mtrr_init(void);
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+ = boot_cpu_compat_gdt_table;
+#endif
+
struct tss_struct init_tss[NR_CPUS];
char __attribute__ ((__section__(".bss.stack_aligned")))
cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
static void __init init_idle_domain(void)
{
struct domain *idle_domain;
+ unsigned int i;
/* Domain creation requires that scheduler structures are initialised. */
scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
idle_vcpu[0] = this_cpu(curr_vcpu) = current;
setup_idle_pagetable();
+
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+ l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+ __PAGE_HYPERVISOR);
+
}
static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
parse_video_info();
set_current((struct vcpu *)0xfffff000); /* debug sanity */
- idle_vcpu[0] = current;
set_processor_id(0); /* needed early, for smp_processor_id() */
if ( cpu_has_efer )
rdmsrl(MSR_EFER, this_cpu(efer));
Index: 2008-09-01/xen/arch/x86/smpboot.c
==================================================================---
2008-09-01.orig/xen/arch/x86/smpboot.c 2008-09-10 12:34:16.000000000 +0200
+++ 2008-09-01/xen/arch/x86/smpboot.c 2008-09-10 13:43:56.000000000 +0200
@@ -835,10 +835,15 @@ static int __devinit do_boot_cpu(int api
*/
{
unsigned long boot_error;
+ unsigned int i;
int timeout;
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
struct vcpu *v;
+ struct desc_struct *gdt;
+#ifdef __x86_64__
+ struct page_info *page;
+#endif
/*
* Save current MTRR state in case it was changed since early boot
@@ -864,6 +869,38 @@ static int __devinit do_boot_cpu(int api
/* Debug build: detect stack overflow by setting up a guard page. */
memguard_guard_stack(stack_start.esp);
+ gdt = per_cpu(gdt_table, cpu);
+ if (gdt == boot_cpu_gdt_table) {
+ i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+ page = alloc_domheap_pages(NULL, i,
+ MEMF_node(cpu_to_node(cpu)));
+ per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+ memcpy(gdt, boot_cpu_compat_gdt_table,
+ NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+ page = alloc_domheap_pages(NULL, i,
+ MEMF_node(cpu_to_node(cpu)));
+ per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+ per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+ memcpy(gdt, boot_cpu_gdt_table,
+ NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ BUILD_BUG_ON(NR_CPUS > 0x10000);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+printk("CPU#%d: GDT@%p[%p]\n", cpu, gdt, per_cpu(compat_gdt_table,
cpu));//temp
+ }
+
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ v->domain->arch.mm_perdomain_pt
+ [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE + i]
+ = l1e_from_page(virt_to_page(gdt) + i,
+ __PAGE_HYPERVISOR);
+
/*
* This grunge runs the startup process for
* the targeted processor.
Index: 2008-09-01/xen/arch/x86/traps.c
==================================================================---
2008-09-01.orig/xen/arch/x86/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/traps.c 2008-09-09 16:15:21.000000000 +0200
@@ -2692,6 +2692,13 @@ asmlinkage void do_general_protection(st
return;
}
+if(regs->error_code) {//temp
+ struct desc_ptr gdt_desc;
+ asm("sgdt %0" : "=m" (gdt_desc));
+ printk("CPU[%u] GDT@%lx [%lx,%x]\n", smp_processor_id(),
GDT_VIRT_START(v), gdt_desc.base, gdt_desc.limit);
+ show_page_walk(GDT_VIRT_START(v) + regs->error_code);
+}
+
#if defined(__i386__)
if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
(regs->error_code == 0) &&
@@ -2961,13 +2968,13 @@ void set_intr_gate(unsigned int n, void
void set_tss_desc(unsigned int n, void *addr)
{
_set_tssldt_desc(
- gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+ per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
(unsigned long)addr,
offsetof(struct tss_struct, __cacheline_filler) - 1,
9);
#ifdef CONFIG_COMPAT
_set_tssldt_desc(
- compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+ per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
(unsigned long)addr,
offsetof(struct tss_struct, __cacheline_filler) - 1,
11);
Index: 2008-09-01/xen/arch/x86/x86_32/mm.c
==================================================================---
2008-09-01.orig/xen/arch/x86/x86_32/mm.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/mm.c 2008-09-10 16:04:08.000000000 +0200
@@ -135,6 +135,30 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
+/*
+ * Build a private copy of the idle page table for 'v' (x86_32/PAE).
+ *
+ * v's PAE L3 cache is used as the new top-level table and is filled from
+ * idle_pg_table.  A newly allocated L2 page replaces the L3 slot covering
+ * PERDOMAIN_VIRT_START; it starts as a copy of the matching part of
+ * idle_pg_table_l2 and then has its per-domain entries pointed at
+ * v->domain's mm_perdomain_pt pages.
+ *
+ * Returns the physical address of the new L3 table, or 0 if the L2 page
+ * could not be allocated.
+ */
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    unsigned int i;
+    struct domain *d = v->domain;
+    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+    l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+    if ( !l2_table )
+        return 0;
+
+    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+    /* Redirect the L3 slot holding the per-domain area to the private L2. */
+    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+    copy_page(l2_table, idle_pg_table_l2 +
+              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+    /* Map this domain's per-domain page table pages into the private L2. */
+    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+                          __PAGE_HYPERVISOR);
+
+    return __pa(l3_table);
+}
+
void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
{
int i;
@@ -189,7 +213,7 @@ void __init subarch_init_memory(void)
{
/* Guest kernel runs in ring 0, not ring 1. */
struct desc_struct *d;
- d = &gdt_table[(FLAT_RING1_CS >> 3) -
FIRST_RESERVED_GDT_ENTRY];
+ d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) -
FIRST_RESERVED_GDT_ENTRY];
d[0].b &= ~_SEGMENT_DPL;
d[1].b &= ~_SEGMENT_DPL;
}
Index: 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S
==================================================================---
2008-09-01.orig/xen/arch/x86/x86_32/supervisor_mode_kernel.S 2008-09-10
13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S 2008-09-09
13:57:13.000000000 +0200
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
# %gs:%esi now points to the guest stack before the
# interrupt/exception occured.
- /*
- * Reverse the __TSS macro, giving us the CPU number.
- * The TSS for this cpu is at init_tss + ( cpu * 128 ).
- */
- str %ecx
- shrl $3,%ecx # Calculate GDT index
for TSS.
- subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu.
- shll $6,%ecx # Each TSS entry is
0x80 bytes
- addl $init_tss,%ecx # but we have 2*cpu
from above.
+ movl $PER_CPU_GDT_ENTRY*8,%ecx
+ lsll %ecx,%ecx
+ shll $7,%ecx # Each TSS entry is
0x80 bytes
+ addl $init_tss,%ecx
# Load Xen stack from TSS.
movw TSS_ss0(%ecx),%ax
Index: 2008-09-01/xen/arch/x86/x86_32/traps.c
==================================================================---
2008-09-01.orig/xen/arch/x86/x86_32/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/traps.c 2008-09-09 14:48:33.000000000 +0200
@@ -197,13 +197,15 @@ static unsigned char doublefault_stack[D
asmlinkage void do_double_fault(void)
{
- struct tss_struct *tss = &doublefault_tss;
- unsigned int cpu =
((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+ struct tss_struct *tss;
+ unsigned int cpu;
watchdog_disable();
console_force_unlock();
+ asm ( "lsll %1, %0" : "=r" (cpu) : "rm"
(PER_CPU_GDT_ENTRY << 3) );
+
/* Find information saved during fault and dump it to the console. */
tss = &init_tss[cpu];
printk("*** DOUBLE FAULT ***\n");
@@ -328,7 +330,7 @@ void __devinit subarch_percpu_traps_init
tss->eflags = 2;
tss->bitmap = IOBMP_INVALID_OFFSET;
_set_tssldt_desc(
- gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+ boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY -
FIRST_RESERVED_GDT_ENTRY,
(unsigned long)tss, 235, 9);
set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
Index: 2008-09-01/xen/arch/x86/x86_64/mm.c
==================================================================---
2008-09-01.orig/xen/arch/x86/x86_64/mm.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/mm.c 2008-09-10 15:51:37.000000000 +0200
@@ -21,6 +21,7 @@
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
+#include <xen/numa.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
@@ -209,6 +210,24 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
+/*
+ * Build a private copy of the idle page table for 'v' (x86_64).
+ *
+ * A new L4 page is allocated with a node hint derived from 'v', filled
+ * from idle_pg_table, and its PERDOMAIN_VIRT_START slot is pointed at
+ * v->domain's mm_perdomain_l3.
+ *
+ * Returns the physical address of the new L4 table, or 0 if the page
+ * could not be allocated.
+ */
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *page = alloc_domheap_page(NULL,
+                                                MEMF_node(vcpu_to_node(v)));
+    l4_pgentry_t *l4_table;
+
+    if ( !page )
+        return 0;
+
+    /* Only translate the page to a virtual address once it is known valid. */
+    l4_table = page_to_virt(page);
+    copy_page(l4_table, idle_pg_table);
+    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+                      __PAGE_HYPERVISOR);
+
+    return __pa(l4_table);
+}
+
void __init zap_low_mappings(void)
{
BUG_ON(num_online_cpus() != 1);
Index: 2008-09-01/xen/arch/x86/x86_64/traps.c
==================================================================---
2008-09-01.orig/xen/arch/x86/x86_64/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/traps.c 2008-09-09 14:49:19.000000000 +0200
@@ -217,15 +217,14 @@ void show_page_walk(unsigned long addr)
asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
- unsigned int cpu, tr;
-
- asm volatile ( "str %0" : "=r" (tr) );
- cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+ unsigned int cpu;
watchdog_disable();
console_force_unlock();
+ asm ( "lsll %1, %0" : "=r" (cpu) : "rm"
(PER_CPU_GDT_ENTRY << 3) );
+
/* Find information saved during fault and dump it to the console. */
printk("*** DOUBLE FAULT ***\n");
print_xen_info();
Index: 2008-09-01/xen/common/domain.c
==================================================================---
2008-09-01.orig/xen/common/domain.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/common/domain.c 2008-09-10 08:29:58.000000000 +0200
@@ -172,7 +172,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
{
struct domain *d;
struct vcpu *v;
- unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+ unsigned int vcpu_id = cpu_id % 2;//temp MAX_VIRT_CPUS;
if ( (v = idle_vcpu[cpu_id]) != NULL )
return v;
Index: 2008-09-01/xen/include/asm-x86/desc.h
==================================================================---
2008-09-01.orig/xen/include/asm-x86/desc.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/desc.h 2008-09-09 14:32:27.000000000 +0200
@@ -34,11 +34,9 @@
#define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS
#define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
#elif defined(__i386__)
@@ -51,17 +49,15 @@
#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
#endif
#ifndef __ASSEMBLY__
-#define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : :
"a" (__TSS(n)<<3) )
+#define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : :
"a" (TSS_ENTRY<<3) )
#if defined(__x86_64__)
#define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {
#endif
-extern struct desc_struct gdt_table[];
+/*
+ * Memory operand used with the lgdt/sgdt instructions elsewhere in this
+ * patch: the table limit immediately followed by the table base.  The
+ * packed attribute keeps the compiler from inserting padding between the
+ * two fields.
+ */
+struct desc_ptr {
+ unsigned short limit;
+ unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
#ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
#else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
#endif
extern void set_intr_gate(unsigned int irq, void * addr);
Index: 2008-09-01/xen/include/asm-x86/ldt.h
==================================================================---
2008-09-01.orig/xen/include/asm-x86/ldt.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/ldt.h 2008-09-09 14:13:41.000000000 +0200
@@ -6,7 +6,6 @@
static inline void load_LDT(struct vcpu *v)
{
- unsigned int cpu;
struct desc_struct *desc;
unsigned long ents;
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu
}
else
{
- cpu = smp_processor_id();
- desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
- + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+ desc = (!is_pv_32on64_vcpu(v)
+ ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+ + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
_set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
- __asm__ __volatile__ ( "lldt %%ax" : : "a"
(__LDT(cpu)<<3) );
+ __asm__ __volatile__ ( "lldt %%ax" : : "a"
(LDT_ENTRY << 3) );
}
}
Index: 2008-09-01/xen/include/asm-x86/page.h
==================================================================---
2008-09-01.orig/xen/include/asm-x86/page.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/page.h 2008-09-10 09:06:02.000000000 +0200
@@ -278,6 +278,7 @@ extern unsigned int m2p_compat_vstart;
#endif
void paging_init(void);
void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
#endif /* !defined(__ASSEMBLY__) */
#define _PAGE_PRESENT 0x001U
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel