Below is the patch I''ve been working with to get x86_64 SMP support going. It''s a definite work in progress. Currently, I''m able to build a second vcpu from dom0, but when it gets scheduled it crashes. Booting the same kernel with maxcpus=1 gets up to login prompt. Here''s some known issues: entry.S: - preempt_{en,dis}able is commented out for now (due to thread_info) - ugly shl,shr combo to clear upper 32 bits. smpboot.c: - faults if writing to cpu_gdt_table[1], so doing bogus reuse of cpu 0 table entries directly Here''s a copy of the crash: (XEN) (file=traps.c, line=459) Page fault: ffff83000014f67b -> ffff83000010b210 (XEN) Domain 0 (vcpu#1) crashed on cpu#1: (XEN) CPU: 1 (XEN) EIP: e030:[<ffffffff801181d0>] (XEN) EFLAGS: 0000000000010202 (XEN) rax: 0000000000735067 rbx: 0000000000000ff8 rcx: 0000000000000000 rdx: 0000000000000067 (XEN) rsi: 0000000020735067 rdi: ffff800000000000 rbp: 0000000000000ff8 rsp: ffff880000c36020 (XEN) r8: 0000000000000000 r9: 000000000000000b r10: ffffffff801181df r11: 0000000000000000 (XEN) r12: 0000000000000000 r13: 0000000000000000 r14: ffffffffff5fd023 r15: ffff880000c36120 (XEN) Xen stack trace from rsp=ffff880000c36020: (XEN) 0000ffff0000000f 0004000300020001 0008000700060005 ffff880000c40db0 ffff880000c36120 ffffffff803cba42 0000000000000000 0000000b0000000e (XEN) 0000000000000000 0000000000030001 0000000000000000 0000000000000000 0000000000000000 ffff880000c32800 0000000100000058 6c617475000000c3 (XEN) 0000000000636f6c 0000000100000000 eaeaeaeaeaeaeaea eaeaeaeaeaeaeaea eaeaeaeaeaeaeaea eaeaeaeaeaeaeaea eaeaeaeaeaeaeaea eaeaeaeaeaeaeaea (XEN) eaeaeaeaeaeaeaea 0000000000000000 0000000000000ff8 0000000000000000 0000000000000000 0000000000000ff8 ffff880000c362d8 ffffffff8010e4cb (XEN) ffff880000c362d8 0000000000000ff8 0000000000000000 0000000000000000 0000000000000ff8 0000000000000000 0000000000000000 ffffffff801181df (XEN) 000000000000000b 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
000000000000e030 ffffffffffffffff (XEN) ffffffff8010f33d 000000000000e030 0000000000010246 ffff880000c361c8 000000000000e02b 0000000000000ff8 ffffffff801188a0 ffffffff8043c2c0 (XEN) 0000000000000000 0000000000000000 ffff880000c40db0 ffff880000c362d8 ffffffff803cba42 0000000000000000 0000000b0000000e 0000000000000000 (XEN) 0000000000030001 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 (XEN) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 00000007f0000022 ffff880000c36308 0000000000008124 0000000000000001 (XEN) 0000000000000ff8 0000000000000ff8 0000000000000000 0000000000000000 ffffffffff5fd023 ffff880000c36480 ffffffff8010e4cb ffff880000c36480 (XEN) ffffffffff5fd023 0000000000000000 0000000000000000 0000000000000ff8 0000000000000ff8 0000000000000000 ffffffff801181df 000000000000000b (XEN) 0000000000000000 0000000000735067 0000000000000000 0000000000000067 0000000020735067 ffff800000000000 ffffffffffffffff ffffffff801181d0 (XEN) 000000000000e030 0000000000010202 ffff880000c36380 000000000000e02b 00000006f0000021 ffff880000c36408 0000000000008124 ffff880000c40db0 (XEN) ffff880000c36480 ffffffff803cba42 0000000000000000 0000000b0000000e 0000000000000000 0000000000030001 ffff880000c36480 ffff880000c36680 (XEN) 0000000000000000 0000000000000000 ffffffff80190830 0000000000000000 0000000000000000 000073637369646c 0000000000000000 0000000000000000 (XEN) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000ff8 0000000000000000 (XEN) 0000000000000000 0000000000000ff8 ffff880000c36638 ffffffff8010e4cb ffff880000c36638 0000000000000ff8 0000000000000000 0000000000000000 (XEN) 0000000000000ff8 0000000000000000 0000000000000000 ffffffff801181df 000000000000000b 0000000000000000 0000000000000000 0000000000000000 (XEN) 0000000000000000 0000000000000000 000000000000e030 ffffffffffffffff ffffffff8010f33d 000000000000e030 
0000000000010246 ffff880000c36528 (XEN) Xen call trace from rsp=ffff880000c36020: (XEN) Trace empty. (XEN) Domain 0 shutdown: rebooting machine. And, here''s the patch: ===== linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile 1.5 vs edited ====Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o @@ -53,7 +53,7 @@ intel_cacheinfo-y += ../../../i386/kern quirks-y += ../../../i386/kernel/quirks.o c-link := init_task.o -s-link := vsyscall.o +s-link := vsyscall.o trampoline.o $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): ln -fsn $(srctree)/arch/x86_64/kernel/$(notdir $@) $@ Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c @@ -24,8 +24,13 @@ #include <asm/mtrr.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> +#if 1 /* Xen */ #include <asm/mach_apic.h> +#endif #include <asm/proto.h> +#include <asm-xen/evtchn.h> + +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) /* * Smarter SMP flushing macros. @@ -101,6 +106,7 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. 
*/ +#if 0 /* Xen */ asmlinkage void smp_invalidate_interrupt (void) { unsigned long cpu; @@ -249,6 +255,20 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else +asmlinkage void smp_invalidate_interrupt (void) +{ return; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(¤t->mm->cpu_vm_mask); } +void flush_tlb_mm (struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } +#endif /* Xen */ + + void smp_kdb_stop(void) { send_IPI_allbutself(KDB_VECTOR); @@ -308,13 +328,13 @@ static void __smp_call_function (void (* /* Wait for response */ while (atomic_read(&data.started) != cpus) - cpu_relax(); + barrier(); if (!wait) return; while (atomic_read(&data.finished) != cpus) - cpu_relax(); + barrier(); } /* @@ -348,7 +368,11 @@ void smp_stop_cpu(void) */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#if 1 + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -362,8 +386,10 @@ static void smp_really_stop_cpu(void *du void smp_send_stop(void) { int nolock = 0; +#if 0 if (reboot_force) return; +#endif /* Don''t deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { /* ignore locking because we have paniced anyways */ @@ -380,18 +406,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +asmlinkage irqreturn_t smp_reschedule_interrupt(void) { - ack_APIC_irq(); + return IRQ_HANDLED; } -asmlinkage void smp_call_function_interrupt(void) +asmlinkage irqreturn_t smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; - ack_APIC_irq(); /* * Notify initiating CPU that I''ve grabbed the data and am * about to execute the function @@ -408,4 +433,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + + return IRQ_HANDLED; } Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @@ -53,6 +53,7 @@ #include <asm/kdebug.h> #include <asm/tlbflush.h> #include <asm/proto.h> +#include <asm/arch_hooks.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -75,6 +76,7 @@ int smp_threads_ready; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +#if 0 /* * Trampoline 80x86 program as an array. */ @@ -96,6 +98,7 @@ static unsigned long __init setup_trampo memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } +#endif /* * The bootstrap kernel entry code has set these up. Save them for @@ -110,6 +113,7 @@ static void __init smp_store_cpu_info(in identify_cpu(c); } +#if 0 /* * TSC synchronization. * @@ -246,6 +250,7 @@ static void __init synchronize_tsc_ap (v } } #undef NR_LOOPS +#endif static atomic_t init_deasserted; @@ -254,6 +259,7 @@ void __init smp_callin(void) int cpuid, phys_id; unsigned long timeout; +#if 0 /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -261,11 +267,12 @@ void __init smp_callin(void) * lock up on an APIC access. 
*/ while (!atomic_read(&init_deasserted)); +#endif /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); + phys_id = smp_processor_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", @@ -299,6 +306,7 @@ void __init smp_callin(void) cpuid); } +#if 0 /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this @@ -308,6 +316,7 @@ void __init smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); +#endif local_irq_enable(); @@ -317,7 +326,9 @@ void __init smp_callin(void) calibrate_delay(); Dprintk("Stack at about %p\n",&cpuid); +#if 0 disable_APIC_timer(); +#endif /* * Save our processor parameters @@ -331,15 +342,39 @@ void __init smp_callin(void) */ cpu_set(cpuid, cpu_callin_map); +#if 0 /* * Synchronize the TSC with the BP */ if (cpu_has_tsc) synchronize_tsc_ap(); +#endif } int cpucount; +#include <linux/interrupt.h> +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + +extern void local_setup_timer(void); + /* * Activate a secondary processor. 
*/ @@ -360,6 +395,7 @@ void __init start_secondary(void) while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); +#if 0 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -373,6 +409,12 @@ void __init start_secondary(void) enable_APIC_timer(); +#else + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); +#endif /* * low-memory mappings have been cleared, flush them from @@ -428,6 +470,7 @@ static inline void inquire_remote_apic(i } #endif +#if 0 static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { unsigned long send_status = 0, accept_status = 0; @@ -550,6 +593,7 @@ static int __init wakeup_secondary_via_I return (send_status | accept_status); } +#endif static void __init do_boot_cpu (int apicid) { @@ -557,6 +601,14 @@ static void __init do_boot_cpu (int apic unsigned long boot_error; int timeout, cpu; unsigned long start_rip; +#if 1 + vcpu_guest_context_t ctxt; + extern void startup_64_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; +#endif cpu = ++cpucount; /* @@ -570,7 +622,7 @@ static void __init do_boot_cpu (int apic cpu_pda[cpu].pcurrent = idle; - start_rip = setup_trampoline(); + start_rip = (unsigned long)startup_64_smp; init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; @@ -587,6 +639,96 @@ static void __init do_boot_cpu (int apic atomic_set(&init_deasserted, 0); +#if 1 + /* FIXME, this faults + if (cpu) + memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); + */ + cpu_gdt_descr[cpu].size = GDT_SIZE; + //cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; + cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[0]; + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + 
ctxt.user_regs.ss = __KERNEL_DS|0x3; + ctxt.user_regs.cs = __KERNEL_CS|0x3; + ctxt.user_regs.rip = start_rip; + ctxt.user_regs.rsp = idle->thread.rsp; + ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12); + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. */ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + ctxt.gdt_ents = GDT_ENTRIES; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp; + + /* Callback handlers. */ + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(init_level4_pgt); + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + + if (!boot_error) { + /* + * allow APs to start initializing. 
+ */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; +#else Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); @@ -652,6 +794,7 @@ static void __init do_boot_cpu (int apic #endif } } +#endif if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ @@ -709,9 +852,15 @@ static void smp_tune_scheduling (void) * Cycle through the processors sending APIC IPIs to boot each. */ +/* XXX fix me */ +#define time_init_smp() + static void __init smp_boot_cpus(unsigned int max_cpus) { - unsigned apicid, cpu, bit, kicked; + unsigned cpu, kicked; +#if 0 + unsigned apicid, bit; +#endif nmi_watchdog_default(); @@ -725,11 +874,13 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); +#if 0 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } +#endif /* * If we couldn''t find an SMP configuration at boot time, @@ -739,13 +890,16 @@ static void __init smp_boot_cpus(unsigne printk(KERN_NOTICE "SMP motherboard not detected.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." 
" Using dummy APIC emulation.\n"); goto smp_done; } +#if 0 /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. @@ -771,51 +925,50 @@ static void __init smp_boot_cpus(unsigne } verify_local_APIC(); +#endif /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - smp_found_config = 0; + HYPERVISOR_shared_info->n_vcpu = 1; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif disable_apic = 1; goto smp_done; } + smp_intr_init(); + +#if 0 connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) BUG(); +#endif x86_cpu_to_apicid[0] = boot_cpu_id; /* * Now scan the CPU present map and fire up the other CPUs. */ - Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + Dprintk("CPU present map: %lx\n", HYPERVISOR_shared_info->n_vcpu) kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don''t even attempt to start the boot CPU! - */ - if (apicid == boot_cpu_id || (apicid == BAD_APICID)) - continue; - - if (!physid_isset(apicid, phys_cpu_present_map)) + for (cpu = 1; kicked < NR_CPUS && + cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) { + if (max_cpus <= cpucount+1) continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; - - do_boot_cpu(apicid); + do_boot_cpu(cpu); ++kicked; } +#if 0 /* * Cleanup possible dangling ends... */ @@ -833,6 +986,7 @@ static void __init smp_boot_cpus(unsigne *((volatile int *) phys_to_virt(0x467)) = 0; } +#endif /* * Allow the user to impress friends. 
@@ -899,6 +1053,7 @@ static void __init smp_boot_cpus(unsigne else nr_ioapics = 0; +#if 0 setup_boot_APIC_clock(); /* @@ -906,6 +1061,7 @@ static void __init smp_boot_cpus(unsigne */ if (cpu_has_tsc && cpucount) synchronize_tsc_bp(); +#endif smp_done: time_init_smp(); @@ -950,9 +1106,36 @@ int __devinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { +#if 0 #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif zap_low_mappings(); +#endif } +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +void __init smp_intr_init(void) +{ + int cpu = smp_processor_id(); + + per_cpu(resched_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); +} Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c @@ -957,6 +957,17 @@ void __init trap_init(void) cpu_init(); } +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + /* Actual parsing 
is done early in setup.c. */ static int __init oops_dummy(char *s) Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S @@ -8,11 +8,14 @@ #define sizeof_vcpu_shift 3 #ifdef CONFIG_SMP -#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) #define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ movq %gs:pda_cpunumber,reg ; \ - shl $sizeof_vcpu_shift,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ addq HYPERVISOR_shared_info,reg #define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ #define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c @@ -20,7 +20,11 @@ */ atomic_t irq_err_count; - +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif /* * Generic, controller-independent functions: Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S @@ -41,7 +41,6 @@ .code64 ENTRY(_start) cld - movq init_rsp(%rip),%rsp /* Copy the necessary stuff from xen_start_info structure. 
*/ movq $xen_start_info_union,%rdi movq $64,%rcx /* sizeof (union xen_start_info_union) / sizeof (long) */ @@ -52,6 +51,7 @@ ENTRY(_start) cld #endif /* CONFIG_SMP */ + movq init_rsp(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 popfq _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Chris Wright wrote:> Below is the patch I've been working with to get x86_64 SMP support > going. It's a definite work in progress. Currently, I'm able to > build > a second vcpu from dom0, but when it gets scheduled it crashes. > Booting > the same kernel with maxcpus=1 gets up to login prompt. Here's some > known issues: Good progress and good start, i.e. we have at least one virtual processor there :-). We'll look at the problem too.> > entry.S: > - preempt_{en,dis}able is commented out for now (due to thread_info)> - ugly shl,shr combo to clear upper 32 bits. > smpboot.c: > - faults if writing to cpu_gdt_table[1], so doing bogus > reuse of cpu 0 table entries directly >Jun --- Intel Open Source Technology Center _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Nakajima, Jun (jun.nakajima@intel.com) wrote:> Good progress and good start, i.e. we have at least one virtual > processor there :-). > > We'll look at the problem too.Great, thanks. I'm pretty suspect of the gdt, but only on a hunch. Any clues why the copy is faulting might be useful. thanks, -chris _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Chris Wright (chrisw@osdl.org) wrote:> * Nakajima, Jun (jun.nakajima@intel.com) wrote: > > Good progress and good start, i.e. we have at least one virutal > > processor there :-). > > > > We''ll look at the problem too. > > Great, thanks. I''m pretty suspect of the gdt, but only on a hunch. > Any clues why the copy is faulting might be useful.Here''s an updated patch that fixes that faulting. It uses gdt per page like i386 is doing, still crashing upon schedule. Why does each gdt entry need it''s own page? thanks, -chris -- ===== linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile 1.5 vs edited ====Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o @@ -53,7 +53,7 @@ intel_cacheinfo-y += ../../../i386/kern quirks-y += ../../../i386/kernel/quirks.o c-link := init_task.o -s-link := vsyscall.o +s-link := vsyscall.o trampoline.o $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): ln -fsn $(srctree)/arch/x86_64/kernel/$(notdir $@) $@ Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c @@ -24,8 +24,13 @@ #include <asm/mtrr.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> 
+#if 1 /* Xen */ #include <asm/mach_apic.h> +#endif #include <asm/proto.h> +#include <asm-xen/evtchn.h> + +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) /* * Smarter SMP flushing macros. @@ -101,6 +106,7 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ +#if 0 /* Xen */ asmlinkage void smp_invalidate_interrupt (void) { unsigned long cpu; @@ -249,6 +255,20 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else +asmlinkage void smp_invalidate_interrupt (void) +{ return; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(¤t->mm->cpu_vm_mask); } +void flush_tlb_mm (struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } +#endif /* Xen */ + + void smp_kdb_stop(void) { send_IPI_allbutself(KDB_VECTOR); @@ -308,13 +328,13 @@ static void __smp_call_function (void (* /* Wait for response */ while (atomic_read(&data.started) != cpus) - cpu_relax(); + barrier(); if (!wait) return; while (atomic_read(&data.finished) != cpus) - cpu_relax(); + barrier(); } /* @@ -348,7 +368,11 @@ void smp_stop_cpu(void) */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#if 1 + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -362,8 +386,10 @@ static void smp_really_stop_cpu(void *du void smp_send_stop(void) { int nolock = 0; +#if 0 if (reboot_force) return; +#endif /* Don''t deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { /* ignore locking because we have paniced anyways */ @@ -380,18 +406,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +asmlinkage irqreturn_t smp_reschedule_interrupt(void) { - ack_APIC_irq(); + return IRQ_HANDLED; } -asmlinkage void smp_call_function_interrupt(void) +asmlinkage irqreturn_t smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; - ack_APIC_irq(); /* * Notify initiating CPU that I''ve grabbed the data and am * about to execute the function @@ -408,4 +433,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + + return IRQ_HANDLED; } Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @@ -53,6 +53,7 @@ #include <asm/kdebug.h> #include <asm/tlbflush.h> #include <asm/proto.h> +#include <asm/arch_hooks.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -75,6 +76,7 @@ int smp_threads_ready; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +#if 0 /* * Trampoline 80x86 program as an array. */ @@ -96,6 +98,7 @@ static unsigned long __init setup_trampo memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } +#endif /* * The bootstrap kernel entry code has set these up. Save them for @@ -110,6 +113,7 @@ static void __init smp_store_cpu_info(in identify_cpu(c); } +#if 0 /* * TSC synchronization. * @@ -246,6 +250,7 @@ static void __init synchronize_tsc_ap (v } } #undef NR_LOOPS +#endif static atomic_t init_deasserted; @@ -254,6 +259,7 @@ void __init smp_callin(void) int cpuid, phys_id; unsigned long timeout; +#if 0 /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -261,11 +267,12 @@ void __init smp_callin(void) * lock up on an APIC access. 
*/ while (!atomic_read(&init_deasserted)); +#endif /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); + phys_id = smp_processor_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", @@ -299,6 +306,7 @@ void __init smp_callin(void) cpuid); } +#if 0 /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this @@ -308,6 +316,7 @@ void __init smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); +#endif local_irq_enable(); @@ -317,7 +326,9 @@ void __init smp_callin(void) calibrate_delay(); Dprintk("Stack at about %p\n",&cpuid); +#if 0 disable_APIC_timer(); +#endif /* * Save our processor parameters @@ -331,15 +342,39 @@ void __init smp_callin(void) */ cpu_set(cpuid, cpu_callin_map); +#if 0 /* * Synchronize the TSC with the BP */ if (cpu_has_tsc) synchronize_tsc_ap(); +#endif } int cpucount; +#include <linux/interrupt.h> +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + +extern void local_setup_timer(void); + /* * Activate a secondary processor. 
*/ @@ -360,6 +395,7 @@ void __init start_secondary(void) while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); +#if 0 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -373,6 +409,12 @@ void __init start_secondary(void) enable_APIC_timer(); +#else + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); +#endif /* * low-memory mappings have been cleared, flush them from @@ -428,6 +470,7 @@ static inline void inquire_remote_apic(i } #endif +#if 0 static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { unsigned long send_status = 0, accept_status = 0; @@ -550,6 +593,7 @@ static int __init wakeup_secondary_via_I return (send_status | accept_status); } +#endif static void __init do_boot_cpu (int apicid) { @@ -557,6 +601,14 @@ static void __init do_boot_cpu (int apic unsigned long boot_error; int timeout, cpu; unsigned long start_rip; +#if 1 + vcpu_guest_context_t ctxt; + extern void startup_64_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; +#endif cpu = ++cpucount; /* @@ -570,7 +622,7 @@ static void __init do_boot_cpu (int apic cpu_pda[cpu].pcurrent = idle; - start_rip = setup_trampoline(); + start_rip = (unsigned long)startup_64_smp; init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; @@ -587,6 +639,94 @@ static void __init do_boot_cpu (int apic atomic_set(&init_deasserted, 0); +#if 1 + if (cpu_gdt_descr[0].size > PAGE_SIZE) + BUG(); + cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + memcpy((void *)cpu_gdt_descr[cpu].address, + (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS|0x3; + ctxt.user_regs.cs = 
__KERNEL_CS|0x3; + ctxt.user_regs.rip = start_rip; + ctxt.user_regs.rsp = idle->thread.rsp; + ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12); + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. */ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + ctxt.gdt_ents = GDT_ENTRIES; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp; + + /* Callback handlers. */ + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(init_level4_pgt); + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + + if (!boot_error) { + /* + * allow APs to start initializing. 
+ */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; +#else Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); @@ -652,6 +792,7 @@ static void __init do_boot_cpu (int apic #endif } } +#endif if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ @@ -709,9 +850,15 @@ static void smp_tune_scheduling (void) * Cycle through the processors sending APIC IPIs to boot each. */ +/* XXX fix me */ +#define time_init_smp() + static void __init smp_boot_cpus(unsigned int max_cpus) { - unsigned apicid, cpu, bit, kicked; + unsigned cpu, kicked; +#if 0 + unsigned apicid, bit; +#endif nmi_watchdog_default(); @@ -725,11 +872,13 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); +#if 0 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } +#endif /* * If we couldn''t find an SMP configuration at boot time, @@ -739,13 +888,16 @@ static void __init smp_boot_cpus(unsigne printk(KERN_NOTICE "SMP motherboard not detected.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." 
" Using dummy APIC emulation.\n"); goto smp_done; } +#if 0 /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. @@ -771,51 +923,50 @@ static void __init smp_boot_cpus(unsigne } verify_local_APIC(); +#endif /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - smp_found_config = 0; + HYPERVISOR_shared_info->n_vcpu = 1; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif disable_apic = 1; goto smp_done; } + smp_intr_init(); + +#if 0 connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) BUG(); +#endif x86_cpu_to_apicid[0] = boot_cpu_id; /* * Now scan the CPU present map and fire up the other CPUs. */ - Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + Dprintk("CPU present map: %lx\n", HYPERVISOR_shared_info->n_vcpu) kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don''t even attempt to start the boot CPU! - */ - if (apicid == boot_cpu_id || (apicid == BAD_APICID)) - continue; - - if (!physid_isset(apicid, phys_cpu_present_map)) + for (cpu = 1; kicked < NR_CPUS && + cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) { + if (max_cpus <= cpucount+1) continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; - - do_boot_cpu(apicid); + do_boot_cpu(cpu); ++kicked; } +#if 0 /* * Cleanup possible dangling ends... */ @@ -833,6 +984,7 @@ static void __init smp_boot_cpus(unsigne *((volatile int *) phys_to_virt(0x467)) = 0; } +#endif /* * Allow the user to impress friends. 
@@ -899,6 +1051,7 @@ static void __init smp_boot_cpus(unsigne else nr_ioapics = 0; +#if 0 setup_boot_APIC_clock(); /* @@ -906,6 +1059,7 @@ static void __init smp_boot_cpus(unsigne */ if (cpu_has_tsc && cpucount) synchronize_tsc_bp(); +#endif smp_done: time_init_smp(); @@ -950,9 +1104,36 @@ int __devinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { +#if 0 #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif zap_low_mappings(); +#endif } +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +void __init smp_intr_init(void) +{ + int cpu = smp_processor_id(); + + per_cpu(resched_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); +} Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c @@ -957,6 +957,17 @@ void __init trap_init(void) cpu_init(); } +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + /* Actual parsing 
is done early in setup.c. */ static int __init oops_dummy(char *s) Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S @@ -8,11 +8,14 @@ #define sizeof_vcpu_shift 3 #ifdef CONFIG_SMP -#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) #define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ movq %gs:pda_cpunumber,reg ; \ - shl $sizeof_vcpu_shift,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ addq HYPERVISOR_shared_info,reg #define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ #define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c @@ -20,7 +20,11 @@ */ atomic_t irq_err_count; - +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif /* * Generic, controller-independent functions: Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S @@ -41,7 +41,6 @@ .code64 ENTRY(_start) cld - movq init_rsp(%rip),%rsp /* Copy the necessary stuff from xen_start_info structure. 
*/ movq $xen_start_info_union,%rdi movq $64,%rcx /* sizeof (union xen_start_info_union) / sizeof (long) */ @@ -52,6 +51,7 @@ ENTRY(_start) cld #endif /* CONFIG_SMP */ + movq init_rsp(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 popfq Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c @@ -601,6 +601,17 @@ static void __init print_memory_map(char } } +void __init smp_alloc_memory(void) +{ + int cpu; + + for (cpu = 1; cpu < NR_CPUS; cpu++) { + cpu_gdt_descr[cpu].address = (unsigned long) + alloc_bootmem_low_pages(PAGE_SIZE); + /* XXX free unused pages later */ + } +} + void __init setup_arch(char **cmdline_p) { unsigned long low_mem_size; @@ -742,6 +753,9 @@ void __init setup_arch(char **cmdline_p) } } #endif +#ifdef CONFIG_SMP + smp_alloc_memory(); +#endif paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Chris Wright wrote:> * Chris Wright (chrisw@osdl.org) wrote: >> * Nakajima, Jun (jun.nakajima@intel.com) wrote: >>> Good progress and good start, i.e. we have at least one virtual >>> processor there :-). >>> >>> We'll look at the problem too. >> >> Great, thanks. I'm pretty suspect of the gdt, but only on a hunch. >> Any clues why the copy is faulting might be useful. > > Here's an updated patch that fixes that faulting. It uses gdt per > page like i386 is doing, still crashing upon schedule. Why does each > gdt entry need its own page? > > thanks, > -chris

Each gdt entry? I think gdt should be per vcpu, but I don't understand "each gdt entry". Can you please point to the code you are talking about? Jun --- Intel Open Source Technology Center _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Nakajima, Jun (jun.nakajima@intel.com) wrote:> Chris Wright wrote: > > Here's an updated patch that fixes that faulting. It uses gdt per > > page like i386 is doing, still crashing upon schedule. Why does each > > gdt entry need its own page? > > Each gdt entry? I think gdt should be per vcpu, but I don't understand > "each gdt entry". Can you please point to the code you are talking > about?

Yes, the table is in arch/xen/x86_64/kernel/head.S (cpu_gdt_table, which has 16 8 byte entries per cpu). However rather than using that table directly, we must copy the entries into a per cpu page. Xen side has assumptions regarding gdt being page sized (8 bytes * 512 entries, e.g. in xen/arch/x86/mm.c in alloc_segdesc_page). That help? thanks, -chris _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 27 Jun 2005, at 22:52, Chris Wright wrote:> Yes, the table is in arch/xen/x86_64/kernel/head.S (cpu_gdt_table, > which > has 16 8 byte entries per cpu). However rather than using that table > directly, > we must copy the entries into a per cpu page. Xen side has assumptions > regarding gdt being page sized (8 bytes * 512 entries, e.g. in > xen/arch/x86/mm.c in alloc_segdesc_page). That help?

Xen makes a shadow mapping of the per-vcpu gdt in its own address space. This, coupled with Xen reserving the last 2 pages of GDT entries for itself, requires every GDT to start on a page boundary. So, even though per-cpu gdts are not page-aligned they must be at least one page in size. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 27 Jun 2005, at 22:55, Keir Fraser wrote:> Xen makes a shadow mapping of the per-vcpu gdt in its own address > space. This, coupled with Xen reserving the last 2 pages of GDT > entries for itself, requires every GDT to start on a page boundary. > So, even though per-cpu gdts are not page-aligned they must be at > least one page in size.

Last sentence should be "per-cpu gdts have to be page-aligned, so each one burns a page". :-) -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Keir Fraser (Keir.Fraser@cl.cam.ac.uk) wrote:> Xen makes a shadow mapping of the per-vcpu gdt in its own address > space. This, coupled with Xen reserving the last 2 pages of GDT entries > for itself, requires every GDT to start on a page boundary. So, even > though per-cpu gdts are not page-aligned they must be at least one page > in size.

ah, thanks! i should've realized that. at one point i moved the cpu_gdt_table, and it was no longer page aligned (nor was it a page long) and vcpu 0 wouldn't come up any longer. making sure it was aligned and on its own page got the first cpu going again...didn't think to generalize that ;-) thanks, -chris _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Nakajima, Jun
2005-Jun-27 22:15 UTC
RE: [Xen-devel] Re: x86_64 SMP support (status update)
Keir Fraser wrote:> On 27 Jun 2005, at 22:55, Keir Fraser wrote: > >> Xen makes a shadow mapping of the per-vcpu gdt in its own address >> space. This, coupled with Xen reserving the last 2 pages of GDT >> entries for itself, requires every GDT to start on a page boundary. >> So, even though per-cpu gdts are not page-aligned they must be at >> least one page in size. > > Last sentence should be "per-cpu gdts have to be page-aligned, so each > one burns a page". :-) > > -- KeirBTW, I''m debugging nptl01 in LTP when running in domU. Most of the test cases pass except this one; kernel build was a piece of cake there :-). It uses the FS base for TLS on x86_64, and does a number of domain switching. I think this is a race condition with GDT switching, but what happens is Xen sometimes causes #GP at => below because the entry does not exist in GDT (fs = 0x5b). The nptl01 runs fine on dom0 as long as it runs _alone_. It starts failing with presence of domUs. I feel this implies some problems with GDT switching. Is there any race you think of where modifications to GDT (done by do_update_descriptor) are not be visible or deferred? static void load_segments(struct vcpu *p, struct vcpu *n) { struct vcpu_guest_context *pctxt = &p->arch.guest_context; struct vcpu_guest_context *nctxt = &n->arch.guest_context; ... /* * Either selector != 0 ==> reload. * Also reload to reset FS_BASE if it was non-zero. */ if ( unlikely(pctxt->user_regs.fs | pctxt->fs_base | nctxt->user_regs.fs) ) { => all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs); if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */ pctxt->fs_base = 0; } Jun --- Intel Open Source Technology Center _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 27 Jun 2005, at 23:15, Nakajima, Jun wrote:> The nptl01 runs fine on dom0 as long as it > runs _alone_. It starts failing with presence of domUs. I feel this > implies some problems with GDT switching. Is there any race you think > of > where modifications to GDT (done by do_update_descriptor) are not be > visible or deferred?

If you change GDT page mappings (by switching %cr3) but don't change GDT linear address or size, I don't know whether we must execute a LGDT instruction or not. Currently we don't. So you might want to try replacing the following test in __context_switch(): if ( p->vcpu_id != n->vcpu_id ) With: if ( 1 ) And see if that fixes things... Even if not, it'd be interesting to find out whether LGDT is potentially required in the situation described in my first sentence. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Nakajima, Jun
2005-Jun-27 22:35 UTC
RE: [Xen-devel] Re: x86_64 SMP support (status update)
Keir Fraser wrote:> On 27 Jun 2005, at 23:15, Nakajima, Jun wrote: > >> The nptl01 runs fine on dom0 as long as it >> runs _alone_. It starts failing with presence of domUs. I feel this >> implies some problems with GDT switching. Is there any race you >> think of where modifications to GDT (done by do_update_descriptor) >> are not be visible or deferred? > > If you change GDT page mappings (by switching %cr3) but don't change > GDT linear address or size, I don't know whether we must execute a > LGDT instruction or not. Currently we don't. > > So you might want to try replacing the following test in > __context_switch(): > if ( p->vcpu_id != n->vcpu_id ) > With: > if ( 1 ) > And see if that fixes things...

Actually I already tried that, but it did not help...

> > Even if not, it'd be interesting to find out whether LGDT is > potentially required in the situation described in my first sentence. > > -- Keir >

I'll continue to look at it. Jun --- Intel Open Source Technology Center _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 27 Jun 2005, at 23:15, Nakajima, Jun wrote:> The nptl01 runs fine on dom0 as long as it > runs _alone_. It starts failing with presence of domUs. I feel this > implies some problems with GDT switching. Is there any race you think > of > where modifications to GDT (done by do_update_descriptor) are not be > visible or deferred?

Actually, the bug is in XenLinux. You need to save DS-GS (and possibly also clear each one, if it is not already zero) before switching CR3 in switch_mm. We use prepare_arch_switch() hook in include/asm-xen/asm-i386/mmu_context.h for this purpose (although on i386 we only need to save/clear FS-GS at that point). What is currently happening is that a domain ctxt switch is happening after switch_mm but before __switch_to. So when Xen switches back to your test domain, it tries to load old process's FS value and fails -- traps up to XenLinux which sets FS to zero. So you end up saving FS==0 for the nptl process and next time XenLinux switches to it you are toast. :-) Solution: Move segment save/clear to prepare_arch_switch. Remove segment save from __switch_to. If you choose to clear segment registers in prepare_arch_switch, remove the test of prev->{ds,es,fs,gs} from segment reload tests. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Nakajima, Jun (jun.nakajima@intel.com) wrote:> Good progress and good start, i.e. we have at least one virtual > processor there :-).

And now some processes. pid cpu name 1 0 swapper 2 0 migration/0 3 0 ksoftirqd/0 4 1 migration/1 5 1 ksoftirqd/1 getting closer...it looks like it's crashing near the end of setting up sched domains... thanks, -chris _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Chris, I'm here working on x86_64 SMP support too, can you send me your patch? I'm using your previous patch, and get some page faults while booting the secondary CPU. -Xin

Chris Wright wrote:> * Nakajima, Jun (jun.nakajima@intel.com) wrote: >> Good progress and good start, i.e. we have at least one >> virtual processor there :-). > > And now some processes. > > pid cpu name > 1 0 swapper > 2 0 migration/0 > 3 0 ksoftirqd/0 > 4 1 migration/1 > 5 1 ksoftirqd/1 > > getting closer...it looks like it's crashing near the end > of setting up sched domains... > > thanks, > -chris > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Li, Xin B (xin.b.li@intel.com) wrote:> Chris, I''m here working on x86_64 SMP support too, can you send me your > patch? > I''m using your previous patch, and get some page faults while boot the > secondary CPU.Yes, here it is, there''s only one line change (sorry, I thought this change was in the last one). This should get you to faulting on the first CPU (which is actually progress ;-). I think the second CPU is happliy idling in cpu_idle, and the first one is back to finishing up booting when it dies. thanks, -chris ===== linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile 1.5 vs edited ====Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o @@ -53,7 +53,7 @@ intel_cacheinfo-y += ../../../i386/kern quirks-y += ../../../i386/kernel/quirks.o c-link := init_task.o -s-link := vsyscall.o +s-link := vsyscall.o trampoline.o $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): ln -fsn $(srctree)/arch/x86_64/kernel/$(notdir $@) $@ Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c @@ -24,8 +24,13 @@ #include <asm/mtrr.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> +#if 1 
/* Xen */ #include <asm/mach_apic.h> +#endif #include <asm/proto.h> +#include <asm-xen/evtchn.h> + +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) /* * Smarter SMP flushing macros. @@ -101,6 +106,7 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ +#if 0 /* Xen */ asmlinkage void smp_invalidate_interrupt (void) { unsigned long cpu; @@ -249,6 +255,20 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else +asmlinkage void smp_invalidate_interrupt (void) +{ return; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(¤t->mm->cpu_vm_mask); } +void flush_tlb_mm (struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } +#endif /* Xen */ + + void smp_kdb_stop(void) { send_IPI_allbutself(KDB_VECTOR); @@ -308,13 +328,13 @@ static void __smp_call_function (void (* /* Wait for response */ while (atomic_read(&data.started) != cpus) - cpu_relax(); + barrier(); if (!wait) return; while (atomic_read(&data.finished) != cpus) - cpu_relax(); + barrier(); } /* @@ -348,7 +368,11 @@ void smp_stop_cpu(void) */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#if 1 + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -362,8 +386,10 @@ static void smp_really_stop_cpu(void *du void smp_send_stop(void) { int nolock = 0; +#if 0 if (reboot_force) return; +#endif /* Don''t deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { /* ignore locking because we have paniced anyways */ @@ -380,18 +406,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +asmlinkage irqreturn_t smp_reschedule_interrupt(void) { - ack_APIC_irq(); + return IRQ_HANDLED; } -asmlinkage void smp_call_function_interrupt(void) +asmlinkage irqreturn_t smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; - ack_APIC_irq(); /* * Notify initiating CPU that I''ve grabbed the data and am * about to execute the function @@ -408,4 +433,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + + return IRQ_HANDLED; } Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @@ -53,6 +53,7 @@ #include <asm/kdebug.h> #include <asm/tlbflush.h> #include <asm/proto.h> +#include <asm/arch_hooks.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -75,6 +76,7 @@ int smp_threads_ready; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +#if 0 /* * Trampoline 80x86 program as an array. */ @@ -96,6 +98,7 @@ static unsigned long __init setup_trampo memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } +#endif /* * The bootstrap kernel entry code has set these up. Save them for @@ -110,6 +113,7 @@ static void __init smp_store_cpu_info(in identify_cpu(c); } +#if 0 /* * TSC synchronization. * @@ -246,6 +250,7 @@ static void __init synchronize_tsc_ap (v } } #undef NR_LOOPS +#endif static atomic_t init_deasserted; @@ -254,6 +259,7 @@ void __init smp_callin(void) int cpuid, phys_id; unsigned long timeout; +#if 0 /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -261,11 +267,12 @@ void __init smp_callin(void) * lock up on an APIC access. 
*/ while (!atomic_read(&init_deasserted)); +#endif /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); + phys_id = smp_processor_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", @@ -299,6 +306,7 @@ void __init smp_callin(void) cpuid); } +#if 0 /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this @@ -308,6 +316,7 @@ void __init smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); +#endif local_irq_enable(); @@ -317,7 +326,9 @@ void __init smp_callin(void) calibrate_delay(); Dprintk("Stack at about %p\n",&cpuid); +#if 0 disable_APIC_timer(); +#endif /* * Save our processor parameters @@ -331,15 +342,39 @@ void __init smp_callin(void) */ cpu_set(cpuid, cpu_callin_map); +#if 0 /* * Synchronize the TSC with the BP */ if (cpu_has_tsc) synchronize_tsc_ap(); +#endif } int cpucount; +#include <linux/interrupt.h> +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + +extern void local_setup_timer(void); + /* * Activate a secondary processor. 
*/ @@ -360,6 +395,7 @@ void __init start_secondary(void) while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); +#if 0 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -373,6 +409,12 @@ void __init start_secondary(void) enable_APIC_timer(); +#else + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); +#endif /* * low-memory mappings have been cleared, flush them from @@ -428,6 +470,7 @@ static inline void inquire_remote_apic(i } #endif +#if 0 static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { unsigned long send_status = 0, accept_status = 0; @@ -550,6 +593,7 @@ static int __init wakeup_secondary_via_I return (send_status | accept_status); } +#endif static void __init do_boot_cpu (int apicid) { @@ -557,6 +601,14 @@ static void __init do_boot_cpu (int apic unsigned long boot_error; int timeout, cpu; unsigned long start_rip; +#if 1 + vcpu_guest_context_t ctxt; + extern void startup_64_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; +#endif cpu = ++cpucount; /* @@ -570,7 +622,7 @@ static void __init do_boot_cpu (int apic cpu_pda[cpu].pcurrent = idle; - start_rip = setup_trampoline(); + start_rip = (unsigned long)startup_64_smp; init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; @@ -587,6 +639,94 @@ static void __init do_boot_cpu (int apic atomic_set(&init_deasserted, 0); +#if 1 + if (cpu_gdt_descr[0].size > PAGE_SIZE) + BUG(); + cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + memcpy((void *)cpu_gdt_descr[cpu].address, + (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS|0x3; + ctxt.user_regs.cs = 
__KERNEL_CS|0x3; + ctxt.user_regs.rip = start_rip; + ctxt.user_regs.rsp = idle->thread.rsp; + ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12); + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. */ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + ctxt.gdt_ents = GDT_ENTRIES; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp; + + /* Callback handlers. */ + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(init_level4_pgt); + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + + if (!boot_error) { + /* + * allow APs to start initializing. 
+ */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; +#else Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); @@ -652,6 +792,7 @@ static void __init do_boot_cpu (int apic #endif } } +#endif if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ @@ -709,9 +850,15 @@ static void smp_tune_scheduling (void) * Cycle through the processors sending APIC IPIs to boot each. */ +/* XXX fix me */ +#define time_init_smp() + static void __init smp_boot_cpus(unsigned int max_cpus) { - unsigned apicid, cpu, bit, kicked; + unsigned cpu, kicked; +#if 0 + unsigned apicid, bit; +#endif nmi_watchdog_default(); @@ -725,11 +872,13 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); +#if 0 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } +#endif /* * If we couldn''t find an SMP configuration at boot time, @@ -739,13 +888,16 @@ static void __init smp_boot_cpus(unsigne printk(KERN_NOTICE "SMP motherboard not detected.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." 
" Using dummy APIC emulation.\n"); goto smp_done; } +#if 0 /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. @@ -771,51 +923,50 @@ static void __init smp_boot_cpus(unsigne } verify_local_APIC(); +#endif /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - smp_found_config = 0; + HYPERVISOR_shared_info->n_vcpu = 1; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); io_apic_irqs = 0; cpu_online_map = cpumask_of_cpu(0); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif disable_apic = 1; goto smp_done; } + smp_intr_init(); + +#if 0 connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) BUG(); +#endif x86_cpu_to_apicid[0] = boot_cpu_id; /* * Now scan the CPU present map and fire up the other CPUs. */ - Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + Dprintk("CPU present map: %lx\n", HYPERVISOR_shared_info->n_vcpu) kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don''t even attempt to start the boot CPU! - */ - if (apicid == boot_cpu_id || (apicid == BAD_APICID)) - continue; - - if (!physid_isset(apicid, phys_cpu_present_map)) + for (cpu = 1; kicked < NR_CPUS && + cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) { + if (max_cpus <= cpucount+1) continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; - - do_boot_cpu(apicid); + do_boot_cpu(cpu); ++kicked; } +#if 0 /* * Cleanup possible dangling ends... */ @@ -833,6 +984,7 @@ static void __init smp_boot_cpus(unsigne *((volatile int *) phys_to_virt(0x467)) = 0; } +#endif /* * Allow the user to impress friends. 
@@ -899,6 +1051,7 @@ static void __init smp_boot_cpus(unsigne else nr_ioapics = 0; +#if 0 setup_boot_APIC_clock(); /* @@ -906,6 +1059,7 @@ static void __init smp_boot_cpus(unsigne */ if (cpu_has_tsc && cpucount) synchronize_tsc_bp(); +#endif smp_done: time_init_smp(); @@ -950,9 +1104,36 @@ int __devinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { +#if 0 #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif zap_low_mappings(); +#endif } +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +void __init smp_intr_init(void) +{ + int cpu = smp_processor_id(); + + per_cpu(resched_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) + bind_ipi_on_cpu_to_irq(cpu, CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); +} Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c @@ -957,6 +957,17 @@ void __init trap_init(void) cpu_init(); } +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + /* Actual parsing 
is done early in setup.c. */ static int __init oops_dummy(char *s) Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S @@ -8,11 +8,14 @@ #define sizeof_vcpu_shift 3 #ifdef CONFIG_SMP -#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) #define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ movq %gs:pda_cpunumber,reg ; \ - shl $sizeof_vcpu_shift,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ addq HYPERVISOR_shared_info,reg #define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ #define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c @@ -20,7 +20,11 @@ */ atomic_t irq_err_count; - +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif /* * Generic, controller-independent functions: Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S @@ -41,7 +41,6 @@ .code64 ENTRY(_start) cld - movq init_rsp(%rip),%rsp /* Copy the necessary stuff from xen_start_info structure. 
*/ movq $xen_start_info_union,%rdi movq $64,%rcx /* sizeof (union xen_start_info_union) / sizeof (long) */ @@ -52,6 +51,7 @@ ENTRY(_start) cld #endif /* CONFIG_SMP */ + movq init_rsp(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 popfq Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c @@ -601,6 +601,17 @@ static void __init print_memory_map(char } } +void __init smp_alloc_memory(void) +{ + int cpu; + + for (cpu = 1; cpu < NR_CPUS; cpu++) { + cpu_gdt_descr[cpu].address = (unsigned long) + alloc_bootmem_low_pages(PAGE_SIZE); + /* XXX free unused pages later */ + } +} + void __init setup_arch(char **cmdline_p) { unsigned long low_mem_size; @@ -742,6 +753,9 @@ void __init setup_arch(char **cmdline_p) } } #endif +#ifdef CONFIG_SMP + smp_alloc_memory(); +#endif paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* Index: xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup64.c ==================================================================--- xen-unstable.orig/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup64.c +++ xen-unstable/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup64.c @@ -264,13 +264,13 @@ void __init cpu_init (void) * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ +#if 0 if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); } cpu_gdt_descr[cpu].size = GDT_SIZE; cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; -#if 0 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); asm volatile("lidt %0" :: "m" (idt_descr)); #endif _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Chris Wright wrote:> Yes, here it is, there's only one line change (sorry, I > thought this > change was in the last one). This should get you to > faulting on the > first CPU (which is actually progress ;-). I think the > second CPU is > happily idling in cpu_idle, and the first one is back to > finishing up > booting when it dies.Chris, cpu_gdt_table in latest bk has only 15 entries, I think that's not correct. ______________________________________________________________ ENTRY(cpu_gdt_table) /* The TLS descriptors are currently at a different place compared to i386. Hopefully nobody expects them at a fixed place (Wine?) */ .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x008ffa000000ffff /* __KERNEL_COMPAT32_CS */ .quad 0x00affa000000ffff /* __KERNEL_CS */ .quad 0x00cff2000000ffff /* __KERNEL_DS */ .quad 0x00cffa000000ffff /* __USER32_CS */ .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ .quad 0x00affa000000ffff /* __USER_CS */ .quad 0x00cffa000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ .quad 0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ .quad 0 /* unused now */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ /* GDTs of other CPUs: */ .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) ______________________________________________________________ On x86_64, GDT_SIZE = GDT_ENTRIES * 8, and GDT_ENTRIES = 16. seems we missed one LDT entry here compared to the native x86_64 linux kernel. -Xin _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
* Li, Xin B (xin.b.li@intel.com) wrote:> Chris Wright wrote: > > Yes, here it is, there's only one line change (sorry, I > > thought this > > change was in the last one). This should get you to > > faulting on the > > first CPU (which is actually progress ;-). I think the > > second CPU is > > happily idling in cpu_idle, and the first one is back to > > finishing up > > booting when it dies. > > Chris, cpu_gdt_table in latest bk has only 15 entries, I think that's > not correct.I noticed that, and thought I saw the same off-by-one for i386 (with 31), but I just recounted and it's correct. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel