The following patch provides x86_64 SMP support for xen linux. Many, many thanks to Jun and Xin for their help with bug fixes, cleanups, good domU support, and more. I've tested this on a couple of different machines, and made sure the UP build still boots and works. dom0 and domU smp both work. There are still some cleanups to do, but we'd prefer to do the last bit in tree. This doesn't update defconfigs just yet. That'll be a follow-on patch shortly, so for now, you'll have to manually enable CONFIG_SMP if you'd like to test. Signed-off-by: Chris Wright <chrisw@osdl.org> Signed-off-by: Xin Li <xin.b.li@intel.com> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> --- linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig | 4 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile | 5 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c | 2 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic.c | 123 ++++++ linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic_xen.c | 167 +++++++++ linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S | 3 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c | 5 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c | 20 + linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c | 2 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c | 58 +++ linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c | 254 ++++++++++++++ linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c | 11 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S | 9 13 files changed, 656 insertions(+), 7 deletions(-) Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -25,11 +25,12 @@ obj-$(CONFIG_ACPI_BOOT) += acpi/ c-obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o 
trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o -c-obj-$(CONFIG_X86_IO_APIC) += genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_X86_XEN_GENAPIC) += genapic.o genapic_xen.o +c-obj-$(CONFIG_X86_IO_APIC) += genapic_cluster.o genapic_flat.o #obj-$(CONFIG_PM) += suspend.o #obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o #obj-$(CONFIG_CPU_FREQ) += cpufreq/ Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c @@ -48,7 +48,7 @@ void smp_local_timer_interrupt(struct pt int cpu = smp_processor_id(); profile_tick(CPU_PROFILING, regs); -#if 0 +#ifndef CONFIG_XEN if (--per_cpu(prof_counter, cpu) <= 0) { /* * The multiplier may have changed since the last time we got Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S @@ -41,7 +41,6 @@ startup_64: ENTRY(_start) cld - movq init_rsp(%rip),%rsp /* Copy the necessary stuff from xen_start_info structure. */ movq $xen_start_info_union,%rdi movq $64,%rcx /* sizeof (union xen_start_info_union) / sizeof (long) */ @@ -52,6 +51,7 @@ ENTRY(_start) cld #endif /* CONFIG_SMP */ + movq init_rsp(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 popfq @@ -204,6 +204,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ + .quad 0 /* unused now? __KERNEL16_CS - 16bit PM for S3 wakeup. 
*/ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c @@ -21,6 +21,11 @@ atomic_t irq_err_count; +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif /* * Generic, controller-independent functions: Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c @@ -200,12 +200,14 @@ static struct resource adapter_rom_resou #define ADAPTER_ROM_RESOURCES \ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) +#ifdef CONFIG_XEN_PRIVILEGED_GUEST static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, .end = 0xc7fff, .flags = IORESOURCE_ROM, }; +#endif static struct resource video_ram_resource = { .name = "Video RAM area", @@ -599,6 +601,19 @@ static void __init print_memory_map(char } } +#ifdef CONFIG_XEN +void __init smp_alloc_memory(void) +{ + int cpu; + + for (cpu = 1; cpu < NR_CPUS; cpu++) { + cpu_gdt_descr[cpu].address = (unsigned long) + alloc_bootmem_low_pages(PAGE_SIZE); + /* XXX free unused pages later */ + } +} +#endif + void __init setup_arch(char **cmdline_p) { int i, j; @@ -740,6 +755,11 @@ void __init setup_arch(char **cmdline_p) } } #endif +#ifdef CONFIG_SMP +#ifdef CONFIG_XEN + smp_alloc_memory(); +#endif +#endif paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c +++ 
xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c @@ -276,9 +276,11 @@ void __init cpu_init (void) * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ +#ifndef CONFIG_XEN if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); } +#endif cpu_gdt_descr[cpu].size = GDT_SIZE; cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c @@ -28,7 +28,12 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apicdef.h> +#ifdef CONFIG_XEN +#include <asm-xen/evtchn.h> +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) + +#else /* * Smarter SMP flushing macros. * c/o Linus Torvalds. @@ -44,6 +49,7 @@ static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); #define FLUSH_ALL -1ULL +#endif /* * We cannot call mmdrop() because we are in interrupt context, @@ -57,6 +63,7 @@ static inline void leave_mm (unsigned lo load_cr3(swapper_pg_dir); } +#ifndef CONFIG_XEN /* * * The flush IPI assumes that a thread switch happens in this order: @@ -250,6 +257,18 @@ void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else +asmlinkage void smp_invalidate_interrupt (void) +{ return; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); } +void flush_tlb_mm (struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } +#endif /* Xen */ void smp_kdb_stop(void) { @@ -310,13 +329,21 @@ static void __smp_call_function (void (* /* Wait for response */ while 
(atomic_read(&data.started) != cpus) +#ifndef CONFIG_XEN cpu_relax(); +#else + barrier(); +#endif if (!wait) return; while (atomic_read(&data.finished) != cpus) +#ifndef CONFIG_XEN cpu_relax(); +#else + barrier(); +#endif } /* @@ -350,7 +377,11 @@ void smp_stop_cpu(void) */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#ifndef CONFIG_XEN disable_local_APIC(); +#else + xxprint("stop_this_cpu disable_local_APIC\n"); +#endif local_irq_enable(); } @@ -364,8 +395,10 @@ static void smp_really_stop_cpu(void *du void smp_send_stop(void) { int nolock = 0; +#ifndef CONFIG_XEN if (reboot_force) return; +#endif /* Don't deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { /* ignore locking because we have paniced anyways */ @@ -376,7 +409,11 @@ void smp_send_stop(void) spin_unlock(&call_lock); local_irq_disable(); +#ifdef CONFIG_XEN + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -385,18 +422,32 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ +#ifndef CONFIG_XEN asmlinkage void smp_reschedule_interrupt(void) +#else +asmlinkage irqreturn_t smp_reschedule_interrupt(void) +#endif { +#ifndef CONFIG_XEN ack_APIC_irq(); +#else + return IRQ_HANDLED; +#endif } +#ifndef CONFIG_XEN asmlinkage void smp_call_function_interrupt(void) +#else +asmlinkage irqreturn_t smp_call_function_interrupt(void) +#endif { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; +#ifndef CONFIG_XEN ack_APIC_irq(); +#endif /* * Notify initiating CPU that I've grabbed the data and am * about to execute the function @@ -413,10 +464,16 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } +#ifdef CONFIG_XEN + return IRQ_HANDLED; +#endif } int safe_smp_processor_id(void) { +#ifdef CONFIG_XEN + return smp_processor_id(); +#else int apicid, i; if (disable_apic) @@ -437,4 +494,5 @@ int safe_smp_processor_id(void) return 0; return 0; /* Should not happen */ +#endif } Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @@ -47,6 +47,9 @@ #include <linux/bootmem.h> #include <linux/thread_info.h> #include <linux/module.h> +#ifdef CONFIG_XEN +#include <linux/interrupt.h> +#endif #include <linux/delay.h> #include <linux/mc146818rtc.h> @@ -57,12 +60,21 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/nmi.h> +#ifdef CONFIG_XEN +#include <asm/arch_hooks.h> + +#include <asm-xen/evtchn.h> +#endif /* Change for real CPU hotplug. Note other files need to be fixed first too. 
*/ #define __cpuinit __init #define __cpuinitdata __initdata +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) + unsigned int maxcpus = NR_CPUS; +#endif + /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ @@ -96,6 +108,7 @@ cpumask_t cpu_sibling_map[NR_CPUS] __cac cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_core_map); +#ifndef CONFIG_XEN /* * Trampoline 80x86 program as an array. */ @@ -115,6 +128,7 @@ static unsigned long __cpuinit setup_tra memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } +#endif /* * The bootstrap kernel entry code has set these up. Save them for @@ -130,6 +144,7 @@ static void __cpuinit smp_store_cpu_info print_cpu_info(c); } +#ifndef CONFIG_XEN /* * New Funky TSC sync algorithm borrowed from IA64. * Main advantage is that it doesn't reset the TSCs fully and @@ -331,6 +346,7 @@ static __init int notscsync_setup(char * return 0; } __setup("notscsync", notscsync_setup); +#endif static atomic_t init_deasserted __cpuinitdata; @@ -343,6 +359,7 @@ void __cpuinit smp_callin(void) int cpuid, phys_id; unsigned long timeout; +#ifndef CONFIG_XEN /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -352,10 +369,15 @@ void __cpuinit smp_callin(void) while (!atomic_read(&init_deasserted)) cpu_relax(); +#endif /* * (This works even if the APIC is not enabled.) */ +#ifndef CONFIG_XEN phys_id = GET_APIC_ID(apic_read(APIC_ID)); +#else + phys_id = smp_processor_id(); +#endif cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", @@ -389,6 +411,7 @@ void __cpuinit smp_callin(void) cpuid); } +#ifndef CONFIG_XEN /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. 
We are free to set up this @@ -398,6 +421,7 @@ void __cpuinit smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); +#endif /* * Get our bogomips. @@ -405,7 +429,9 @@ void __cpuinit smp_callin(void) calibrate_delay(); Dprintk("Stack at about %p\n",&cpuid); +#ifndef CONFIG_XEN disable_APIC_timer(); +#endif /* * Save our processor parameters @@ -418,6 +444,29 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +#ifdef CONFIG_XEN +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + +extern void local_setup_timer(void); +#endif + /* * Setup code on secondary processor (after comming out of the trampoline) */ @@ -434,6 +483,7 @@ void __cpuinit start_secondary(void) /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); +#ifndef CONFIG_XEN Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -446,6 +496,12 @@ void __cpuinit start_secondary(void) } enable_APIC_timer(); +#else + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); +#endif /* * Allow the master to continue. @@ -453,10 +509,12 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); mb(); +#ifndef CONFIG_XEN /* Wait for TSC sync to not schedule things before. We still process interrupts, which could see an inconsistent time in that window unfortunately. 
*/ + tsc_sync_wait(); +#endif cpu_idle(); } @@ -464,6 +522,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); +#ifndef CONFIG_XEN #if APIC_DEBUG static void inquire_remote_apic(int apicid) { @@ -627,6 +686,7 @@ static int __cpuinit wakeup_secondary_vi return (send_status | accept_status); } +#endif /* * Boot one CPU. @@ -637,6 +697,14 @@ static int __cpuinit do_boot_cpu(int cpu unsigned long boot_error; int timeout; unsigned long start_rip; +#ifdef CONFIG_XEN + vcpu_guest_context_t ctxt; + extern void startup_64_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; +#endif /* * We can't use kernel_thread since we must avoid to * reschedule the child. @@ -649,7 +717,11 @@ static int __cpuinit do_boot_cpu(int cpu cpu_pda[cpu].pcurrent = idle; +#ifndef CONFIG_XEN start_rip = setup_trampoline(); +#else + start_rip = (unsigned long)startup_64_smp; +#endif init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; @@ -666,6 +738,93 @@ static int __cpuinit do_boot_cpu(int cpu atomic_set(&init_deasserted, 0); +#ifdef CONFIG_XEN + if (cpu_gdt_descr[0].size > PAGE_SIZE) + BUG(); + cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + memcpy((void *)cpu_gdt_descr[cpu].address, + (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS|0x3; + ctxt.user_regs.cs = __KERNEL_CS|0x3; + ctxt.user_regs.rip = start_rip; + ctxt.user_regs.rsp = idle->thread.rsp; +#define X86_EFLAGS_IOPL_RING3 0x3000 + ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3; + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. 
*/ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. */ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + ctxt.gdt_ents = GDT_ENTRIES; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp; + + /* Callback handlers. */ + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(init_level4_pgt); + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + + if (!boot_error) { + /* + * allow APs to start initializing. + */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; +#else Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); @@ -729,6 +888,7 @@ static int __cpuinit do_boot_cpu(int cpu #endif } } +#endif if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ @@ -790,6 +950,7 @@ static __cpuinit void detect_siblings(vo } } +#ifndef CONFIG_XEN /* * Cleanup possible dangling ends... 
*/ @@ -817,6 +978,7 @@ static __cpuinit void smp_cleanup_boot(v free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); #endif } +#endif /* * Fall back to non SMP mode after errors. @@ -827,10 +989,12 @@ static __cpuinit void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); +#ifndef CONFIG_XEN if (smp_found_config) phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); else phys_cpu_present_map = physid_mask_of_physid(0); +#endif cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); } @@ -857,6 +1021,7 @@ static __cpuinit void enforce_max_cpus(u */ static int __cpuinit smp_sanity_check(unsigned max_cpus) { +#ifndef CONFIG_XEN if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); @@ -896,13 +1061,19 @@ static int __cpuinit smp_sanity_check(un nr_ioapics = 0; return -1; } +#endif /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { +#ifdef CONFIG_XEN + HYPERVISOR_shared_info->n_vcpu = 1; +#endif printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); +#ifndef CONFIG_XEN nr_ioapics = 0; +#endif return -1; } @@ -917,7 +1088,10 @@ void __cpuinit smp_prepare_cpus(unsigned { int i; +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) +#else nmi_watchdog_default(); +#endif current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ @@ -927,8 +1101,12 @@ void __cpuinit smp_prepare_cpus(unsigned * Fill in cpu_present_mask */ for (i = 0; i < NR_CPUS; i++) { +#ifndef CONFIG_XEN int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { +#else + if (i < HYPERVISOR_shared_info->n_vcpu) { +#endif cpu_set(i, cpu_present_map); /* possible map would be different if we supported real CPU hotplug. 
*/ @@ -942,6 +1120,9 @@ void __cpuinit smp_prepare_cpus(unsigned return; } +#ifdef CONFIG_XEN + smp_intr_init(); +#else /* * Switch from PIC to APIC mode. @@ -954,20 +1135,26 @@ void __cpuinit smp_prepare_cpus(unsigned GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); /* Or can we switch back to PIC here? */ } +#endif /* * Now start the IO-APICs */ +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) +#else if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); else nr_ioapics = 0; +#endif /* * Set up local APIC timer on boot CPU. */ +#ifndef CONFIG_XEN setup_boot_APIC_clock(); +#endif } /* @@ -989,17 +1176,23 @@ void __init smp_prepare_boot_cpu(void) int __cpuinit __cpu_up(unsigned int cpu) { int err; +#ifndef CONFIG_XEN int apicid = cpu_present_to_apicid(cpu); +#else + int apicid = cpu; +#endif WARN_ON(irqs_disabled()); Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); +#ifndef CONFIG_XEN if (apicid == BAD_APICID || apicid == boot_cpu_id || !physid_isset(apicid, phys_cpu_present_map)) { printk("__cpu_up: bad cpu %d\n", cpu); return -EINVAL; } +#endif /* Boot it! 
*/ err = do_boot_cpu(cpu, apicid); @@ -1021,15 +1214,76 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __cpuinit smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_XEN zap_low_mappings(); smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif +#endif detect_siblings(); +#ifndef CONFIG_XEN time_init_gtod(); check_nmi_watchdog(); +#endif } + +#ifdef CONFIG_XEN +extern int bind_ipi_to_irq(int ipi); +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +void smp_intr_init(void) +{ + int cpu = smp_processor_id(); + + per_cpu(resched_irq, cpu) = + bind_ipi_to_irq(RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) = + bind_ipi_to_irq(CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); +} + +static void smp_intr_exit(void) +{ + int cpu = smp_processor_id(); + + free_irq(per_cpu(resched_irq, cpu), NULL); + unbind_ipi_from_irq(RESCHEDULE_VECTOR); + + free_irq(per_cpu(callfunc_irq, cpu), NULL); + unbind_ipi_from_irq(CALL_FUNCTION_VECTOR); +} + +extern void local_setup_timer_irq(void); +extern void local_teardown_timer_irq(void); + +void smp_suspend(void) +{ + /* XXX todo: take down time and ipi's on all cpus */ + local_teardown_timer_irq(); + smp_intr_exit(); +} + +void smp_resume(void) +{ + /* XXX todo: restore time and ipi's on all cpus */ + smp_intr_init(); + local_setup_timer_irq(); +} +#endif Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c 
==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c @@ -953,6 +953,17 @@ void __init trap_init(void) cpu_init(); } +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + /* Actual parsing is done early in setup.c. */ static int __init oops_dummy(char *s) Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S ==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S @@ -8,11 +8,14 @@ #define sizeof_vcpu_shift 3 #ifdef CONFIG_SMP -#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) #define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ movq %gs:pda_cpunumber,reg ; \ - shl $sizeof_vcpu_shift,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ addq HYPERVISOR_shared_info,reg #define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ #define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic.c ==================================================================--- /dev/null +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic.c @@ -0,0 +1,123 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. 
+ * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/ipi.h> + +#if defined(CONFIG_ACPI_BUS) +#include <acpi/acpi_bus.h> +#endif + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +extern struct genapic apic_cluster; +extern struct genapic apic_flat; + +#ifndef CONFIG_XEN +struct genapic *genapic = &apic_flat; +#else +extern struct genapic apic_xen; +struct genapic *genapic = &apic_xen; +#endif + + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init clustered_apic_check(void) +{ +#ifndef CONFIG_XEN + long i; + u8 clusters, max_cluster; + u8 id; + u8 cluster_cnt[NUM_APIC_CLUSTERS]; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + /* AMD always uses flat mode right now */ + genapic = &apic_flat; + goto print; + } + +#if defined(CONFIG_ACPI_BUS) + /* + * Some x86_64 machines use physical APIC mode regardless of how many + * procs/clusters are present (x86_64 ES7000 is an example). 
+ */ + if (acpi_fadt.revision > FADT2_REVISION_ID) + if (acpi_fadt.force_apic_physical_destination_mode) { + genapic = &apic_cluster; + goto print; + } +#endif + + memset(cluster_cnt, 0, sizeof(cluster_cnt)); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + cluster_cnt[APIC_CLUSTERID(id)]++; + } + + clusters = 0; + max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (cluster_cnt[i] > 0) { + ++clusters; + if (cluster_cnt[i] > max_cluster) + max_cluster = cluster_cnt[i]; + } + } + + /* + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical + * else physical mode. + * (We don't use lowest priority delivery + HW APIC IRQ steering, so + * can ignore the clustered logical case and go straight to physical.) + */ + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + genapic = &apic_flat; + else + genapic = &apic_cluster; + +print: +#else + /* hardcode to xen apic functions */ + genapic = &apic_xen; +#endif + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +} + +/* Same for both flat and clustered. */ + +#ifdef CONFIG_XEN +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); +#endif + +void send_IPI_self(int vector) +{ +#ifndef CONFIG_XEN + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +#else + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +#endif +} Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic_xen.c ==================================================================--- /dev/null +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic_xen.c @@ -0,0 +1,167 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery. 
+ * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + * + * Hacked to pieces for Xen by Chris Wright. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#include <asm/smp.h> +#include <asm/ipi.h> +#else +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/genapic.h> +#endif +#include <asm-xen/evtchn.h> + +DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]); + +static inline void __send_IPI_one(unsigned int cpu, int vector) +{ + unsigned int evtchn; + Dprintk("%s\n", __FUNCTION__); + + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + if (evtchn) + notify_via_evtchn(evtchn); + else + printk("send_IPI to unbound port %d/%d", cpu, vector); +} + +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + int cpu; + + switch (shortcut) { + case APIC_DEST_SELF: + __send_IPI_one(smp_processor_id(), vector); + break; + case APIC_DEST_ALLBUT: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + case APIC_DEST_ALLINC: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + default: + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, + vector); + break; + } +} + +static cpumask_t xen_target_cpus(void) +{ + return cpu_online_map; +} + +/* + * Set up the logical destination ID. + * Do nothing, not called now. + */ +static void xen_init_apic_ldr(void) +{ + Dprintk("%s\n", __FUNCTION__); + return; +} + +static void xen_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. 
+ * thus we have to avoid sending IPIs in this case. + */ + Dprintk("%s\n", __FUNCTION__); + if (num_online_cpus() > 1) + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static void xen_send_IPI_all(int vector) +{ + Dprintk("%s\n", __FUNCTION__); + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void xen_send_IPI_mask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned int cpu; + unsigned long flags; + + Dprintk("%s\n", __FUNCTION__); + local_irq_save(flags); + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, cpumask)) { + __send_IPI_one(cpu, vector); + } + } + local_irq_restore(flags); +} + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static int xen_apic_id_registered(void) +{ + /* better be set */ + Dprintk("%s\n", __FUNCTION__); + return physid_isset(smp_processor_id(), phys_cpu_present_map); +} +#endif + +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask) +{ + Dprintk("%s\n", __FUNCTION__); + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + u32 ebx; + + Dprintk("%s\n", __FUNCTION__); + ebx = cpuid_ebx(1); + return ((ebx >> 24) & 0xFF) >> index_msb; +} + +struct genapic apic_xen = { + .name = "xen", +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + .int_delivery_mode = dest_LowestPrio, +#endif + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, + .target_cpus = xen_target_cpus, +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + .apic_id_registered = xen_apic_id_registered, +#endif + .init_apic_ldr = xen_init_apic_ldr, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_mask = xen_send_IPI_mask, + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; Index: xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig 
==================================================================--- xen-unstable.orig/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig +++ xen-unstable/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig @@ -126,6 +126,10 @@ config X86_IO_APIC bool default XEN_PRIVILEGED_GUEST +config X86_XEN_GENAPIC + bool + default XEN_PRIVILEGED_GUEST || SMP + config X86_LOCAL_APIC bool default XEN_PRIVILEGED_GUEST _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel