Provide CPU hotplug support for Xen. Note that this hotplug support is specific to PM (power management); it is not a general run-time single-CPU hotplug, which can be handled as a separate task. See embedded comment: /* * XXX: One important thing missed here is to migrate vcpus * from dead cpu to other online ones and then put whole * system into a stop state. It assures a safe environment * for a cpu hotplug/remove at normal running state. * * However for xen PM case, at this point: * -> All other domains should be notified with PM event, * and then in following states: * * Suspend state, or * * Paused state, which is a force step to all * domains if they do nothing to suspend * -> All vcpus of dom0 (except vcpu0) have already been * hot removed * with the net effect that all other cpus only have idle vcpu * running. In this special case, we can avoid vcpu migration * then and system can be considered in a stop state. * * So current cpu hotplug is a special version for PM specific * usage, and need more effort later for full cpu hotplug. * (ktian1) */ Signed-off-by: Kevin Tian <kevin.tian@intel.com> diff -r fe69f7fd1639 xen/arch/x86/cpu/common.c --- a/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:52 2007 -0400 @@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void) { int cpu = raw_smp_processor_id(); cpu_clear(cpu, cpu_initialized); - - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} -#endif +} +#endif diff -r fe69f7fd1639 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/domain.c Tue Apr 10 13:41:52 2007 -0400 @@ -76,6 +76,7 @@ static void default_idle(void) /* We don't actually take CPU down, just spin without interrupts. 
*/ static inline void play_dead(void) { + __cpu_disable(); /* This must be done before dead CPU ack */ cpu_exit_clear(); wbinvd(); @@ -101,6 +102,8 @@ void idle_loop(void) { for ( ; ; ) { + if (cpu_is_offline(smp_processor_id())) + play_dead(); page_scrub_schedule_work(); default_idle(); do_softirq(); diff -r fe69f7fd1639 xen/arch/x86/i8259.c --- a/xen/arch/x86/i8259.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/i8259.c Tue Apr 10 13:41:52 2007 -0400 @@ -396,6 +396,7 @@ void __init init_IRQ(void) irq_desc[i].action = NULL; irq_desc[i].depth = 1; spin_lock_init(&irq_desc[i].lock); + cpus_setall(irq_desc[i].affinity); set_intr_gate(i, interrupt[i]); } diff -r fe69f7fd1639 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/io_apic.c Tue Apr 10 13:41:52 2007 -0400 @@ -34,9 +34,6 @@ #include <asm/desc.h> #include <mach_apic.h> #include <io_ports.h> - -#define set_irq_info(irq, mask) ((void)0) -#define set_native_irq_info(irq, mask) ((void)0) /* Different to Linux: our implementation can be simpler. 
*/ #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq))) diff -r fe69f7fd1639 xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/irq.c Tue Apr 10 13:41:52 2007 -0400 @@ -656,7 +656,8 @@ __initcall(setup_dump_irqs); __initcall(setup_dump_irqs); #ifdef CONFIG_HOTPLUG_CPU -#include <mach_apic.h> +#include <asm/mach-generic/mach_apic.h> +#include <xen/delay.h> void fixup_irqs(cpumask_t map) { @@ -673,8 +674,8 @@ void fixup_irqs(cpumask_t map) printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); else if (irq_desc[irq].action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff -r fe69f7fd1639 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/smpboot.c Tue Apr 10 14:30:27 2007 -0400 @@ -109,6 +109,9 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mos { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); +static void *stack_base[NR_CPUS] __cacheline_aligned; +spinlock_t cpu_add_remove_lock; + /* * Trampoline 80x86 program as an array. */ @@ -121,7 +124,7 @@ static void map_cpu_to_logical_apicid(vo static void map_cpu_to_logical_apicid(void); /* State of each CPU. */ -/*DEFINE_PER_CPU(int, cpu_state) = { 0 };*/ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; /* * Currently trivial. 
Write the real->protected mode @@ -439,9 +442,11 @@ void __devinit smp_callin(void) /* * Synchronize the TSC with the BP */ - if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) + if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) { synchronize_tsc_ap(); - calibrate_tsc_ap(); + /* No sync for same reason as above */ + calibrate_tsc_ap(); + } } static int cpucount, booting_cpu; @@ -508,8 +513,12 @@ static void construct_percpu_idt(unsigne { unsigned char idt_load[10]; - idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); - memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t)); + /* If IDT table exists since last hotplug, reuse it */ + if (!idt_tables[cpu]) { + idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); + memcpy(idt_tables[cpu], idt_table, + IDT_ENTRIES*sizeof(idt_entry_t)); + } *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu]; @@ -571,15 +580,15 @@ void __devinit start_secondary(void *unu * lock helps us to not include this cpu in a currently in progress * smp_call_function(). */ - /*lock_ipi_call_lock();*/ + lock_ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); - /*unlock_ipi_call_lock();*/ - /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/ + unlock_ipi_call_lock(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + + init_percpu_time(); /* We can take interrupts now: we're officially "up". 
*/ local_irq_enable(); - - init_percpu_time(); wmb(); startup_cpu_idle_loop(); @@ -877,6 +886,22 @@ static inline int alloc_cpu_id(void) return cpu; } +static struct vcpu *prepare_idle_vcpu(unsigned int cpu) +{ + if (idle_vcpu[cpu]) + return idle_vcpu[cpu]; + + return alloc_idle_vcpu(cpu); +} + +static void *prepare_idle_stack(unsigned int cpu) +{ + if (!stack_base[cpu]) + stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER); + + return stack_base[cpu]; +} + static int __devinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -894,7 +919,7 @@ static int __devinit do_boot_cpu(int api booting_cpu = cpu; - v = alloc_idle_vcpu(cpu); + v = prepare_idle_vcpu(cpu); BUG_ON(v == NULL); /* start_eip had better be page-aligned! */ @@ -903,7 +928,7 @@ static int __devinit do_boot_cpu(int api /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); - stack_start.esp = alloc_xenheap_pages(STACK_ORDER); + stack_start.esp = prepare_idle_stack(cpu); /* Debug build: detect stack overflow by setting up a guard page. 
*/ memguard_guard_stack(stack_start.esp); @@ -980,6 +1005,12 @@ static int __devinit do_boot_cpu(int api } #ifdef CONFIG_HOTPLUG_CPU +static void idle_task_exit(void) +{ + /* Give up lazy state borrowed by this idle vcpu */ + __sync_lazy_execstate(); +} + void cpu_exit_clear(void) { int cpu = raw_smp_processor_id(); @@ -988,7 +1019,6 @@ void cpu_exit_clear(void) cpucount --; cpu_uninit(); - irq_ctx_exit(cpu); cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); @@ -997,26 +1027,9 @@ void cpu_exit_clear(void) unmap_cpu_to_logical_apicid(cpu); } -struct warm_boot_cpu_info { - struct completion *complete; - int apicid; - int cpu; -}; - -static void __cpuinit do_warm_boot_cpu(void *p) -{ - struct warm_boot_cpu_info *info = p; - do_boot_cpu(info->apicid, info->cpu); - complete(info->complete); -} - static int __cpuinit __smp_prepare_cpu(int cpu) { - DECLARE_COMPLETION(done); - struct warm_boot_cpu_info info; - struct work_struct task; int apicid, ret; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -1024,34 +1037,19 @@ static int __cpuinit __smp_prepare_cpu(i goto exit; } - /* - * the CPU isn't initialized at boot time, allocate gdt table here. 
- * cpu_init will initialize it - */ - if (!cpu_gdt_descr->address) { - cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); - if (!cpu_gdt_descr->address) - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - ret = -ENOMEM; - goto exit; - } - - info.complete = &done; - info.apicid = apicid; - info.cpu = cpu; - INIT_WORK(&task, do_warm_boot_cpu, &info); - tsc_sync_disabled = 1; /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - KERNEL_PGD_PTRS); - flush_tlb_all(); - schedule_work(&task); - wait_for_completion(&done); + init_low_mappings(); + + do_boot_cpu(apicid, cpu); tsc_sync_disabled = 0; +#ifdef CONFIG_X86_64 zap_low_mappings(); +#else + zap_low_mappings(idle_pg_table_l2); +#endif ret = 0; exit: return ret; @@ -1084,6 +1082,8 @@ static void __init smp_boot_cpus(unsigne boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; + + stack_base[0] = stack_start.esp; /*current_thread_info()->cpu = 0;*/ /*smp_tune_scheduling();*/ @@ -1255,7 +1255,8 @@ void __devinit smp_prepare_boot_cpu(void cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_present_map); cpu_set(smp_processor_id(), cpu_possible_map); - /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/ + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + spin_lock_init(&cpu_add_remove_lock); } #ifdef CONFIG_HOTPLUG_CPU @@ -1278,11 +1279,12 @@ remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - c[cpu].phys_proc_id = 0; - c[cpu].cpu_core_id = 0; + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; cpu_clear(cpu, cpu_sibling_setup_map); } +extern void fixup_irqs(cpumask_t map); int __cpu_disable(void) { cpumask_t map = cpu_online_map; @@ -1299,12 +1301,15 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; + local_irq_disable(); clear_local_APIC(); /* Allow any queued timer 
interrupts to get serviced */ local_irq_enable(); mdelay(1); local_irq_disable(); + destroy_percpu_time(); + remove_siblinginfo(cpu); cpu_clear(cpu, map); @@ -1323,13 +1328,89 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); - if (1 == num_online_cpus()) - alternatives_smp_switch(0); return; } - msleep(100); + mdelay(100); + mb(); + process_pending_timers(); } printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +/* + * XXX: One important thing missed here is to migrate vcpus + * from dead cpu to other online ones and then put whole + * system into a stop state. It assures a safe environment + * for a cpu hotplug/remove at normal running state. + * + * However for xen PM case, at this point: + * -> All other domains should be notified with PM event, + * and then in following states: + * * Suspend state, or + * * Paused state, which is a force step to all + * domains if they do nothing to suspend + * -> All vcpus of dom0 (except vcpu0) have already been + * hot removed + * with the net effect that all other cpus only have idle vcpu + * running. In this special case, we can avoid vcpu migration + * then and system can be considered in a stop state. + * + * So current cpu hotplug is a special version for PM specific + * usage, and need more effort later for full cpu hotplug. 
+ * (ktian1) + */ +int cpu_down(unsigned int cpu) +{ + int err = 0; + cpumask_t mask; + + spin_lock(&cpu_add_remove_lock); + if (num_online_cpus() == 1) { + err = -EBUSY; + goto out; + } + + if (!cpu_online(cpu)) { + err = -EINVAL; + goto out; + } + + printk("Prepare to bring CPU%d down...\n", cpu); + /* Send notification to remote idle vcpu */ + cpus_clear(mask); + cpu_set(cpu, mask); + per_cpu(cpu_state, cpu) = CPU_DYING; + smp_send_event_check_mask(mask); + + __cpu_die(cpu); + + if (cpu_online(cpu)) { + printk("Bad state (DEAD, but in online map) on CPU%d\n", cpu); + err = -EBUSY; + } +out: + spin_unlock(&cpu_add_remove_lock); + return err; +} + +int cpu_up(unsigned int cpu) +{ + int err = 0; + + spin_lock(&cpu_add_remove_lock); + if (cpu_online(cpu)) { + printk("Bring up a online cpu. Bogus!\n"); + err = -EBUSY; + goto out; + } + + err = __cpu_up(cpu); + if (err < 0) + goto out; + +out: + spin_unlock(&cpu_add_remove_lock); + return err; } /* From kernel/power/main.c */ @@ -1390,6 +1471,22 @@ void __cpu_die(unsigned int cpu) int __devinit __cpu_up(unsigned int cpu) { +#ifdef CONFIG_HOTPLUG_CPU + int ret=0; + + /* + * We do warm boot only on cpus that had booted earlier + * Otherwise cold boot is all handled from smp_boot_cpus(). + * cpu_callin_map is set during AP kickstart process. Its reset + * when a cpu is taken offline from cpu_exit_clear(). 
+ */ + if (!cpu_isset(cpu, cpu_callin_map)) + ret = __smp_prepare_cpu(cpu); + + if (ret) + return -EIO; +#endif + /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); diff -r fe69f7fd1639 xen/arch/x86/time.c --- a/xen/arch/x86/time.c Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/arch/x86/time.c Tue Apr 10 14:33:20 2007 -0400 @@ -963,6 +963,12 @@ void init_percpu_time(void) set_timer(&t->calibration_timer, NOW() + EPOCH); } +/* Normally all pending timers are fired once APIC interrupt is + * active again, and thus no need to kill them when cpu is down. + * (Migrate may be required for pure cpu-hotplug). However + * calibration timer is a bit special, and re-initialization is + * required after cpu is up. + */ void destroy_percpu_time(void) { kill_timer(&this_cpu(cpu_time).calibration_timer); diff -r fe69f7fd1639 xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/include/asm-x86/config.h Tue Apr 10 13:41:52 2007 -0400 @@ -37,6 +37,8 @@ #define CONFIG_ACPI_BOOT 1 #define CONFIG_VGA 1 + +#define CONFIG_HOTPLUG_CPU 1 #define HZ 100 diff -r fe69f7fd1639 xen/include/asm-x86/smp.h --- a/xen/include/asm-x86/smp.h Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/include/asm-x86/smp.h Tue Apr 10 13:41:52 2007 -0400 @@ -46,14 +46,30 @@ extern void zap_low_mappings(l2_pgentry_ #endif extern void init_low_mappings(void); + +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); #define MAX_APICID 256 extern u8 x86_cpu_to_apicid[]; #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] +/* State of each CPU. 
*/ +#define CPU_ONLINE 0x0002 /* CPU is up */ +#define CPU_DYING 0x0003 /* CPU is requested to die */ +#define CPU_DEAD 0x0004 /* CPU is dead */ +DECLARE_PER_CPU(int, cpu_state); + #ifdef CONFIG_HOTPLUG_CPU +#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) == CPU_DYING) +extern int cpu_down(unsigned int cpu); +extern int cpu_up(unsigned int cpu); extern void cpu_exit_clear(void); extern void cpu_uninit(void); +extern void disable_nonboot_cpus(void); +extern void enable_nonboot_cpus(void); +#else +static inline int cpu_is_offline(int cpu) {return 0;} #endif /* diff -r fe69f7fd1639 xen/include/asm-x86/system.h --- a/xen/include/asm-x86/system.h Tue Apr 10 13:41:45 2007 -0400 +++ b/xen/include/asm-x86/system.h Tue Apr 10 13:41:52 2007 -0400 @@ -313,6 +313,8 @@ static always_inline unsigned long long #define __sti() __asm__ __volatile__("sti": : :"memory") /* used in the idle loop; sti takes one instruction cycle to complete */ #define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") +/* used when interrupts are already enabled or to shutdown the processor */ +#define halt() __asm__ __volatile__("hlt": : :"memory") /* For spinlocks etc */ #if defined(__i386__) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel