Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 02/10] Clean up duplicate includes in arch/i386/xen/
This patch cleans up duplicate includes in arch/i386/xen/.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>

---
 arch/i386/xen/enlighten.c |    1 -
 arch/i386/xen/mmu.c       |    2 --
 2 files changed, 3 deletions(-)

==================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -25,7 +25,6 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/highmem.h>
-#include <linux/smp.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index 4ae038a..35691a2 100644
==================================================================
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -41,7 +41,6 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
-#include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
--
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 06/10] xen: lock pte pages while pinning/unpinning
When a pagetable is created, it is made globally visible in the rmap prio tree
before it is pinned via arch_dup_mmap(), and it remains in the rmap tree while
it is unpinned with arch_exit_mmap().

This means that other CPUs may race with the pinning/unpinning process and see
a pte between the point where it is marked RO and the point where it is
actually pinned, causing any pte updates to fail with write-protect faults.

As a result, all pte pages must be properly locked, and only unlocked once the
pinning/unpinning process has finished.

In order to avoid taking spinlocks for the whole pagetable - which may
overflow the PREEMPT_BITS portion of the preempt counter - the code locks and
pins each pte page individually, and then finally pins the whole pagetable.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Keir Fraser <keir@xensource.com>
Cc: Jan Beulich <jbeulich@novell.com>

---
 arch/i386/xen/enlighten.c |   30 ++++++++---
 arch/i386/xen/mmu.c       |  113 ++++++++++++++++++++++++++++++++-------------
 mm/Kconfig                |    1 
 3 files changed, 103 insertions(+), 41 deletions(-)

==================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -687,6 +687,15 @@ static __init void xen_alloc_pt_init(str
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -696,9 +705,10 @@ static void xen_alloc_pt(struct mm_struc
 
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -711,8 +721,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -827,15 +839,15 @@ static __init void xen_pagetable_setup_d
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 
==================================================================
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int
 		pud = pud_offset(pgd, 0);
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, u
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
+
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, u
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now.
 */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page,
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
@@ -585,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }
 
==================================================================
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"
 
 #
--
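The locking scheme in this patch is easiest to see in isolation. Below is a minimal user-space sketch of the idea - lock each pte page, queue its pin, and defer the unlock until the whole batch has completed - using a plain mutex and a toy callback list. All names here (pte_page, batch_*) are illustrative, not the kernel's:

/* Simplified model of per-pte-page locking during pinning: each "pte
 * page" is locked, marked pinned (standing in for the RO remap plus
 * MMUEXT_PIN_L1_TABLE multicall), and its unlock is deferred until the
 * whole batch has been flushed. */
#include <pthread.h>
#include <stdio.h>

#define BATCH_MAX 32

struct pte_page {
	pthread_mutex_t lock;
	int pinned;
};

struct callback {
	void (*fn)(void *);
	void *data;
};

static struct callback callbacks[BATCH_MAX];
static int ncallbacks;

/* Queue a callback to run once the current batch has been issued. */
static void batch_callback(void (*fn)(void *), void *data)
{
	callbacks[ncallbacks].fn = fn;
	callbacks[ncallbacks].data = data;
	ncallbacks++;
}

static void do_unlock(void *v)
{
	pthread_mutex_unlock(v);
}

/* Stand-in for issuing the queued hypercalls, then running callbacks. */
static void batch_flush(void)
{
	for (int i = 0; i < ncallbacks; i++)
		callbacks[i].fn(callbacks[i].data);
	ncallbacks = 0;
}

/* Lock the pte page, mark it pinned, and defer the unlock to flush
 * time, so no other thread can touch the page between "made RO" and
 * "actually pinned". */
static void pin_pte_page(struct pte_page *p)
{
	pthread_mutex_lock(&p->lock);
	p->pinned = 1;
	batch_callback(do_unlock, &p->lock);
}

int main(void)
{
	struct pte_page pages[4];

	for (int i = 0; i < 4; i++) {
		pthread_mutex_init(&pages[i].lock, NULL);
		pages[i].pinned = 0;
	}

	for (int i = 0; i < 4; i++)
		pin_pte_page(&pages[i]);	/* all four stay locked... */

	batch_flush();				/* ...until the batch completes */

	printf("all pages pinned and unlocked\n");
	return 0;
}

The deferred unlock is what enforces the ordering the changelog describes: the lock is only dropped once the batched operations have actually been issued, so no updater can slip in while a pte page is read-only but not yet pinned.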
This is my current set of updates to Xen for 2.6.24. This is largely a bugfix
set, and a couple of the patches are also relevant to 2.6.23. These are in the
pre-x86-merge form; I'll update them once the merge goes into git.

Quick overview:
 - remove some dead code in arch/i386/mm/init.c
 - clean up some duplicate includes
 - when sending an IPI, yield the vcpu if the destination doesn't have
   a real cpu to run on
 - fix a bug where a lazy cr3 reload can prevent a pagetable from being
   unpinned
 - deal with split pte locks properly
 - fix an out-of-date structure used for the register_vcpu_info hypercall
 - add some multicall debugging help
 - work around a bad XFS/Xen interaction

Thanks,
	J
--
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 05/10] xen: deal with stale cr3 values when unpinning pagetables
When a pagetable is no longer in use, it must be unpinned so that its pages
can be freed.

However, this is only possible if there are no stray uses of the pagetable.
The code currently deals with all the usual cases, but there's a rare case
where a vcpu is changing cr3 lazily, and the change hasn't actually taken
effect by the time the pagetable is unpinned, even though it appears to have
been completed.

This change adds a second per-cpu cr3 variable - xen_current_cr3 - which
tracks the actual state of the vcpu cr3. It is only updated once the
hypercall to set cr3 has actually completed. Other processors wishing to
unpin a pagetable can check the other vcpus' xen_current_cr3 values to see
whether any cross-cpu IPIs are needed to clean things up.

[ Stable folks: 2.6.23 bugfix ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Stable Kernel <stable@kernel.org>

---
 arch/i386/xen/enlighten.c |   63 ++++++++++++++++++++++++++++++---------------
 arch/i386/xen/mmu.c       |   33 ++++++++++++++++++++---
 arch/i386/xen/xen-ops.h   |    1 
 3 files changed, 71 insertions(+), 26 deletions(-)

==================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -55,7 +55,23 @@ DEFINE_PER_CPU(enum paravirt_lazy_mode,
 
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);		/* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */
 
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
@@ -631,32 +647,36 @@ static unsigned long xen_read_cr3(void)
 	return x86_read_percpu(xen_cr3);
 }
 
+static void set_current_cr3(void *v)
+{
+	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+}
+
 static void xen_write_cr3(unsigned long cr3)
 {
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+	unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
 	BUG_ON(preemptible());
 
-	if (cr3 == x86_read_percpu(xen_cr3)) {
-		/* just a simple tlb flush */
-		xen_flush_tlb();
-		return;
-	}
-
+	mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
 	x86_write_percpu(xen_cr3, cr3);
-
-	{
-		struct mmuext_op *op;
-		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
-		op = mcs.args;
-		op->cmd = MMUEXT_NEW_BASEPTR;
-		op->arg1.mfn = mfn;
-
-		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-		xen_mc_issue(PARAVIRT_LAZY_CPU);
-	}
+
+	op = mcs.args;
+	op->cmd = MMUEXT_NEW_BASEPTR;
+	op->arg1.mfn = mfn;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	/* Update xen_current_cr3 once the batch has actually
+	   been submitted. */
+	xen_mc_callback(set_current_cr3, (void *)cr3);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
 /* Early in boot, while setting up the initial pagetable, assume
@@ -1124,6 +1144,7 @@ asmlinkage void __init xen_start_kernel(
 	/* keep using Xen gdt for now; no urgent need to change it */
 
 	x86_write_percpu(xen_cr3, __pa(pgd));
+	x86_write_percpu(xen_current_cr3, __pa(pgd));
 
 #ifdef CONFIG_SMP
 	/* Don't do the full vcpu_info placement stuff until we have a
==================================================================
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -564,20 +564,43 @@ static void drop_other_mm_ref(void *info
 
 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }
 
 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
-	}
-
-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+		arch_flush_lazy_cpu_mode();
+	}
+
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
==================================================================
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info
 
 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
 extern struct start_info *xen_start_info;
 extern struct shared_info *HYPERVISOR_shared_info;
--
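To make the cr3-shadowing idea concrete, here is a small user-space model of it; the variable and function names are illustrative, not the kernel's. The logical value changes as soon as a switch is requested, the actual value only from the batch-completion callback, and the unpin path trusts only the actual value when deciding which CPUs still need an IPI:

/* Model of the two cr3 shadows: logical_cr3 is updated as soon as a
 * switch is requested; actual_cr3 only once the deferred "hypercall"
 * batch has really been issued. */
#include <stdio.h>

#define NCPUS 4

static unsigned long logical_cr3[NCPUS];  /* like xen_cr3 */
static unsigned long actual_cr3[NCPUS];   /* like xen_current_cr3 */

/* Batch-completion callback: only now is the new base really loaded. */
static void set_actual_cr3(int cpu, unsigned long cr3)
{
	actual_cr3[cpu] = cr3;
}

/* Lazy cr3 switch: the logical value changes immediately; the actual
 * value only changes when the deferred batch is flushed. */
static void write_cr3_lazy(int cpu, unsigned long cr3, int flush_now)
{
	logical_cr3[cpu] = cr3;
	if (flush_now)
		set_actual_cr3(cpu, cr3);	/* batch flushed immediately */
	/* else: the callback runs later; actual_cr3 keeps the old base */
}

/* Before unpinning 'pgd', find CPUs whose *actual* cr3 still points at
 * it; those are the ones that need a cross-CPU flush (IPI). */
static void check_before_unpin(unsigned long pgd)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (actual_cr3[cpu] == pgd)
			printf("cpu%d still references %#lx, send IPI\n",
			       cpu, pgd);
}

int main(void)
{
	unsigned long old_pgd = 0x1000, new_pgd = 0x2000;

	write_cr3_lazy(1, old_pgd, 1);	/* cpu1 runs on old_pgd */
	write_cr3_lazy(1, new_pgd, 0);	/* switch requested, but deferred */

	check_before_unpin(old_pgd);	/* cpu1 is caught via actual_cr3 */
	return 0;
}

If the unpinner looked only at the logical value, cpu1 above would appear to have moved off old_pgd already, which is exactly the race the patch closes.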
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 03/10] xen: yield to IPI target if necessary
When sending a call-function IPI to a vcpu, yield if the vcpu isn't running.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>

---
 arch/i386/xen/smp.c     |   14 ++++++++++----
 arch/i386/xen/time.c    |    6 ++++++
 arch/i386/xen/xen-ops.h |    2 ++
 3 files changed, 18 insertions(+), 4 deletions(-)

==================================================================
--- a/arch/i386/xen/smp.c
+++ b/arch/i386/xen/smp.c
@@ -360,7 +360,8 @@ int xen_smp_call_function_mask(cpumask_t
 			       void *info, int wait)
 {
 	struct call_data_struct data;
-	int cpus;
+	int cpus, cpu;
+	bool yield;
 
 	/* Holding any lock stops cpus from going down. */
 	spin_lock(&call_lock);
@@ -389,9 +390,14 @@ int xen_smp_call_function_mask(cpumask_t
 	/* Send a message to other CPUs and wait for them to respond */
 	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
-	/* Make sure other vcpus get a chance to run.
-	   XXX too severe?  Maybe we should check the other CPU's states? */
-	HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+	/* Make sure other vcpus get a chance to run if they need to. */
+	yield = false;
+	for_each_cpu_mask(cpu, mask)
+		if (xen_vcpu_stolen(cpu))
+			yield = true;
+
+	if (yield)
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
 
 	/* Wait for response */
 	while (atomic_read(&data.started) != cpus ||
==================================================================
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -103,6 +103,12 @@ static void get_runstate_snapshot(struct
 		*res = *state;
 		barrier();
 	} while (get64(&state->state_entry_time) != state_time);
+}
+
+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
 }
 
 static void setup_runstate_info(int cpu)
==================================================================
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -27,6 +27,8 @@ int xen_set_wallclock(unsigned long time
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
 
+bool xen_vcpu_stolen(int vcpu);
+
 void xen_mark_init_mm_pinned(void);
 
 DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
--
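The yield heuristic is simple: only give up the physical CPU if at least one IPI target is runnable but not actually running ("stolen"). A tiny stand-alone model of that check follows; the runstate values and helpers are illustrative, not the Xen interface:

/* Model of "yield only if a target vcpu is stolen": a vcpu is stolen
 * when it is runnable but has no physical CPU to run on. */
#include <stdbool.h>
#include <stdio.h>

enum runstate { RUNNING, RUNNABLE, BLOCKED };

static enum runstate vcpu_state[4] = { RUNNING, RUNNABLE, BLOCKED, RUNNING };

static bool vcpu_stolen(int vcpu)
{
	return vcpu_state[vcpu] == RUNNABLE;
}

static void send_ipi_and_maybe_yield(const int *targets, int n)
{
	bool yield = false;

	/* ...the IPI to the target vcpus would be sent here... */

	for (int i = 0; i < n; i++)
		if (vcpu_stolen(targets[i]))
			yield = true;

	if (yield)
		printf("yield: give stolen vcpus a chance to run\n");
	else
		printf("no yield needed\n");
}

int main(void)
{
	int targets[] = { 1, 3 };

	send_ipi_and_maybe_yield(targets, 2);
	return 0;
}

Compared with the old unconditional yield, this only pays the scheduling cost when a destination vcpu genuinely cannot make progress on its own.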
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 04/10] xen: add batch completion callbacks
This adds a mechanism to register a callback function to be called once a
batch of hypercalls has been issued. This is typically used to unlock things
which must remain locked until the hypercall has taken place.

[ Stable folks: pre-req for 2.6.23 bugfix "xen: deal with stale cr3 values
  when unpinning pagetables" ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Stable Kernel <stable@kernel.org>

---
 arch/i386/xen/multicalls.c |   29 ++++++++++++++++++++++++++---
 arch/i386/xen/multicalls.h |    3 +++
 2 files changed, 29 insertions(+), 3 deletions(-)

==================================================================
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -32,7 +32,11 @@ struct mc_buffer {
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
 	u64 args[MC_ARGS];
-	unsigned mcidx, argidx;
+	struct callback {
+		void (*fn)(void *);
+		void *data;
+	} callbacks[MC_BATCH];
+	unsigned mcidx, argidx, cbidx;
 };
 
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +47,7 @@ void xen_mc_flush(void)
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
+	int i;
 
 	BUG_ON(preemptible());
 
@@ -51,8 +56,6 @@ void xen_mc_flush(void)
 	local_irq_save(flags);
 
 	if (b->mcidx) {
-		int i;
-
 		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
 			BUG();
 		for (i = 0; i < b->mcidx; i++)
@@ -64,6 +67,13 @@ void xen_mc_flush(void)
 	BUG_ON(b->argidx != 0);
 
 	local_irq_restore(flags);
+
+	for(i = 0; i < b->cbidx; i++) {
+		struct callback *cb = &b->callbacks[i];
+
+		(*cb->fn)(cb->data);
+	}
+	b->cbidx = 0;
 
 	BUG_ON(ret);
 }
@@ -88,3 +98,16 @@ struct multicall_space __xen_mc_entry(si
 
 	return ret;
 }
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct callback *cb;
+
+	if (b->cbidx == MC_BATCH)
+		xen_mc_flush();
+
+	cb = &b->callbacks[b->cbidx++];
+	cb->fn = fn;
+	cb->data = data;
+}
==================================================================
--- a/arch/i386/xen/multicalls.h
+++ b/arch/i386/xen/multicalls.h
@@ -42,4 +42,7 @@ static inline void xen_mc_issue(unsigned
 		local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
 }
 
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
 #endif /* _XEN_MULTICALLS_H */
--
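A stand-alone model of the callback buffer may help: callbacks accumulate per batch, run in registration order when the batch is flushed, and a full buffer forces an early flush. Sizes and names below are illustrative, not the kernel's:

/* Model of batch-completion callbacks: a per-batch array of (fn, data)
 * pairs that is run and reset when the batch is flushed; registering
 * into a full array flushes the current batch first. */
#include <stdio.h>

#define MC_BATCH 4

struct callback {
	void (*fn)(void *);
	void *data;
};

static struct callback callbacks[MC_BATCH];
static unsigned cbidx;

static void mc_flush(void)
{
	/* ...the queued multicalls would be issued here... */

	/* Then run the completion callbacks, in registration order. */
	for (unsigned i = 0; i < cbidx; i++)
		callbacks[i].fn(callbacks[i].data);
	cbidx = 0;
}

static void mc_callback(void (*fn)(void *), void *data)
{
	if (cbidx == MC_BATCH)	/* no room: flush the current batch */
		mc_flush();

	callbacks[cbidx].fn = fn;
	callbacks[cbidx].data = data;
	cbidx++;
}

static void report(void *data)
{
	printf("batch complete: %s\n", (const char *)data);
}

int main(void)
{
	for (int i = 0; i < 6; i++)	/* more registrations than MC_BATCH */
		mc_callback(report, "deferred work");
	mc_flush();
	return 0;
}

This ordering guarantee is what lets later patches in the series hand xen_mc_callback() a lock-release function: the callback cannot run before the batched hypercalls have actually been issued.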
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 09/10] xen: add some debug output for failed multicalls
Multicalls are expected to never fail, and the normal response to a failed
multicall is very terse. In the interests of better debuggability, add some
more verbose output. It may be worth turning this off once it all seems more
tested.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>

---
 arch/i386/xen/multicalls.c |   25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

==================================================================
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -26,11 +26,16 @@
 
 #include "multicalls.h"
 
+#define MC_DEBUG	1
+
 #define MC_BATCH	32
 #define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))
 
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+	struct multicall_entry debug[MC_BATCH];
+#endif
 	u64 args[MC_ARGS];
 	struct callback {
 		void (*fn)(void *);
@@ -56,11 +61,31 @@ void xen_mc_flush(void)
 	local_irq_save(flags);
 
 	if (b->mcidx) {
+#if MC_DEBUG
+		memcpy(b->debug, b->entries,
+		       b->mcidx * sizeof(struct multicall_entry));
+#endif
+
 		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
 			BUG();
 		for (i = 0; i < b->mcidx; i++)
 			if (b->entries[i].result < 0)
 				ret++;
+
+#if MC_DEBUG
+		if (ret) {
+			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+			       ret, smp_processor_id());
+			for(i = 0; i < b->mcidx; i++) {
+				printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+				       i+1, b->mcidx,
+				       b->debug[i].op,
+				       b->debug[i].args[0],
+				       b->entries[i].result);
+			}
+		}
+#endif
+
 		b->mcidx = 0;
 		b->argidx = 0;
 	} else
--
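The debug aid relies on snapshotting the batch before submission, since the call overwrites each entry's result in place. A rough user-space model of that pattern is below; the entry layout and the injected failure are made up for illustration only:

/* Model of "snapshot then report": keep a copy of the batch entries
 * taken before submission, then print op/arg from the snapshot next to
 * the result written back by the (simulated) call. */
#include <stdio.h>
#include <string.h>

#define MC_BATCH 4

struct entry {
	unsigned long op;
	unsigned long arg;
	long result;
};

static struct entry entries[MC_BATCH];
static struct entry debug[MC_BATCH];	/* snapshot taken before submit */

static void submit(struct entry *e, int n)
{
	for (int i = 0; i < n; i++)
		e[i].result = (e[i].op == 0) ? -22 : 0;	/* fake one failure */
}

static void flush_with_debug(int n)
{
	int failed = 0;

	memcpy(debug, entries, n * sizeof(struct entry));
	submit(entries, n);

	for (int i = 0; i < n; i++)
		if (entries[i].result < 0)
			failed++;

	if (failed) {
		printf("%d multicall(s) failed\n", failed);
		for (int i = 0; i < n; i++)
			printf("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
			       i + 1, n, debug[i].op, debug[i].arg,
			       entries[i].result);
	}
}

int main(void)
{
	entries[0] = (struct entry){ .op = 1, .arg = 0x10 };
	entries[1] = (struct entry){ .op = 0, .arg = 0x20 };	/* will "fail" */
	flush_with_debug(2);
	return 0;
}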
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 01/10] remove dead code in pgtable_cache_init
The conversion from using a slab cache to quicklists left some residual dead
code.

I note that after the conversion it always allocates a whole page for the pgd,
rather than the 32 bytes needed for a PAE pgd. Was this intended?

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@suse.de>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>

---
 arch/i386/mm/init.c |   22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

==================================================================
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -746,24 +746,12 @@ struct kmem_cache *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
-	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-	if (PTRS_PER_PMD > 1) {
+	if (PTRS_PER_PMD > 1)
 		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					SLAB_PANIC,
-					pmd_ctor);
-		if (!SHARED_KERNEL_PMD) {
-			/* If we're in PAE mode and have a non-shared
-			   kernel pmd, then the pgd size must be a
-			   page size.  This is because the pgd_list
-			   links through the page structure, so there
-			   can only be one pgd per page for this to
-			   work. */
-			pgd_size = PAGE_SIZE;
-		}
-	}
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      SLAB_PANIC,
+					      pmd_ctor);
 }
 
 /*
--
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 07/10] xen: ask the hypervisor how much space it needs reserved
Ask the hypervisor how much address space it needs reserved, since a 32-on-64
guest doesn't need any space reserved, and the amount may change in the future.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>

---
 arch/i386/xen/enlighten.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

==================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -1085,6 +1085,17 @@ static const struct machine_ops __initda
 
 };
 
+static void __init xen_reserve_top(void)
+{
+	unsigned long top = HYPERVISOR_VIRT_START;
+	struct xen_platform_parameters pp;
+
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+		top = pp.virt_start;
+
+	reserve_top_address(-top + 2 * PAGE_SIZE);
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1134,7 +1145,7 @@ asmlinkage void __init xen_start_kernel(
 	paravirt_ops.kernel_rpl = 0;
 
 	/* set the limit of our address space */
-	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+	xen_reserve_top();
 
 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
--
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 08/10] xen: fix incorrect vcpu_register_vcpu_info hypercall argument
The kernel's copy of struct vcpu_register_vcpu_info was out of date, at best
causing the hypercall to fail and the guest kernel to fall back to the old
mechanism, or worse, causing random memory corruption.

[ Stable folks: applies to 2.6.23 ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Stable Kernel <stable@kernel.org>
Cc: Morten Bøgeskov <xen-users@morten.bogeskov.dk>
Cc: Mark Williamson <mark.williamson@cl.cam.ac.uk>

---
 arch/i386/xen/enlighten.c    |    2 +-
 include/xen/interface/vcpu.h |    5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

==================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -115,7 +115,7 @@ static void __init xen_vcpu_setup(int cp
 	info.mfn = virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
-	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
 	       cpu, vcpup, info.mfn, info.offset);
 
 	/* Check to see if the hypervisor will put the vcpu_info
==================================================================
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer {
  */
 #define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
 struct vcpu_register_vcpu_info {
-    uint32_t mfn;               /* mfn of page to place vcpu_info */
-    uint32_t offset;            /* offset within page */
+    uint64_t mfn;               /* mfn of page to place vcpu_info */
+    uint32_t offset;            /* offset within page */
+    uint32_t rsvd;              /* unused */
 };
 
 #endif /* __XEN_PUBLIC_VCPU_H__ */
--
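A quick way to see what the layout fix buys is to check the field offsets of the corrected structure as given in the patch: with a 64-bit mfn, the offset field lands at byte 8 and the struct is 16 bytes, whereas the old all-uint32_t version put offset at byte 4 in an 8-byte struct, so the hypervisor and guest disagreed on where the arguments lived. The check below is only an illustration, not part of the patch:

/* Layout check for the corrected hypercall argument structure. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct vcpu_register_vcpu_info {
	uint64_t mfn;		/* mfn of page to place vcpu_info */
	uint32_t offset;	/* offset within page */
	uint32_t rsvd;		/* unused */
};

int main(void)
{
	printf("sizeof=%zu offsetof(mfn)=%zu offsetof(offset)=%zu\n",
	       sizeof(struct vcpu_register_vcpu_info),
	       offsetof(struct vcpu_register_vcpu_info, mfn),
	       offsetof(struct vcpu_register_vcpu_info, offset));
	/* expected: sizeof=16 offsetof(mfn)=0 offsetof(offset)=8 */
	return 0;
}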
Jeremy Fitzhardinge
2007-Oct-12 14:33 UTC
[PATCH 10/10] xfs: eagerly remove vmap mappings to avoid upsetting Xen
XFS leaves stray mappings around when it vmaps memory to make it virtually
contiguous. This upsets Xen if one of those pages is being recycled into a
pagetable, since it finds an extra writable mapping of the page.

This patch solves the problem in a brute-force way, by making XFS always
eagerly unmap its mappings.

[ Stable: this works around a bug in 2.6.23. We may come up with a better
  solution for mainline, but this seems like a low-impact fix for the stable
  kernel. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: XFS masters <xfs-masters@oss.sgi.com>
Cc: Stable kernel <stable@kernel.org>
Cc: Morten Bøgeskov <xen-users@morten.bogeskov.dk>
Cc: Mark Williamson <mark.williamson@cl.cam.ac.uk>

---
 fs/xfs/linux-2.6/xfs_buf.c |   13 +++++++++++++
 1 file changed, 13 insertions(+)

==================================================================
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -186,6 +186,19 @@ free_address(
 	void		*addr)
 {
 	a_list_t	*aentry;
+
+#ifdef CONFIG_XEN
+	/*
+	 * Xen needs to be able to make sure it can get an exclusive
+	 * RO mapping of pages it wants to turn into a pagetable.  If
+	 * a newly allocated page is also still being vmap()ed by xfs,
+	 * it will cause pagetable construction to fail.  This is a
+	 * quick workaround to always eagerly unmap pages so that Xen
+	 * is happy.
+	 */
+	vunmap(addr);
+	return;
+#endif
 
 	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
 	if (likely(aentry)) {
--