Konrad Rzeszutek Wilk
2013-Mar-23 13:36 UTC
[PATCH 1/2] x86: mm: Fix vmalloc_fault oops during lazy MMU updates.
From: Samu Kallio <samu.kallio@aberdeencloud.com>
In paravirtualized x86_64 kernels, vmalloc_fault may cause an oops
when lazy MMU updates are enabled, because set_pgd effects are being
deferred.
One instance of this problem is during process mm cleanup with memory
cgroups enabled. The chain of events is as follows:
- zap_pte_range enables lazy MMU updates
- zap_pte_range eventually calls mem_cgroup_charge_statistics,
which accesses the vmalloc'd mem_cgroup per-cpu stat area
- vmalloc_fault is triggered which tries to sync the corresponding
PGD entry with set_pgd, but the update is deferred
- vmalloc_fault oopses due to a mismatch in the PUD entries
The oops usually looks like this:
------------[ cut here ]------------
kernel BUG at arch/x86/mm/fault.c:396!
invalid opcode: 0000 [#1] SMP
.. snip ..
CPU 1
Pid: 10866, comm: httpd Not tainted 3.6.10-4.fc18.x86_64 #1
RIP: e030:[<ffffffff816271bf>] [<ffffffff816271bf>]
vmalloc_fault+0x11f/0x208
.. snip ..
Call Trace:
[<ffffffff81627759>] do_page_fault+0x399/0x4b0
[<ffffffff81004f4c>] ? xen_mc_extend_args+0xec/0x110
[<ffffffff81624065>] page_fault+0x25/0x30
[<ffffffff81184d03>] ? mem_cgroup_charge_statistics.isra.13+0x13/0x50
[<ffffffff81186f78>] __mem_cgroup_uncharge_common+0xd8/0x350
[<ffffffff8118aac7>] mem_cgroup_uncharge_page+0x57/0x60
[<ffffffff8115fbc0>] page_remove_rmap+0xe0/0x150
[<ffffffff8115311a>] ? vm_normal_page+0x1a/0x80
[<ffffffff81153e61>] unmap_single_vma+0x531/0x870
[<ffffffff81154962>] unmap_vmas+0x52/0xa0
[<ffffffff81007442>] ? pte_mfn_to_pfn+0x72/0x100
[<ffffffff8115c8f8>] exit_mmap+0x98/0x170
[<ffffffff810050d9>] ? __raw_callee_save_xen_pmd_val+0x11/0x1e
[<ffffffff81059ce3>] mmput+0x83/0xf0
[<ffffffff810624c4>] exit_mm+0x104/0x130
[<ffffffff8106264a>] do_exit+0x15a/0x8c0
[<ffffffff810630ff>] do_group_exit+0x3f/0xa0
[<ffffffff81063177>] sys_exit_group+0x17/0x20
[<ffffffff8162bae9>] system_call_fastpath+0x16/0x1b
Calling arch_flush_lazy_mmu_mode immediately after set_pgd makes the
changes visible to the consistency checks.
CC: stable@vger.kernel.org
RedHat-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=914737
Tested-by: Josh Boyer <jwboyer@redhat.com>
Reported-and-Tested-by: Krishna Raman <kraman@redhat.com>
Signed-off-by: Samu Kallio <samu.kallio@aberdeencloud.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/mm/fault.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2b97525..0e88336 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -378,10 +378,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;
- if (pgd_none(*pgd))
+ if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
- else
+ arch_flush_lazy_mmu_mode();
+ } else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
/*
* Below here mismatches are bugs because these lower tables
--
1.8.0.2
Konrad Rzeszutek Wilk
2013-Mar-23 13:36 UTC
[PATCH 2/2] mm/x86: Patch out arch_flush_lazy_mmu_mode() when running on bare metal
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Invoking arch_flush_lazy_mmu_mode() results in calls to
preempt_enable()/disable() which may have a performance impact.
Since lazy MMU is not used on bare metal we can patch away
arch_flush_lazy_mmu_mode() so that it is never called in such
an environment.
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Tested-by: Josh Boyer <jwboyer@redhat.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/include/asm/paravirt.h | 5 ++++-
arch/x86/include/asm/paravirt_types.h | 2 ++
arch/x86/kernel/paravirt.c | 25 +++++++++++++------------
arch/x86/lguest/boot.c | 1 +
arch/x86/xen/mmu.c | 1 +
5 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5edd174..7361e47 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -703,7 +703,10 @@ static inline void arch_leave_lazy_mmu_mode(void)
PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}
-void arch_flush_lazy_mmu_mode(void);
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush);
+}
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 142236e..b3b0ec1 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -91,6 +91,7 @@ struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
void (*enter)(void);
void (*leave)(void);
+ void (*flush)(void);
};
struct pv_time_ops {
@@ -679,6 +680,7 @@ void paravirt_end_context_switch(struct task_struct *next);
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
+void paravirt_flush_lazy_mmu(void);
void _paravirt_nop(void);
u32 _paravirt_ident_32(u32);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 17fff18..8bfb335 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -263,6 +263,18 @@ void paravirt_leave_lazy_mmu(void)
leave_lazy(PARAVIRT_LAZY_MMU);
}
+void paravirt_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
@@ -292,18 +304,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
return this_cpu_read(paravirt_lazy_mode);
}
-void arch_flush_lazy_mmu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -475,6 +475,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.lazy_mode = {
.enter = paravirt_nop,
.leave = paravirt_nop,
+ .flush = paravirt_nop,
},
.set_fixmap = native_set_fixmap,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cbd89c..7114c63 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1334,6 +1334,7 @@ __init void lguest_init(void)
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
+ pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
pv_mmu_ops.pte_update = lguest_pte_update;
pv_mmu_ops.pte_update_defer = lguest_pte_update;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e8e3493..f4f4105 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2197,6 +2197,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
--
1.8.0.2
Boris Ostrovsky
2013-Apr-03 13:26 UTC
Re: [PATCH 2/2] mm/x86: Patch out arch_flush_lazy_mmu_mode() when running on bare metal
On 03/23/2013 09:36 AM, Konrad Rzeszutek Wilk wrote:> From: Boris Ostrovsky <boris.ostrovsky@oracle.com> > > Invoking arch_flush_lazy_mmu_mode() results in calls to > preempt_enable()/disable() which may have performance impact. > > Since lazy MMU is not used on bare metal we can patch away > arch_flush_lazy_mmu_mode() so that it is never called in such > environment. > > Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> > Tested-by: Josh Boyer <jwboyer@redhat.com> > Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> > Acked-by: Borislav Petkov <bp@suse.de> > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Peter, what's the status of these two patches? They are not going into 3.9, right? Thanks. -boris > --- > arch/x86/include/asm/paravirt.h | 5 ++++- > arch/x86/include/asm/paravirt_types.h | 2 ++ > arch/x86/kernel/paravirt.c | 25 +++++++++++++------------ > arch/x86/lguest/boot.c | 1 + > arch/x86/xen/mmu.c | 1 + > 5 files changed, 21 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h > index 5edd174..7361e47 100644 > --- a/arch/x86/include/asm/paravirt.h > +++ b/arch/x86/include/asm/paravirt.h > @@ -703,7 +703,10 @@ static inline void arch_leave_lazy_mmu_mode(void) > PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); > } > > -void arch_flush_lazy_mmu_mode(void); > +static inline void arch_flush_lazy_mmu_mode(void) > +{ > + PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush); > +} > > static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, > phys_addr_t phys, pgprot_t flags) > diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h > index 142236e..b3b0ec1 100644 > --- a/arch/x86/include/asm/paravirt_types.h > +++ b/arch/x86/include/asm/paravirt_types.h > @@ -91,6 +91,7 @@ struct pv_lazy_ops { > /* Set deferred update mode, used for batching operations. 
*/ > void (*enter)(void); > void (*leave)(void); > + void (*flush)(void); > }; > > struct pv_time_ops { > @@ -679,6 +680,7 @@ void paravirt_end_context_switch(struct task_struct *next); > > void paravirt_enter_lazy_mmu(void); > void paravirt_leave_lazy_mmu(void); > +void paravirt_flush_lazy_mmu(void); > > void _paravirt_nop(void); > u32 _paravirt_ident_32(u32); > diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c > index 17fff18..8bfb335 100644 > --- a/arch/x86/kernel/paravirt.c > +++ b/arch/x86/kernel/paravirt.c > @@ -263,6 +263,18 @@ void paravirt_leave_lazy_mmu(void) > leave_lazy(PARAVIRT_LAZY_MMU); > } > > +void paravirt_flush_lazy_mmu(void) > +{ > + preempt_disable(); > + > + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { > + arch_leave_lazy_mmu_mode(); > + arch_enter_lazy_mmu_mode(); > + } > + > + preempt_enable(); > +} > + > void paravirt_start_context_switch(struct task_struct *prev) > { > BUG_ON(preemptible()); > @@ -292,18 +304,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) > return this_cpu_read(paravirt_lazy_mode); > } > > -void arch_flush_lazy_mmu_mode(void) > -{ > - preempt_disable(); > - > - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { > - arch_leave_lazy_mmu_mode(); > - arch_enter_lazy_mmu_mode(); > - } > - > - preempt_enable(); > -} > - > struct pv_info pv_info = { > .name = "bare hardware", > .paravirt_enabled = 0, > @@ -475,6 +475,7 @@ struct pv_mmu_ops pv_mmu_ops = { > .lazy_mode = { > .enter = paravirt_nop, > .leave = paravirt_nop, > + .flush = paravirt_nop, > }, > > .set_fixmap = native_set_fixmap, > diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c > index 1cbd89c..7114c63 100644 > --- a/arch/x86/lguest/boot.c > +++ b/arch/x86/lguest/boot.c > @@ -1334,6 +1334,7 @@ __init void lguest_init(void) > pv_mmu_ops.read_cr3 = lguest_read_cr3; > pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; > pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; > + pv_mmu_ops.lazy_mode.flush = 
paravirt_flush_lazy_mmu; > pv_mmu_ops.pte_update = lguest_pte_update; > pv_mmu_ops.pte_update_defer = lguest_pte_update; > > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c > index e8e3493..f4f4105 100644 > --- a/arch/x86/xen/mmu.c > +++ b/arch/x86/xen/mmu.c > @@ -2197,6 +2197,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { > .lazy_mode = { > .enter = paravirt_enter_lazy_mmu, > .leave = xen_leave_lazy_mmu, > + .flush = paravirt_flush_lazy_mmu, > }, > > .set_fixmap = xen_set_fixmap,