Jürgen Keil
2008-Apr-21 10:26 UTC
xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write?
The problem below may or may not be related to problems recently reported here: - "Opensolaris guest eats 100% cpu" http://www.opensolaris.org/jive/thread.jspa?threadID=57568&tstart=0 - "Libmicro issue in XVM Guest" http://www.opensolaris.org/jive/thread.jspa?threadID=57630&tstart=0 I''m observing *excessive high* number of pagefaults after a process has forked. I''m observing this both in 32-bit dom0 OpenSolaris kernels, and also in 32-bit domU OpenSolaris kernels, when running on a 32-bit 3.1.2-xvm hypervisor. Problem is much worse in the PV domU OpenSolaris kernel. I''m using current OpenSolaris bits compiled from the mercurial repository (post snv_88). Systems where I observed this issue: 1. ASUS M2NPV-VM mainboard, AMD Athlon(tm) 64 X2 Dual Core Processor 6400+ 8GB of memory Note: xen 3.1.2-xvm hypervisor was booted with option "mem=4G" or "mem=2G" 2. ASUS M2N-SLI deluxe, AMD Athlon(tm) 64 X2 Dual Core Processor 4200+ 2GB of memory 3. Toshiba Tecra S1, Pentium M, 2GB of memory Test case is this: # cat fork.c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/wait.h> static void fill(int n, void *p) { char data[4096]; memset(data, data[0], sizeof(data)); if (--n > 0) fill(n, data); } int main(int argc, char **argv) { pid_t p; int status; fill(8, NULL); switch (p = fork()) { case -1: perror("fork"); exit(1); case 0: /*sleep(1);*/ fill(8, NULL); _exit(0); default: fill(8, NULL); wait(&status); break; } exit(0); } # cc -o fork fork.c # cat pagefault.d #!/usr/sbin/dtrace -s #pragma D option flowindent BEGIN { type[0 /*F_INVAL*/] = "F_INVAL"; type[1 /*F_PROT*/] = "F_PROT"; type[2 /*F_SOFTLOCK*/] = "F_SOFTLOCK"; type[3 /*F_SOFTUNLOCK*/] = "F_SOFTUNLOCK"; rw[0 /*S_OTHER*/] = "S_OTHER"; rw[1 /*S_READ*/] = "S_READ"; rw[2 /*S_WRITE*/] = "S_WRITE"; rw[3 /*S_EXEC*/] = "S_EXEC"; rw[4 /*S_CREATE*/] = "S_CREATE"; rw[5 /*S_READ_NOCOW*/] = "S_READ_NOCOW"; flt_cnt = 0; } fbt::pagefault:entry { this->addr = (caddr_t)arg0; this->type = 
(enum fault_type)arg1; this->rw = (enum seg_rw)arg2; this->iskernel = arg3; } fbt::pagefault:entry /(uint64_t)this->addr < 0x08048000 && execname == "fork"/ { @fault[execname, type[this->type], this->addr, rw[this->rw], uregs[R_PC]] = count(); printf("prog %s(%d), addr %a %s %s, pc %a trap %x err %x", execname, pid, this->addr, type[this->type], rw[this->rw], uregs[R_PC], uregs[R_TRAPNO], uregs[R_ERR]); self->trace = 1; } fbt::pagefault:return /self->trace/ { printf("ret: %x", arg1); self->trace = 0; } END { printa("prog %s, type %s, addr %a %s, pc %a, count %@u\n", @fault); } /* fbt::segvn_fault:entry /self->trace && ((uregs[R_PC] & 0xffff) == 0x4a3b || (uregs[R_PC] & 0xffff) == 0x4a3c) && flt_cnt < 5/ { self->trace_me = 1; flt_cnt++; } fbt:::entry /self->trace_me/ { } fbt:::return /self->trace_me/ { printf("returns %x", arg1); } fbt::segvn_fault:return /self->trace_me/ { self->trace_me = 0; } fbt::x86pte_cas:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->old = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); this->new = (x86pte_t)((uint32_t)arg4 | (arg5 << 32)); printf("entry %x, %llx -> %llx", this->entry, this->old, this->new); } fbt::x86pte_set:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->new = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); printf("entry %x, new %llx", this->entry, this->new); } fbt::x86pte_inval:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->expect = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); printf("entry %x, expect %llx", this->entry, this->expect); } fbt::HYPERVISOR_mmu_update:entry /self->trace_me/ { this->req = (mmu_update_t *)arg0; this->count = (int)arg1; this->success_count = (int *)arg2; this->domain_id = (domid_t)arg3; printf("req[0/%d]: ptr %p, val %llx", this->count, this->req[0].ptr, this->req[0].val); } fbt::HYPERVISOR_mmuext_op:entry /self->trace_me/ { this->req2 = (struct mmuext_op *)arg0; this->count = (int)arg1; 
this->success_count = (int *)arg2; this->domain_id = (domid_t)arg3; printf("req[0/%d]: cmd %x, addr %p", this->count, this->req2[0].cmd, this->req2[0].arg1.linear_addr); } fbt::HYPERVISOR_update_va_mapping:entry /self->trace_me/ { this->va = (ulong_t)arg0; this->new_pte = ((uint32_t)arg1 | (arg2 << 32)); this->flags = (ulong_t)arg3; printf("va %p, new_pte %llx, flags %lx", this->va, this->new_pte, this->flags); } */ # dtrace -s pagefault.d -c ./fork The dtrace script produces a list of stack pagefaults and a summary at the end, the stack pagefault summary looks something like this: prog fork, type F_INVAL, addr 0x8047db8 S_READ, pc 0xd2b04a3b, count 1 prog fork, type F_PROT, addr 0x803fd78 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8040d88 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8041d98 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8042da8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8043db8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8044dc8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8045dd8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8046de8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_INVAL, addr 0x803fd7c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8040d8c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8041d9c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8042dac S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8043dbc S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8044dcc S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8045ddc S_READ, pc 0x8050a79, count 2 prog fork, type F_PROT, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2 prog fork, type F_INVAL, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2647 The problem is the 2647 pagefaults that we get in the forked child at PC 0xd2b04a3c (which translates to libc.so.1`__forkx+0xc) trying to write 
to the stack at address 0x8047dbc and failing with a page-not-present pagefault (trap 0xe error 0x6).> 0xd2b04a3c::dislibc.so.1`__forkx: popl %ecx libc.so.1`__forkx+1: pushl $0x0 libc.so.1`__forkx+3: pushl %ecx libc.so.1`__forkx+4: movl $0x8e,%eax libc.so.1`__forkx+9: int $0x91 libc.so.1`__forkx+0xb: popl %ecx <<<< 0xd2b04a3b [1] libc.so.1`__forkx+0xc: movl %ecx,0x0(%esp) <<<< 0xd2b04a3c [2] libc.so.1`__forkx+0x10: jb -0x80326 <libc.so.1`__cerror> libc.so.1`__forkx+0x16: testl %edx,%edx libc.so.1`__forkx+0x18: je +0x2 <libc.so.1`__forkx+0x1c> libc.so.1`__forkx+0x1a: xorl %eax,%eax libc.so.1`__forkx+0x1c: ret We get one F_INVAL / S_READ stack page fault at __forkx+b [1], where the popl %ecx tries to read something from the stack. This apparently installs a readonly shared page from the parent''s address space. One of the two F_PROT / S_WRITE stack page faults at __forkx+c [2] does the copy-on-write operation and installs a private writable copy of the stack page. Now I would expect that we can finally write to the stack in the child process. But instead we now get *lots* of F_INVAL / S_WRITE stack page faults at address [2]. That doesn''t seem to be correct. The pagefault handler doesn''t seem to change anything in the mmu for these pagefaults because everything appears to be set up correctly; all that is done is an INVLPG for the fault address on the stack though the hypervisor via a call to HYPERVISOR_mmuext_op(MMUEXT_INVLPG_LOCAL). The process repeats a few thousand times. After a while the problem disappears automagically and the forked child process starts to run. Of cause these high number of pagefaults is really bad for the system''s performance, the kernel uses almost 100% of system cpu time to handle them. Btw. the problem cannot be reproduced when running a 32-bit PV domU on a 64-bit dom0. Or when running both dom0 and domU in 64-bit. This message posted from opensolaris.org
Jürgen Keil
2008-Apr-21 12:52 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
> I'm observing *excessive high* number of pagefaults
> after a process has forked.
...
> Now I would expect that we can finally write to the stack in the
> child process. But instead we now get *lots* of F_INVAL / S_WRITE
> stack page faults at address [2]. That doesn't seem to be
> correct.

I (google) just found this: Joe Bonasera's blog might contain an explanation of what is happening, in the section "Spurious Page Faults": http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious The only thing that does not match is that my 32-bit PV domUs run with vcpus = 1; and my Tecra S1 is a UP machine, so dom0 = domU = 1 vcpu. This message posted from opensolaris.org
John Levon
2008-Apr-21 13:13 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
On Mon, Apr 21, 2008 at 05:52:02AM -0700, Jürgen Keil wrote:

> Joe Bonasera's blog might contain an explanation what is
> happening, in the section "Spurious Page Faults":
>
> http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious

This doesn't affect us any more. This type of writable page table was removed, since it provided no performance benefit. regards john
Jürgen Keil
2008-Apr-24 17:33 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
John Levon wrote:> On Mon, Apr 21, 2008 at 05:52:02AM -0700, Jürgen Keil wrote: > > > Joe Bonasera's blog might contain an explanation what is > > happening, in the section "Spurious Page Faults": > > > > http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious > > This doesn't affect us any more. This type of writable page table was > removed, since it provided no performance benefit.Ok... Looking at the dtrace output for the pv 32bit copy-on-write test program, I see that x86pte_inval() does an INVLPG through the hypervisor (MMUEXT_INVLPG_LOCAL) when it removes a page mapping. Like this (this removes the read-only cow stack page): 1 -> x86pte_inval 1 | x86pte_inval:entry entry 47, expect 1479b025 1 -> x86pte_access_pagetable 1 -> x86pte_mapin 1 -> pa_to_ma 1 -> pfn_to_mfn 1 <- pfn_to_mfn returns 12473 1 <- pa_to_ma returns 12473000 1 -> xen_map 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 8000000012473001, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- xen_map returns 0 1 <- x86pte_mapin returns cda02238 1 <- x86pte_access_pagetable returns cda02238 1 -> get_pte64 1 <- get_pte64 returns 1479b025 1 -> htable_e2va 1 <- htable_e2va returns 8047000 1 -> hat_tlb_inval 1 -> xen_flush_va 1 -> HYPERVISOR_mmuext_op 1 | HYPERVISOR_mmuext_op:entry req[0/1]: cmd 7, addr 8047000 1 <- HYPERVISOR_mmuext_op returns 0 1 <- xen_flush_va returns 0 1 <- hat_tlb_inval returns 1 1 -> x86pte_release_pagetable 1 -> x86pte_mapout 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 0, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- x86pte_mapout returns cf9df800 1 <- x86pte_release_pagetable returns cf9df800 1 <- x86pte_inval returns 1479b025 Code in uts/i86pc/vm/htable.c function x86pte_inval() is this 2222 /* 2223 * Note that the loop is needed to handle changes due to h/w updating 2224 * of PT_MOD/PT_REF. 
2225 */ 2226 do { 2227 oldpte = GET_PTE(ptep); 2228 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 2229 goto done; 2230 XPV_ALLOW_PAGETABLE_UPDATES(); 2231 found = CAS_PTE(ptep, oldpte, 0); 2232 XPV_DISALLOW_PAGETABLE_UPDATES(); 2233 } while (found != oldpte); 2234 if (oldpte & (PT_REF | PT_MOD)) 2235 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); The invalidated PTE was accessed (return value from get_pte64 had the 0x20 bit set), so line 2235 hat_tlb_inval() is called which invalidates the TLB for that stack page. Ok so far. Why doesn't x86pte_set() use INVLPG when it installs a new PTE entry? The dtrace for my fork test case contains this (this one installes the writable page after we got the COW fault): 1 -> x86pte_set 1 | x86pte_set:entry entry 47, new bc39a007 1 -> htable_e2va 1 <- htable_e2va returns 8047000 1 -> x86pte_access_pagetable 1 -> x86pte_mapin 1 -> pa_to_ma 1 -> pfn_to_mfn 1 <- pfn_to_mfn returns 12473 1 <- pa_to_ma returns 12473000 1 -> xen_map 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 8000000012473001, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- xen_map returns 0 1 <- x86pte_mapin returns cda02238 1 <- x86pte_access_pagetable returns cda02238 1 -> get_pte64 1 <- get_pte64 returns 0 1 -> x86pte_release_pagetable 1 -> x86pte_mapout 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 0, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- x86pte_mapout returns cf9df800 1 <- x86pte_release_pagetable returns cf9df800 1 <- x86pte_set returns 0 The hypervisor is told up invalidate the page that contains the PTE (via HYPERVISOR_update_va_mapping, va cda02000 flags 2), but the CPU / MMU isn't told that the mapping for the virtual stack address 8047000 has changed. Isn't it possible that the CPU / MMU / TLB has cached the information "virtual stack address 8047000 is not valid address", after the call to x86pte_inval() ? 
htable.c x86pte_set() does a TLB flush when the old PTE referred to a referenced page, but it doesn't update the TLB when an empty PTE was replaced with a new translation: 2090 /* 2091 * Do a TLB demap if needed, ie. the old pte was valid. 2092 * 2093 * Note that a stale TLB writeback to the PTE here either can't happen 2094 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2095 * mappings, but they were created with REF and MOD already set, so 2096 * no stale writeback will happen. 2097 * 2098 * Segmap is the only place where remaps happen on the same pfn and for 2099 * that we want to preserve the stale REF/MOD bits. 2100 */ 2101 if (old & PT_REF) 2102 hat_tlb_inval(hat, addr); Btw. I've been experimenting with this change to x86pte_set() (lines 2103 ... 2111 added): 2090 /* 2091 * Do a TLB demap if needed, ie. the old pte was valid. 2092 * 2093 * Note that a stale TLB writeback to the PTE here either can't happen 2094 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2095 * mappings, but they were created with REF and MOD already set, so 2096 * no stale writeback will happen. 2097 * 2098 * Segmap is the only place where remaps happen on the same pfn and for 2099 * that we want to preserve the stale REF/MOD bits. 2100 */ 2101 if (old & PT_REF) 2102 hat_tlb_inval(hat, addr); 2103 #if defined(__i386) && defined(__xpv) 2104 /* jk: ugly hack / experiment with PV spurious page faults */ 2105 else if (old == 0 && addr < 0x8048000 && xpv_page_fault_hack) { 2106 if (xpv_page_fault_hack == 1) 2107 xen_flush_tlb(); 2108 else 2109 xen_flush_va((caddr_t)addr); 2110 } 2111 #endif With xpv_page_fault_hack := 0 I get the original code. With xpv_page_fault_hack := 2 I try to do an INVALPG on the new installed translation. But that hasn't fixed the issue... But with xpv_page_fault_hack := 1 the entire TLB gets flushed when installing new stack pages, and now: 1. the libMicro-0.4.0 fork_100 test runs ~ 30x faster in a 32-bit PV domU !! 
800 seconds -> 28 seconds 2 ./boot/solaris/bin/create_ramdisk runs ~ 4x faster in a 32-bit PV domU ! 2 minutes -> 36 seconds So it seems that there is an issue with the TLB in 32-bit xVM PV doms... This message posted from opensolaris.org _______________________________________________ xen-discuss mailing list xen-discuss@opensolaris.org
Mark Johnson
2008-Apr-24 19:37 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
Joe''s response.. --- -------- Original Message -------- Subject: Re: [Fwd: Re: [xen-discuss] xvm 32-bit: excessive number of pagefaults after fork() / copy-on-wri] Date: Thu, 24 Apr 2008 12:00:41 -0700 From: Joe B... > > > The hypervisor is told up invalidate the page that contains the > PTE (via HYPERVISOR_update_va_mapping, va cda02000 flags 2), > but the CPU / MMU isn''t told that the mapping for the virtual stack address > 8047000 has changed. Isn''t it possible that the CPU / MMU / TLB has > cached the information "virtual stack address 8047000 is not valid > address", > after the call to x86pte_inval() ? Nope. No hardware ever works this way. It only caches information in the TLB when the lowest bit (PRESENT) is set in the PTE. Therefore when setting an entry that is zero to a non-zero value, an INVLPG instruction is never needed on hardware. > > htable.c x86pte_set() does a TLB flush when the old PTE > referred to a referenced page, but it doesn''t update the TLB when > an empty PTE was replaced with a new translation: > > 2090 /* ... > > > > Btw. I''ve been experimenting with this change to x86pte_set() > (lines 2103 ... 2111 added): > > 2090 /* ... > 2111 #endif > > > With xpv_page_fault_hack := 0 I get the original code. > > With xpv_page_fault_hack := 2 I try to do an INVALPG on the > new installed translation. But that hasn''t fixed the issue... > > But with xpv_page_fault_hack := 1 the entire TLB gets flushed > when installing new stack pages, and now: That would confirm a bug in Xen.. both xen_flush_tlb() and xen_flush_va() should have identical behavior here - and shouldn''t be necessary either. > > 1. the libMicro-0.4.0 fork_100 test runs ~ 30x faster in a 32-bit PV > domU !! > 800 seconds -> 28 seconds > 2 ./boot/solaris/bin/create_ramdisk runs ~ 4x faster in a 32-bit PV domU ! > 2 minutes -> 36 seconds > > > So it seems that there is an issue with the TLB in 32-bit xVM PV doms... 
The bug is probably in TLB flushing management in the Xen code itself. I know they've said in the past that they do all kinds of very crafty optimizations to avoid unnecessary invalidates in the hypervisor. I suspect they've got a bug.
Jürgen Keil
2008-May-14 13:28 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
Mark wrote:> Joe''s response..> > So it seems that there is an issue with the TLB in 32-bit xVM PV doms... > > The bug is probably in TLB flushing management in the Xen code itself. > I know they''ve said in the past that they do all kinds of very crafty > optimizations to avoid unnecessary invalidates in the hypervisor. > I suspect they''ve got a bug.The bug is in the Solaris 32-bit PAE xVM xm kernel code. It doesn''t do a full tlb flush when one of the four PDPTR entries changes; instead of a full tlb flush Solaris tries to use INVLPG, but Intel has documented that this doesn''t work... Intel has published an application note about the TLBs and their invalidation: http://www.intel.com/products/processor/manuals/ http://www.intel.com/design/processor/applnots/317080.pdf And in that application note, the following is documented in section 8.1: --------------------------------------------------- The processor does not maintain a PDP cache as described in Section 4. The processor always caches information from the four page-directory-pointer- table entries. These entries are not cached at the time of address translation. Instead, they are always cached as part of the execution of the following instructions: o A MOV to CR3 that occurs with IA32_EFER.LMA = 0 and CR4.PAE = 1. o A MOV to CR4 that results in CR4.PAE = 1, that occurs with IA32_EFER.LMA = 0 and CR0.PG = 1, and that modifies at least one of CR4.PAE, CR4.PGE, and CR4.PSE. o A MOV to CR0 that modifies CR0.PG and that occurs with IA32_EFER.LMA = 0 and CR4.PAE = 1. These instructions fault if they would load a PDPTR that sets any of the bits that must be 0 (see above). These cached entries are not modified by any other operations.14 In particular, executions of INVLPG do not affect these cached entries. --------------------------------------------------- Solaris implements this: 1043 static void 1044 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr) 1045 { ... 
1067 /* 1068 * When a top level VLP page table entry changes, we must issue 1069 * a reload of cr3 on all processors. 1070 * 1071 * If we don''t need do do that, then we still have to INVLPG against 1072 * an address covered by the inner page table, as the latest processors 1073 * have TLB-like caches for non-leaf page table entries. 1074 */ 1075 if (!(hat->hat_flags & HAT_FREEING)) { 1076 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? 1077 DEMAP_ALL_ADDR : old->ht_vaddr); 1078 } and 1087 static void 1088 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr) 1089 { ... 1104 /* 1105 * When any top level VLP page table entry changes, we must issue 1106 * a reload of cr3 on all processors using it. 1107 * We also need to do this for the kernel hat on PAE 32 bit kernel. 1108 */ 1109 if ( 1110 #ifdef __i386 1111 (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || 1112 #endif 1113 (higher->ht_flags & HTABLE_VLP)) 1114 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); That is, when we remove PDPTR entries in unlink_ptp there is a hat_tlb_inval(DEMAP_ALL_ADDR) (flush TLB by CR3 reload), but only when "higher->ht_flags & HTABLE_VLP". Under 32-bit PV xVM, the HTABLE_VLP flag isn''t set, so the code does an hat_tlb_inval(old->ht_vaddr) which results in an INVLPG. According to section 8.1 in Intel''s document, the INVLPG does *not* affect the cached PDPTR entries When PDPTR entries are added in link_ptp, there is a hat_tlb_inval(DEMAP_ALL_ADDR). Again, this is only done when the HTABLE_VLP flag is set; it isn''t under 32-bit PV xVM. Or when (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL); but this is a non-kernel hat mapping, so the hat_tlb_inval() is skipped. 
When I change the kernel so something like this, it seems to work without getting lots of spurious page faults: diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c --- a/usr/src/uts/i86pc/vm/htable.c +++ b/usr/src/uts/i86pc/vm/htable.c @@ -1073,8 +1079,11 @@ unlink_ptp(htable_t *higher, htable_t *o * have TLB-like caches for non-leaf page table entries. */ if (!(hat->hat_flags & HAT_FREEING)) { - hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? - DEMAP_ALL_ADDR : old->ht_vaddr); + hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) +#ifdef __i386 + || (higher->ht_level == VLP_LEVEL) +#endif + ? DEMAP_ALL_ADDR : old->ht_vaddr); } HTABLE_DEC(higher->ht_valid_cnt); @@ -1108,7 +1117,7 @@ link_ptp(htable_t *higher, htable_t *new */ if ( #ifdef __i386 - (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || + (higher->ht_level == VLP_LEVEL) || #endif (higher->ht_flags & HTABLE_VLP)) hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); On metal this isn''t a problem because the HTABLE_VLP flag is set on the L2 htable, so the tlb gets flushed by CR3 reloads. This message posted from opensolaris.org
Jürgen Keil
2008-May-14 13:48 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
> I''m observing *excessive high* number of pagefaults > after a process has forked.Btw: I''m observing a similar / the same problem with mprotect(2): # cat mapp.c #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/mman.h> #define PAGESIZE 4096 int main(int argc, char **argv) { int fd; char *filename; char data_tmp[PAGESIZE]; void *data; char tmp; fd = open(filename = "/tmp/data", O_RDWR|O_TRUNC|O_CREAT, 0666); if (fd < 0) { perror(filename); exit(1); } memset(data_tmp, ''*'', sizeof(data_tmp)); write(fd, data_tmp, sizeof(data_tmp)); data = mmap((void*)0x41002000, PAGESIZE, PROT_READ, MAP_FIXED|MAP_SHARED, fd, 0); if (data == MAP_FAILED) { perror("mmap"); exit(1); } tmp = *(char*)data; mprotect(data, PAGESIZE, PROT_READ|PROT_WRITE); *(char*)data = tmp; close(fd); exit(0); } This program should produce two pagefaults for address 0x41002000; but when I just ran it on a gentoo 32-bit dom0 with xen 3.2.0 pae hypervisor, in a snv_81 domU, the test program was running for 45 seconds and produced almost six million pagefaults for address 41002000. This message posted from opensolaris.org