Jürgen Keil
2008-Apr-21 10:26 UTC
xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write?
The problem below may or may not be related to problems recently reported here: - "Opensolaris guest eats 100% cpu" http://www.opensolaris.org/jive/thread.jspa?threadID=57568&tstart=0 - "Libmicro issue in XVM Guest" http://www.opensolaris.org/jive/thread.jspa?threadID=57630&tstart=0 I''m observing *excessive high* number of pagefaults after a process has forked. I''m observing this both in 32-bit dom0 OpenSolaris kernels, and also in 32-bit domU OpenSolaris kernels, when running on a 32-bit 3.1.2-xvm hypervisor. Problem is much worse in the PV domU OpenSolaris kernel. I''m using current OpenSolaris bits compiled from the mercurial repository (post snv_88). Systems where I observed this issue: 1. ASUS M2NPV-VM mainboard, AMD Athlon(tm) 64 X2 Dual Core Processor 6400+ 8GB of memory Note: xen 3.1.2-xvm hypervisor was booted with option "mem=4G" or "mem=2G" 2. ASUS M2N-SLI deluxe, AMD Athlon(tm) 64 X2 Dual Core Processor 4200+ 2GB of memory 3. Toshiba Tecra S1, Pentium M, 2GB of memory Test case is this: # cat fork.c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/wait.h> static void fill(int n, void *p) { char data[4096]; memset(data, data[0], sizeof(data)); if (--n > 0) fill(n, data); } int main(int argc, char **argv) { pid_t p; int status; fill(8, NULL); switch (p = fork()) { case -1: perror("fork"); exit(1); case 0: /*sleep(1);*/ fill(8, NULL); _exit(0); default: fill(8, NULL); wait(&status); break; } exit(0); } # cc -o fork fork.c # cat pagefault.d #!/usr/sbin/dtrace -s #pragma D option flowindent BEGIN { type[0 /*F_INVAL*/] = "F_INVAL"; type[1 /*F_PROT*/] = "F_PROT"; type[2 /*F_SOFTLOCK*/] = "F_SOFTLOCK"; type[3 /*F_SOFTUNLOCK*/] = "F_SOFTUNLOCK"; rw[0 /*S_OTHER*/] = "S_OTHER"; rw[1 /*S_READ*/] = "S_READ"; rw[2 /*S_WRITE*/] = "S_WRITE"; rw[3 /*S_EXEC*/] = "S_EXEC"; rw[4 /*S_CREATE*/] = "S_CREATE"; rw[5 /*S_READ_NOCOW*/] = "S_READ_NOCOW"; flt_cnt = 0; } fbt::pagefault:entry { this->addr = (caddr_t)arg0; this->type = 
(enum fault_type)arg1; this->rw = (enum seg_rw)arg2; this->iskernel = arg3; } fbt::pagefault:entry /(uint64_t)this->addr < 0x08048000 && execname == "fork"/ { @fault[execname, type[this->type], this->addr, rw[this->rw], uregs[R_PC]] = count(); printf("prog %s(%d), addr %a %s %s, pc %a trap %x err %x", execname, pid, this->addr, type[this->type], rw[this->rw], uregs[R_PC], uregs[R_TRAPNO], uregs[R_ERR]); self->trace = 1; } fbt::pagefault:return /self->trace/ { printf("ret: %x", arg1); self->trace = 0; } END { printa("prog %s, type %s, addr %a %s, pc %a, count %@u\n", @fault); } /* fbt::segvn_fault:entry /self->trace && ((uregs[R_PC] & 0xffff) == 0x4a3b || (uregs[R_PC] & 0xffff) == 0x4a3c) && flt_cnt < 5/ { self->trace_me = 1; flt_cnt++; } fbt:::entry /self->trace_me/ { } fbt:::return /self->trace_me/ { printf("returns %x", arg1); } fbt::segvn_fault:return /self->trace_me/ { self->trace_me = 0; } fbt::x86pte_cas:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->old = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); this->new = (x86pte_t)((uint32_t)arg4 | (arg5 << 32)); printf("entry %x, %llx -> %llx", this->entry, this->old, this->new); } fbt::x86pte_set:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->new = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); printf("entry %x, new %llx", this->entry, this->new); } fbt::x86pte_inval:entry /self->trace_me/ { this->ht = (htable_t *)arg0; this->entry = (uint_t)arg1; this->expect = (x86pte_t)((uint32_t)arg2 | (arg3 << 32)); printf("entry %x, expect %llx", this->entry, this->expect); } fbt::HYPERVISOR_mmu_update:entry /self->trace_me/ { this->req = (mmu_update_t *)arg0; this->count = (int)arg1; this->success_count = (int *)arg2; this->domain_id = (domid_t)arg3; printf("req[0/%d]: ptr %p, val %llx", this->count, this->req[0].ptr, this->req[0].val); } fbt::HYPERVISOR_mmuext_op:entry /self->trace_me/ { this->req2 = (struct mmuext_op *)arg0; this->count = (int)arg1; 
this->success_count = (int *)arg2; this->domain_id = (domid_t)arg3; printf("req[0/%d]: cmd %x, addr %p", this->count, this->req2[0].cmd, this->req2[0].arg1.linear_addr); } fbt::HYPERVISOR_update_va_mapping:entry /self->trace_me/ { this->va = (ulong_t)arg0; this->new_pte = ((uint32_t)arg1 | (arg2 << 32)); this->flags = (ulong_t)arg3; printf("va %p, new_pte %llx, flags %lx", this->va, this->new_pte, this->flags); } */ # dtrace -s pagefault.d -c ./fork The dtrace script produces a list of stack pagefaults and a summary at the end, the stack pagefault summary looks something like this: prog fork, type F_INVAL, addr 0x8047db8 S_READ, pc 0xd2b04a3b, count 1 prog fork, type F_PROT, addr 0x803fd78 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8040d88 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8041d98 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8042da8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8043db8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8044dc8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8045dd8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_PROT, addr 0x8046de8 S_WRITE, pc 0x8050a80, count 1 prog fork, type F_INVAL, addr 0x803fd7c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8040d8c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8041d9c S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8042dac S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8043dbc S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8044dcc S_READ, pc 0x8050a79, count 2 prog fork, type F_INVAL, addr 0x8045ddc S_READ, pc 0x8050a79, count 2 prog fork, type F_PROT, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2 prog fork, type F_INVAL, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2647 The problem is the 2647 pagefaults that we get in the forked child at PC 0xd2b04a3c (which translates to libc.so.1`__forkx+0xc) trying to write 
to the stack at address 0x8047dbc and failing with a page-not-present pagefault (trap 0xe error 0x6).> 0xd2b04a3c::dislibc.so.1`__forkx: popl %ecx libc.so.1`__forkx+1: pushl $0x0 libc.so.1`__forkx+3: pushl %ecx libc.so.1`__forkx+4: movl $0x8e,%eax libc.so.1`__forkx+9: int $0x91 libc.so.1`__forkx+0xb: popl %ecx <<<< 0xd2b04a3b [1] libc.so.1`__forkx+0xc: movl %ecx,0x0(%esp) <<<< 0xd2b04a3c [2] libc.so.1`__forkx+0x10: jb -0x80326 <libc.so.1`__cerror> libc.so.1`__forkx+0x16: testl %edx,%edx libc.so.1`__forkx+0x18: je +0x2 <libc.so.1`__forkx+0x1c> libc.so.1`__forkx+0x1a: xorl %eax,%eax libc.so.1`__forkx+0x1c: ret We get one F_INVAL / S_READ stack page fault at __forkx+b [1], where the popl %ecx tries to read something from the stack. This apparently installs a readonly shared page from the parent''s address space. One of the two F_PROT / S_WRITE stack page faults at __forkx+c [2] does the copy-on-write operation and installs a private writable copy of the stack page. Now I would expect that we can finally write to the stack in the child process. But instead we now get *lots* of F_INVAL / S_WRITE stack page faults at address [2]. That doesn''t seem to be correct. The pagefault handler doesn''t seem to change anything in the mmu for these pagefaults because everything appears to be set up correctly; all that is done is an INVLPG for the fault address on the stack though the hypervisor via a call to HYPERVISOR_mmuext_op(MMUEXT_INVLPG_LOCAL). The process repeats a few thousand times. After a while the problem disappears automagically and the forked child process starts to run. Of cause these high number of pagefaults is really bad for the system''s performance, the kernel uses almost 100% of system cpu time to handle them. Btw. the problem cannot be reproduced when running a 32-bit PV domU on a 64-bit dom0. Or when running both dom0 and domU in 64-bit. This message posted from opensolaris.org
Jürgen Keil
2008-Apr-21 12:52 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
> I'm observing *excessive high* number of pagefaults
> after a process has forked.
...
> Now I would expect that we can finally write to the stack in the
> child process. But instead we now get *lots* of F_INVAL / S_WRITE
> stack page faults at address [2]. That doesn't seem to be
> correct.

I (google) just found this: Joe Bonasera's blog might contain an explanation of what is happening, in the section "Spurious Page Faults": http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious The only thing that does not match is that my 32-bit PV domUs run with vcpus = 1; and my Tecra S1 is a UP machine, so dom0 = domU = 1 vcpu. This message posted from opensolaris.org
John Levon
2008-Apr-21 13:13 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
On Mon, Apr 21, 2008 at 05:52:02AM -0700, Jürgen Keil wrote:

> Joe Bonasera's blog might contain an explanation what is
> happening, in the section "Spurious Page Faults":
>
> http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious

This doesn't affect us any more. This type of writable page table was removed, since it provided no performance benefit. regards john
Jürgen Keil
2008-Apr-24 17:33 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
John Levon wrote:> On Mon, Apr 21, 2008 at 05:52:02AM -0700, Jürgen Keil wrote: > > > Joe Bonasera's blog might contain an explanation what is > > happening, in the section "Spurious Page Faults": > > > > http://blogs.sun.com/JoeBonasera/entry/i_ve_got_spur_ious > > This doesn't affect us any more. This type of writable page table was > removed, since it provided no performance benefit.Ok... Looking at the dtrace output for the pv 32bit copy-on-write test program, I see that x86pte_inval() does an INVLPG through the hypervisor (MMUEXT_INVLPG_LOCAL) when it removes a page mapping. Like this (this removes the read-only cow stack page): 1 -> x86pte_inval 1 | x86pte_inval:entry entry 47, expect 1479b025 1 -> x86pte_access_pagetable 1 -> x86pte_mapin 1 -> pa_to_ma 1 -> pfn_to_mfn 1 <- pfn_to_mfn returns 12473 1 <- pa_to_ma returns 12473000 1 -> xen_map 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 8000000012473001, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- xen_map returns 0 1 <- x86pte_mapin returns cda02238 1 <- x86pte_access_pagetable returns cda02238 1 -> get_pte64 1 <- get_pte64 returns 1479b025 1 -> htable_e2va 1 <- htable_e2va returns 8047000 1 -> hat_tlb_inval 1 -> xen_flush_va 1 -> HYPERVISOR_mmuext_op 1 | HYPERVISOR_mmuext_op:entry req[0/1]: cmd 7, addr 8047000 1 <- HYPERVISOR_mmuext_op returns 0 1 <- xen_flush_va returns 0 1 <- hat_tlb_inval returns 1 1 -> x86pte_release_pagetable 1 -> x86pte_mapout 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 0, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- x86pte_mapout returns cf9df800 1 <- x86pte_release_pagetable returns cf9df800 1 <- x86pte_inval returns 1479b025 Code in uts/i86pc/vm/htable.c function x86pte_inval() is this 2222 /* 2223 * Note that the loop is needed to handle changes due to h/w updating 2224 * of PT_MOD/PT_REF. 
2225 */ 2226 do { 2227 oldpte = GET_PTE(ptep); 2228 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 2229 goto done; 2230 XPV_ALLOW_PAGETABLE_UPDATES(); 2231 found = CAS_PTE(ptep, oldpte, 0); 2232 XPV_DISALLOW_PAGETABLE_UPDATES(); 2233 } while (found != oldpte); 2234 if (oldpte & (PT_REF | PT_MOD)) 2235 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); The invalidated PTE was accessed (return value from get_pte64 had the 0x20 bit set), so line 2235 hat_tlb_inval() is called which invalidates the TLB for that stack page. Ok so far. Why doesn't x86pte_set() use INVLPG when it installs a new PTE entry? The dtrace for my fork test case contains this (this one installes the writable page after we got the COW fault): 1 -> x86pte_set 1 | x86pte_set:entry entry 47, new bc39a007 1 -> htable_e2va 1 <- htable_e2va returns 8047000 1 -> x86pte_access_pagetable 1 -> x86pte_mapin 1 -> pa_to_ma 1 -> pfn_to_mfn 1 <- pfn_to_mfn returns 12473 1 <- pa_to_ma returns 12473000 1 -> xen_map 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 8000000012473001, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- xen_map returns 0 1 <- x86pte_mapin returns cda02238 1 <- x86pte_access_pagetable returns cda02238 1 -> get_pte64 1 <- get_pte64 returns 0 1 -> x86pte_release_pagetable 1 -> x86pte_mapout 1 -> HYPERVISOR_update_va_mapping 1 | HYPERVISOR_update_va_mapping:entry va cda02000, new_pte 0, flags 2 1 <- HYPERVISOR_update_va_mapping returns 0 1 <- x86pte_mapout returns cf9df800 1 <- x86pte_release_pagetable returns cf9df800 1 <- x86pte_set returns 0 The hypervisor is told up invalidate the page that contains the PTE (via HYPERVISOR_update_va_mapping, va cda02000 flags 2), but the CPU / MMU isn't told that the mapping for the virtual stack address 8047000 has changed. Isn't it possible that the CPU / MMU / TLB has cached the information "virtual stack address 8047000 is not valid address", after the call to x86pte_inval() ? 
htable.c x86pte_set() does a TLB flush when the old PTE referred to a referenced page, but it doesn't update the TLB when an empty PTE was replaced with a new translation: 2090 /* 2091 * Do a TLB demap if needed, ie. the old pte was valid. 2092 * 2093 * Note that a stale TLB writeback to the PTE here either can't happen 2094 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2095 * mappings, but they were created with REF and MOD already set, so 2096 * no stale writeback will happen. 2097 * 2098 * Segmap is the only place where remaps happen on the same pfn and for 2099 * that we want to preserve the stale REF/MOD bits. 2100 */ 2101 if (old & PT_REF) 2102 hat_tlb_inval(hat, addr); Btw. I've been experimenting with this change to x86pte_set() (lines 2103 ... 2111 added): 2090 /* 2091 * Do a TLB demap if needed, ie. the old pte was valid. 2092 * 2093 * Note that a stale TLB writeback to the PTE here either can't happen 2094 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2095 * mappings, but they were created with REF and MOD already set, so 2096 * no stale writeback will happen. 2097 * 2098 * Segmap is the only place where remaps happen on the same pfn and for 2099 * that we want to preserve the stale REF/MOD bits. 2100 */ 2101 if (old & PT_REF) 2102 hat_tlb_inval(hat, addr); 2103 #if defined(__i386) && defined(__xpv) 2104 /* jk: ugly hack / experiment with PV spurious page faults */ 2105 else if (old == 0 && addr < 0x8048000 && xpv_page_fault_hack) { 2106 if (xpv_page_fault_hack == 1) 2107 xen_flush_tlb(); 2108 else 2109 xen_flush_va((caddr_t)addr); 2110 } 2111 #endif With xpv_page_fault_hack := 0 I get the original code. With xpv_page_fault_hack := 2 I try to do an INVALPG on the new installed translation. But that hasn't fixed the issue... But with xpv_page_fault_hack := 1 the entire TLB gets flushed when installing new stack pages, and now: 1. the libMicro-0.4.0 fork_100 test runs ~ 30x faster in a 32-bit PV domU !! 
800 seconds -> 28 seconds 2 ./boot/solaris/bin/create_ramdisk runs ~ 4x faster in a 32-bit PV domU ! 2 minutes -> 36 seconds So it seems that there is an issue with the TLB in 32-bit xVM PV doms... This message posted from opensolaris.org _______________________________________________ xen-discuss mailing list xen-discuss@opensolaris.org
Mark Johnson
2008-Apr-24 19:37 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
Joe''s response.. --- -------- Original Message -------- Subject: Re: [Fwd: Re: [xen-discuss] xvm 32-bit: excessive number of pagefaults after fork() / copy-on-wri] Date: Thu, 24 Apr 2008 12:00:41 -0700 From: Joe B... > > > The hypervisor is told up invalidate the page that contains the > PTE (via HYPERVISOR_update_va_mapping, va cda02000 flags 2), > but the CPU / MMU isn''t told that the mapping for the virtual stack address > 8047000 has changed. Isn''t it possible that the CPU / MMU / TLB has > cached the information "virtual stack address 8047000 is not valid > address", > after the call to x86pte_inval() ? Nope. No hardware ever works this way. It only caches information in the TLB when the lowest bit (PRESENT) is set in the PTE. Therefore when setting an entry that is zero to a non-zero value, an INVLPG instruction is never needed on hardware. > > htable.c x86pte_set() does a TLB flush when the old PTE > referred to a referenced page, but it doesn''t update the TLB when > an empty PTE was replaced with a new translation: > > 2090 /* ... > > > > Btw. I''ve been experimenting with this change to x86pte_set() > (lines 2103 ... 2111 added): > > 2090 /* ... > 2111 #endif > > > With xpv_page_fault_hack := 0 I get the original code. > > With xpv_page_fault_hack := 2 I try to do an INVALPG on the > new installed translation. But that hasn''t fixed the issue... > > But with xpv_page_fault_hack := 1 the entire TLB gets flushed > when installing new stack pages, and now: That would confirm a bug in Xen.. both xen_flush_tlb() and xen_flush_va() should have identical behavior here - and shouldn''t be necessary either. > > 1. the libMicro-0.4.0 fork_100 test runs ~ 30x faster in a 32-bit PV > domU !! > 800 seconds -> 28 seconds > 2 ./boot/solaris/bin/create_ramdisk runs ~ 4x faster in a 32-bit PV domU ! > 2 minutes -> 36 seconds > > > So it seems that there is an issue with the TLB in 32-bit xVM PV doms... 
The bug is probably in TLB flushing management in the Xen code itself. I know they've said in the past that they do all kinds of very crafty optimizations to avoid unnecessary invalidates in the hypervisor. I suspect they've got a bug.
Jürgen Keil
2008-May-14 13:28 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
Mark wrote:> Joe''s response..> > So it seems that there is an issue with the TLB in 32-bit xVM PV doms... > > The bug is probably in TLB flushing management in the Xen code itself. > I know they''ve said in the past that they do all kinds of very crafty > optimizations to avoid unnecessary invalidates in the hypervisor. > I suspect they''ve got a bug.The bug is in the Solaris 32-bit PAE xVM xm kernel code. It doesn''t do a full tlb flush when one of the four PDPTR entries changes; instead of a full tlb flush Solaris tries to use INVLPG, but Intel has documented that this doesn''t work... Intel has published an application note about the TLBs and their invalidation: http://www.intel.com/products/processor/manuals/ http://www.intel.com/design/processor/applnots/317080.pdf And in that application note, the following is documented in section 8.1: --------------------------------------------------- The processor does not maintain a PDP cache as described in Section 4. The processor always caches information from the four page-directory-pointer- table entries. These entries are not cached at the time of address translation. Instead, they are always cached as part of the execution of the following instructions: o A MOV to CR3 that occurs with IA32_EFER.LMA = 0 and CR4.PAE = 1. o A MOV to CR4 that results in CR4.PAE = 1, that occurs with IA32_EFER.LMA = 0 and CR0.PG = 1, and that modifies at least one of CR4.PAE, CR4.PGE, and CR4.PSE. o A MOV to CR0 that modifies CR0.PG and that occurs with IA32_EFER.LMA = 0 and CR4.PAE = 1. These instructions fault if they would load a PDPTR that sets any of the bits that must be 0 (see above). These cached entries are not modified by any other operations.14 In particular, executions of INVLPG do not affect these cached entries. --------------------------------------------------- Solaris implements this: 1043 static void 1044 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr) 1045 { ... 
1067 /* 1068 * When a top level VLP page table entry changes, we must issue 1069 * a reload of cr3 on all processors. 1070 * 1071 * If we don''t need do do that, then we still have to INVLPG against 1072 * an address covered by the inner page table, as the latest processors 1073 * have TLB-like caches for non-leaf page table entries. 1074 */ 1075 if (!(hat->hat_flags & HAT_FREEING)) { 1076 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? 1077 DEMAP_ALL_ADDR : old->ht_vaddr); 1078 } and 1087 static void 1088 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr) 1089 { ... 1104 /* 1105 * When any top level VLP page table entry changes, we must issue 1106 * a reload of cr3 on all processors using it. 1107 * We also need to do this for the kernel hat on PAE 32 bit kernel. 1108 */ 1109 if ( 1110 #ifdef __i386 1111 (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || 1112 #endif 1113 (higher->ht_flags & HTABLE_VLP)) 1114 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); That is, when we remove PDPTR entries in unlink_ptp there is a hat_tlb_inval(DEMAP_ALL_ADDR) (flush TLB by CR3 reload), but only when "higher->ht_flags & HTABLE_VLP". Under 32-bit PV xVM, the HTABLE_VLP flag isn''t set, so the code does an hat_tlb_inval(old->ht_vaddr) which results in an INVLPG. According to section 8.1 in Intel''s document, the INVLPG does *not* affect the cached PDPTR entries When PDPTR entries are added in link_ptp, there is a hat_tlb_inval(DEMAP_ALL_ADDR). Again, this is only done when the HTABLE_VLP flag is set; it isn''t under 32-bit PV xVM. Or when (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL); but this is a non-kernel hat mapping, so the hat_tlb_inval() is skipped. 
When I change the kernel so something like this, it seems to work without getting lots of spurious page faults: diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c --- a/usr/src/uts/i86pc/vm/htable.c +++ b/usr/src/uts/i86pc/vm/htable.c @@ -1073,8 +1079,11 @@ unlink_ptp(htable_t *higher, htable_t *o * have TLB-like caches for non-leaf page table entries. */ if (!(hat->hat_flags & HAT_FREEING)) { - hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? - DEMAP_ALL_ADDR : old->ht_vaddr); + hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) +#ifdef __i386 + || (higher->ht_level == VLP_LEVEL) +#endif + ? DEMAP_ALL_ADDR : old->ht_vaddr); } HTABLE_DEC(higher->ht_valid_cnt); @@ -1108,7 +1117,7 @@ link_ptp(htable_t *higher, htable_t *new */ if ( #ifdef __i386 - (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || + (higher->ht_level == VLP_LEVEL) || #endif (higher->ht_flags & HTABLE_VLP)) hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); On metal this isn''t a problem because the HTABLE_VLP flag is set on the L2 htable, so the tlb gets flushed by CR3 reloads. This message posted from opensolaris.org
Jürgen Keil
2008-May-14 13:48 UTC
Re: xvm 32-bit: excessive number of pagefaults after fork() / copy-on-write
> I''m observing *excessive high* number of pagefaults > after a process has forked.Btw: I''m observing a similar / the same problem with mprotect(2): # cat mapp.c #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/mman.h> #define PAGESIZE 4096 int main(int argc, char **argv) { int fd; char *filename; char data_tmp[PAGESIZE]; void *data; char tmp; fd = open(filename = "/tmp/data", O_RDWR|O_TRUNC|O_CREAT, 0666); if (fd < 0) { perror(filename); exit(1); } memset(data_tmp, ''*'', sizeof(data_tmp)); write(fd, data_tmp, sizeof(data_tmp)); data = mmap((void*)0x41002000, PAGESIZE, PROT_READ, MAP_FIXED|MAP_SHARED, fd, 0); if (data == MAP_FAILED) { perror("mmap"); exit(1); } tmp = *(char*)data; mprotect(data, PAGESIZE, PROT_READ|PROT_WRITE); *(char*)data = tmp; close(fd); exit(0); } This program should produce two pagefaults for address 0x41002000; but when I just ran it on a gentoo 32-bit dom0 with xen 3.2.0 pae hypervisor, in a snv_81 domU, the test program was running for 45 seconds and produced almost six million pagefaults for address 41002000. This message posted from opensolaris.org