We started hashing out some AMD support for mem_paging and mem_access.
Right now my VMs boot, page out a bit, and then die on an HVM triple
fault.

Most importantly, I want to get somebody from AMD to comment/help out on
this. It feels like we're inches away from enabling support for this very
nice feature. I'm not sure who exactly on AMD monitors the list for these
kinds of things. It'd be great to have you on board!

For starters, the changes to the p2m code are relatively mild, but it'd be
great if somebody spots a red flag.

Another issue: comments indicate that bits 59-62 in NPT entries are used
for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
would give us enough space to support mem_access. Right now we only have 7
bits available for Xen flags and that is not enough for paging and access.
Is bit 59 effectively reserved?

Finally, the triple fault. Maybe I'm missing something obvious. Comments
welcome.

Patch inline below, thanks!
Andres

Enable AMD support for paging.

Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
Signed-off-by: Adin Scannell <adin@scannell.ca>

diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c
--- a/xen/arch/x86/mm/mem_event.c
+++ b/xen/arch/x86/mm/mem_event.c
@@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x
         if ( !hap_enabled(d) )
             break;
 
-        /* Currently only EPT is supported */
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-
         rc = -EXDEV;
         /* Disallow paging in a PoD guest */
         if ( p2m->pod.entry_count )
diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -53,6 +53,20 @@
 #define P2M_BASE_FLAGS \
         (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
 
+#ifdef __x86_64__
+/* l1e_from_pfn is not designed to have INVALID_MFN stored. The 0xff..ff
+ * value tramples over the higher-order bits used for flags (NX, p2mt,
+ * etc.) This happens for paging entries. Thus we do this clip/unclip
+ * juggle for l1 entries only (no paging superpages!) */
+#define EFF_MFN_WIDTH       (PADDR_BITS-PAGE_SHIFT) /* 40 bits */
+#define clipped_mfn(mfn)    ((mfn) & ((1UL << EFF_MFN_WIDTH) - 1))
+#define unclip_mfn(mfn)     ((clipped_mfn((mfn)) == INVALID_MFN) ? \
+                                INVALID_MFN : (mfn))
+#else
+#define clipped_mfn(mfn)    (mfn)
+#define unclip_mfn(mfn)     (mfn)
+#endif /* __x86_64__ */
+
 static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
@@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p
     case p2m_invalid:
     case p2m_mmio_dm:
     case p2m_populate_on_demand:
+    case p2m_ram_paging_out:
+    case p2m_ram_paged:
+    case p2m_ram_paging_in:
     default:
         return flags;
     case p2m_ram_ro:
@@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m
                                       shift, max)) )
         return 0;
 
-    /* PoD: Not present doesn't imply empty. */
+    /* PoD/paging: Not present doesn't imply empty. */
     if ( !l1e_get_flags(*p2m_entry) )
     {
         struct page_info *pg;
@@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un
                                    0, L1_PAGETABLE_ENTRIES);
         ASSERT(p2m_entry);
 
-        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-            entry_content = l1e_from_pfn(mfn_x(mfn),
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ||
+             (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) )
+            entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)),
                                          p2m_type_to_flags(p2mt, mfn));
         else
             entry_content = l1e_empty();
@@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
         if ( entry_content.l1 != 0 )
         {
             p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
+            old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry));
         }
         /* level 1 entry */
         p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
@@ -615,11 +633,12 @@ pod_retry_l1:
                            sizeof(l1e));
 
     if ( ret == 0 ) {
+        unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e));
         p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-        ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) ||
+                (l1e_mfn == INVALID_MFN && p2m_is_paging(p2mt)) );
 
-        if ( p2m_flags_to_type(l1e_get_flags(l1e))
-             == p2m_populate_on_demand )
+        if ( p2mt == p2m_populate_on_demand )
         {
             /* The read has succeeded, so we know that the mapping
              * exits at this point.  */
@@ -641,7 +660,7 @@ pod_retry_l1:
         }
 
         if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
-            mfn = _mfn(l1e_get_pfn(l1e));
+            mfn = _mfn(l1e_mfn);
         else
             /* XXX see above */
             p2mt = p2m_mmio_dm;
@@ -783,18 +802,26 @@ pod_retry_l2:
 pod_retry_l1:
     if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
     {
+        p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e));
         /* PoD: Try to populate */
-        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        if ( l1t == p2m_populate_on_demand )
         {
             if ( q != p2m_query ) {
                 if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) )
                     goto pod_retry_l1;
             } else
                 *t = p2m_populate_on_demand;
+        } else {
+            if ( p2m_is_paging(l1t) )
+            {
+                *t = l1t;
+                /* No need to unclip due to check below */
+                mfn = _mfn(l1e_get_pfn(*l1e));
+            }
         }
 
         unmap_domain_page(l1e);
-        return _mfn(INVALID_MFN);
+        return (l1t == p2m_ram_paging_out) ? mfn : _mfn(INVALID_MFN);
     }
     mfn = _mfn(l1e_get_pfn(*l1e));
     *t = p2m_flags_to_type(l1e_get_flags(*l1e));
@@ -914,7 +941,7 @@ static void p2m_change_type_global(struc
                     flags = l1e_get_flags(l1e[i1]);
                     if ( p2m_flags_to_type(flags) != ot )
                         continue;
-                    mfn = l1e_get_pfn(l1e[i1]);
+                    mfn = unclip_mfn(l1e_get_pfn(l1e[i1]));
                     gfn = i1 + (i2 + (i3
 #if CONFIG_PAGING_LEVELS >= 4
                                        + (i4 * L3_PAGETABLE_ENTRIES)
@@ -923,7 +950,7 @@ static void p2m_change_type_global(struc
                                        * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
                     /* create a new 1le entry with the new type */
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags);
+                    l1e_content = l1e_from_pfn(clipped_mfn(mfn), flags);
                     p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
                                          l1mfn, l1e_content, 1);
                 }
@@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain
                         entry_count++;
                         continue;
                     }
-                    mfn = l1e_get_pfn(l1e[i1]);
+                    mfn = unclip_mfn(l1e_get_pfn(l1e[i1]));
                     ASSERT(mfn_valid(_mfn(mfn)));
                     m2pfn = get_gpfn_from_mfn(mfn);
                     if ( m2pfn != gfn &&
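To make the clip/unclip comment in the patch concrete, here is a minimal
standalone sketch (PAGE_SHIFT and INVALID_MFN are redefined locally to
mirror the hypervisor's values, so this is not Xen code) of how an
all-ones INVALID_MFN stored in the pfn field tramples the NX and
software-available bits that carry the p2m type:

#include <stdio.h>

#define PAGE_SHIFT   12
#define INVALID_MFN  (~0UL)        /* all ones, as in the hypervisor */

int main(void)
{
    /* An l1e keeps the frame number starting at bit 12; the p2m type
     * lives in the software-available bits (52-58 today) and NX is bit
     * 63.  Shifting the all-ones INVALID_MFN into the pfn field sets
     * every one of those high bits, destroying the encoded type. */
    unsigned long pfn_field = INVALID_MFN << PAGE_SHIFT;

    printf("pfn field               = %#018lx\n", pfn_field);
    printf("bits 52-62 clobbered to   %#lx\n", (pfn_field >> 52) & 0x7ff);
    return 0;
}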
On Tue, Feb 14, Andres Lagar-Cavilla wrote:

> Finally, the triple fault.

"Me too!"
That's how far I got as well in my attempt.

Olaf
> On Tue, Feb 14, Andres Lagar-Cavilla wrote:
>
>> Finally, the triple fault.
>
> "Me too!"
> That's how far I got as well in my attempt.

Hey, awesome you're trying this out so quickly :)

I just realized the unclip_mfn macro in my patch is broken. It should
instead be

+#define unclip_mfn(mfn)     (((mfn) == clipped_mfn(INVALID_MFN)) ? \
+                                INVALID_MFN : (mfn))

If you have a couple cycles... Thanks a lot!
Andres

>
> Olaf
>
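For reference, a tiny standalone check of the broken vs. corrected macro
(PADDR_BITS, PAGE_SHIFT and INVALID_MFN are redefined locally with the
hypervisor's values so this compiles outside the tree; purely
illustrative):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PADDR_BITS     52
#define INVALID_MFN    (~0UL)

#define EFF_MFN_WIDTH  (PADDR_BITS - PAGE_SHIFT)                 /* 40 bits */
#define clipped_mfn(mfn)  ((mfn) & ((1UL << EFF_MFN_WIDTH) - 1))

/* Broken: clipped_mfn() can only produce a 40-bit value, so it never
 * compares equal to the full 64-bit INVALID_MFN (~0UL). */
#define unclip_mfn_broken(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? \
                                INVALID_MFN : (mfn))

/* Fixed: compare against the clipped form of INVALID_MFN instead. */
#define unclip_mfn_fixed(mfn)  (((mfn) == clipped_mfn(INVALID_MFN)) ? \
                                INVALID_MFN : (mfn))

int main(void)
{
    unsigned long stored = clipped_mfn(INVALID_MFN);   /* what the l1e holds */

    printf("stored pfn field: %#lx\n", stored);
    assert(unclip_mfn_broken(stored) != INVALID_MFN);  /* stays clipped */
    assert(unclip_mfn_fixed(stored)  == INVALID_MFN);  /* round-trips */
    return 0;
}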
On Tue, Feb 14, Andres Lagar-Cavilla wrote:

> > On Tue, Feb 14, Andres Lagar-Cavilla wrote:
> >
> >> Finally, the triple fault.
> >
> > "Me too!"
> > That's how far I got as well in my attempt.
>
> Hey, awesome you're trying this out so quickly :)

I meant my attempt a few weeks ago.

> If you have a couple cycles...

Does your pager do anything? I just get this:

xc: detail: xenpaging init
xc: detail: watching '/local/domain/1/memory/target-tot_pages'
xc: detail: max_pages = 263168
xc: detail: starting /usr/lib/xen/bin/xenpaging for domain_id 1 with pagefile /dev/shm/pagefile-sles11sp2_full_xenpaging
xc: detail: confirming startup in /local/domain/1/xenpaging/state
xc: detail: Got event from xenstore
xc: detail: path '@releaseDomain' token '1'
xc: detail: Got event from evtchn
xc: detail: Got event from xenstore
xc: detail: path '/local/domain/1/memory/target-tot_pages' token ''
xc: detail: new target_tot_pages 59904
xc: detail: Got event from Xen
xc: detail: Need to evict 202213 pages to reach 59904 target_tot_pages
xc: error: Error mapping page 20201 (22 = Invalid argument): Internal error
xc: debug: hypercall buffer: total allocations:9 total releases:9
xc: debug: hypercall buffer: current allocations:0 maximum allocations:2
xc: debug: hypercall buffer: cache current size:2
xc: debug: hypercall buffer: cache hits:7 misses:2 toobig:0

Olaf
> On Tue, Feb 14, Andres Lagar-Cavilla wrote:
>
>> > On Tue, Feb 14, Andres Lagar-Cavilla wrote:
>> >
>> >> Finally, the triple fault.
>> >
>> > "Me too!"
>> > That's how far I got as well in my attempt.
>>
>> Hey, awesome you're trying this out so quickly :)
>
> I meant my attempt a few weeks ago.
>
>> If you have a couple cycles...
>
> Does your pager do anything? I just get this:
>
> xc: detail: xenpaging init
> xc: detail: watching '/local/domain/1/memory/target-tot_pages'
> xc: detail: max_pages = 263168
> xc: detail: starting /usr/lib/xen/bin/xenpaging for domain_id 1 with pagefile /dev/shm/pagefile-sles11sp2_full_xenpaging
> xc: detail: confirming startup in /local/domain/1/xenpaging/state
> xc: detail: Got event from xenstore
> xc: detail: path '@releaseDomain' token '1'
> xc: detail: Got event from evtchn
> xc: detail: Got event from xenstore
> xc: detail: path '/local/domain/1/memory/target-tot_pages' token ''
> xc: detail: new target_tot_pages 59904
> xc: detail: Got event from Xen
> xc: detail: Need to evict 202213 pages to reach 59904 target_tot_pages
> xc: error: Error mapping page 20201 (22 = Invalid argument): Internal error

Those are the kinds of errors I was getting until I finished my patch.
Basically, the p2m gets confused and returns p2m_mmio_dm as the type for
anything it doesn't understand. If you end up giving my patch a spin, let
me know if it gets you past that hump.

Andres

> xc: debug: hypercall buffer: total allocations:9 total releases:9
> xc: debug: hypercall buffer: current allocations:0 maximum allocations:2
> xc: debug: hypercall buffer: cache current size:2
> xc: debug: hypercall buffer: cache hits:7 misses:2 toobig:0
>
>
> Olaf
>
On Tue, Feb 14, Andres Lagar-Cavilla wrote:

> Those are the kinds of errors I was getting until I finished my patch.
> Basically, the p2m gets confused and returns p2m_mmio_dm as the type for
> anything it doesn't understand. If you end up giving my patch a spin, let
> me know if it gets you past that hump.

No, it does not work for me, still EINVAL during mapping.
Perhaps the cpu is not new enough?

processor       : 11
vendor_id       : AuthenticAMD
cpu family      : 16
model           : 8
model name      : AMD Engineering Sample
stepping        : 0
cpu MHz         : 2400.204
cache size      : 512 KB

Olaf
On Tue, Feb 14, Olaf Hering wrote:

> On Tue, Feb 14, Andres Lagar-Cavilla wrote:
>
> > Those are the kinds of errors I was getting until I finished my patch.
> > Basically, the p2m gets confused and returns p2m_mmio_dm as the type for
> > anything it doesn't understand. If you end up giving my patch a spin, let
> > me know if it gets you past that hump.
>
> No, it does not work for me, still EINVAL during mapping.

And this also happens with a recent system:

processor       : 2
vendor_id       : AuthenticAMD
cpu family      : 16
model           : 5
model name      : AMD Phenom(tm) II N830 Triple-Core Processor
stepping        : 3
cpu MHz         : 2094.846

Olaf
On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote:
> We started hashing out some AMD support for mem_paging and mem_access.
> Right now my VMs boot, page out a bit, and then die on an HVM triple
> fault.
>
> Most importantly, I want to get somebody from AMD to comment/help out on
> this. It feels like we're inches away from enabling support for this very
> nice feature. I'm not sure who exactly on AMD monitors the list for these
> kinds of things. It'd be great to have you on board!
>
> For starters, the changes to the p2m code are relatively mild, but it'd be
> great if somebody spots a red flag.
>
> Another issue: comments indicate that bits 59-62 in NPT entries are used
> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
> would give us enough space to support mem_access. Right now we only have 7
> bits available for Xen flags and that is not enough for paging and access.
> Is bit 59 effectively reserved?

Hi
bit 59 is used by iommu hardware for ATS response. In most cases, for
p2m_ram_rw pages, U bit must be 0. But maybe for other page types that
are not potentially used by DMA, this bit could be non-zero. I could
test it on my iommu machines if you had some patches that use U bits.

Thanks,
Wei

> Finally, the triple fault. Maybe I'm missing something obvious. Comments
> welcome.
>
> Patch inline below, thanks!
> Andres
> [...]
> On Tue, Feb 14, Olaf Hering wrote:
>
>> On Tue, Feb 14, Andres Lagar-Cavilla wrote:
>>
>> > Those are the kinds of errors I was getting until I finished my patch.
>> > Basically, the p2m gets confused and returns p2m_mmio_dm as the type for
>> > anything it doesn't understand. If you end up giving my patch a spin, let
>> > me know if it gets you past that hump.
>>
>> No, it does not work for me, still EINVAL during mapping.
>
> And this also happens with a recent system:

Ok, most likely the p2m is not returning the proper values for entries
that are p2m_ram_paging_out, although eviction after nominate does work.
I'll look into it. Thanks Olaf for the report.

Andres

>
> processor       : 2
> vendor_id       : AuthenticAMD
> cpu family      : 16
> model           : 5
> model name      : AMD Phenom(tm) II N830 Triple-Core Processor
> stepping        : 3
> cpu MHz         : 2094.846
>
>
> Olaf
>
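As a side note on the failure mode just described, the intended lookup
semantics can be sketched with simplified stand-in types (purely
illustrative, not the actual p2m code): only entries that still have a
backing frame should be mappable, and only genuinely unknown entries
should fall through to p2m_mmio_dm, which is what the tools then report
as EINVAL.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the p2m types discussed in this thread. */
typedef enum {
    ram_rw,          /* ordinary RAM: foreign mapping succeeds           */
    ram_paging_out,  /* nominated but still resident: mapping should work */
    ram_paged,       /* evicted: no frame, must be paged back in first   */
    mmio_dm,         /* emulated MMIO: xc mapping fails with EINVAL      */
} ptype;

static bool has_backing_frame(ptype t)
{
    return t == ram_rw || t == ram_paging_out;
}

int main(void)
{
    printf("paging_out mappable: %d\n", has_backing_frame(ram_paging_out));
    printf("paged      mappable: %d\n", has_backing_frame(ram_paged));
    printf("mmio_dm    mappable: %d\n", has_backing_frame(mmio_dm));
    return 0;
}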
> On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote:
>> We started hashing out some AMD support for mem_paging and mem_access.
>> Right now my VMs boot, page out a bit, and then die on an HVM triple
>> fault.
>>
>> Most importantly, I want to get somebody from AMD to comment/help out on
>> this. It feels like we're inches away from enabling support for this very
>> nice feature. I'm not sure who exactly on AMD monitors the list for these
>> kinds of things. It'd be great to have you on board!
>>
>> For starters, the changes to the p2m code are relatively mild, but it'd be
>> great if somebody spots a red flag.
>>
>> Another issue: comments indicate that bits 59-62 in NPT entries are used
>> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
>> would give us enough space to support mem_access. Right now we only have 7
>> bits available for Xen flags and that is not enough for paging and access.
>> Is bit 59 effectively reserved?
>
> Hi
> bit 59 is used by iommu hardware for ATS response. In most cases, for
> p2m_ram_rw pages, U bit must be 0. But maybe for other page types that
> are not potentially used by DMA, this bit could be non-zero. I could
> test it on my iommu machines if you had some patches that use U bits.

Hi Wei, thanks for pitching in! I assume you're talking about table 14
(and fig 9) in http://support.amd.com/us/Processor_TechDocs/48882.pdf

I don't think this will work. The p2m access value is arbitrary, and
independent of the p2m type. So bit 59 is out of reach and we're stuck
with 7 bits for Xen use. Thanks for the clarification.

An alternative to enabling mem_access on AMD processors will be to limit
the access types to 3 bits. This would force disabling support for two
modes. I'm inclined to disable two out of X, W and WX. I don't think they
make sense without R permissions.

Thanks!
Andres

> Thanks,
> Wei
>
>> [...]
On 02/15/2012 04:14 PM, Andres Lagar-Cavilla wrote:
>> On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote:
>>> We started hashing out some AMD support for mem_paging and mem_access.
>>> Right now my VMs boot, page out a bit, and then die on an HVM triple
>>> fault.
>>>
>>> Most importantly, I want to get somebody from AMD to comment/help out on
>>> this. It feels like we're inches away from enabling support for this very
>>> nice feature. I'm not sure who exactly on AMD monitors the list for these
>>> kinds of things. It'd be great to have you on board!
>>>
>>> For starters, the changes to the p2m code are relatively mild, but it'd be
>>> great if somebody spots a red flag.
>>>
>>> Another issue: comments indicate that bits 59-62 in NPT entries are used
>>> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
>>> would give us enough space to support mem_access. Right now we only have 7
>>> bits available for Xen flags and that is not enough for paging and access.
>>> Is bit 59 effectively reserved?
>>
>> Hi
>> bit 59 is used by iommu hardware for ATS response. In most cases, for
>> p2m_ram_rw pages, U bit must be 0. But maybe for other page types that
>> are not potentially used by DMA, this bit could be non-zero. I could
>> test it on my iommu machines if you had some patches that use U bits.
>
> Hi Wei, thanks for pitching in! I assume you're talking about table 14
> (and fig 9) in http://support.amd.com/us/Processor_TechDocs/48882.pdf

Yes, indeed.

> I don't think this will work. The p2m access value is arbitrary, and
> independent of the p2m type. So bit 59 is out of reach and we're stuck
> with 7 bits for Xen use. Thanks for the clarification.

Where will the p2m access bit be stored? Please note that bits 52 - 58 of
the pte must be zero for p2m_ram_rw pages. For an iommu pte, only bit 63
and bits 1 - 8 are free to use if the PR bit = 1.

Thanks,
Wei

> An alternative to enabling mem_access on AMD processors will be to limit
> the access types to 3 bits. This would force disabling support for two
> modes. I'm inclined to disable two out of X, W and WX. I don't think they
> make sense without R permissions.
> Thanks!
> Andres
>
>> [...]
> On 02/15/2012 04:14 PM, Andres Lagar-Cavilla wrote:
>>> On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote:
>>>> We started hashing out some AMD support for mem_paging and mem_access.
>>>> Right now my VMs boot, page out a bit, and then die on an HVM triple
>>>> fault.
>>>>
>>>> Most importantly, I want to get somebody from AMD to comment/help out on
>>>> this. It feels like we're inches away from enabling support for this very
>>>> nice feature. I'm not sure who exactly on AMD monitors the list for these
>>>> kinds of things. It'd be great to have you on board!
>>>>
>>>> For starters, the changes to the p2m code are relatively mild, but it'd be
>>>> great if somebody spots a red flag.
>>>>
>>>> Another issue: comments indicate that bits 59-62 in NPT entries are used
>>>> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
>>>> would give us enough space to support mem_access. Right now we only have 7
>>>> bits available for Xen flags and that is not enough for paging and access.
>>>> Is bit 59 effectively reserved?
>>>
>>> Hi
>>> bit 59 is used by iommu hardware for ATS response. In most cases, for
>>> p2m_ram_rw pages, U bit must be 0. But maybe for other page types that
>>> are not potentially used by DMA, this bit could be non-zero. I could
>>> test it on my iommu machines if you had some patches that use U bits.
>>
>> Hi Wei, thanks for pitching in! I assume you're talking about table 14
>> (and fig 9) in http://support.amd.com/us/Processor_TechDocs/48882.pdf
>
> Yes, indeed.
>
>> I don't think this will work. The p2m access value is arbitrary, and
>> independent of the p2m type. So bit 59 is out of reach and we're stuck
>> with 7 bits for Xen use. Thanks for the clarification.
>
> Where will the p2m access bit be stored? Please note that bits 52 - 58 of
> the pte must be zero for p2m_ram_rw pages. For an iommu pte, only bit 63
> and bits 1 - 8 are free to use if the PR bit = 1.

Wei,

Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the
current convention?

I propose limiting p2m type to bits 52-55, and storing p2m access on bits
56-58. So, p2m_ram_rw pages with a non-zero access would break the "must
be zero" requirement/convention.

Right now, without any considerations for paging, we're storing the p2m
type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro,
p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the
course. Given your statement above, and table 14 in the IOMMU manual, how
is this even working? Or is it not working?

Would a violation of these rules cause a VMEXIT_SHUTDOWN?

Thanks a lot for the info!
Andres

> Thanks,
> Wei
>
>> [...]
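To make the bit budget above concrete, one possible packing along the
proposed lines is sketched below. The macro names and the exact 4/3 split
are made up for illustration only, and whether bits 52-58 may be non-zero
at all for DMA-able pages is exactly the open question in this thread:

/* Hypothetical layout of the Xen-owned bits in an NPT l1e:
 *   bits 52-55: p2m type   (4 bits -> 16 types)
 *   bits 56-58: p2m access (3 bits -> 8 access values)
 *   bits 59-62: left alone for IOMMU use (bit 59 being the U bit above)
 * None of these macros exist in the tree; they only illustrate the split. */
#define P2M_TYPE_SHIFT     52
#define P2M_TYPE_MASK      0xfUL
#define P2M_ACCESS_SHIFT   56
#define P2M_ACCESS_MASK    0x7UL

#define p2m_pack_bits(t, a)                                          \
    ((((unsigned long)(t) & P2M_TYPE_MASK) << P2M_TYPE_SHIFT) |      \
     (((unsigned long)(a) & P2M_ACCESS_MASK) << P2M_ACCESS_SHIFT))

#define p2m_unpack_type(e)     (((e) >> P2M_TYPE_SHIFT) & P2M_TYPE_MASK)
#define p2m_unpack_access(e)   (((e) >> P2M_ACCESS_SHIFT) & P2M_ACCESS_MASK)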
At 08:06 -0800 on 15 Feb (1329293205), Andres Lagar-Cavilla wrote:
> > Where will the p2m access bit be stored? Please note that bits 52 - 58
> > of the pte must be zero for p2m_ram_rw pages. For an iommu pte, only
> > bit 63 and bits 1 - 8 are free to use if the PR bit = 1.
>
> Wei,
>
> Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the
> current convention?
>
> I propose limiting p2m type to bits 52-55, and storing p2m access on bits
> 56-58. So, p2m_ram_rw pages with a non-zero access would break the "must
> be zero" requirement/convention.

Eh, let's step back a bit from this. The problem is that the IOMMU
can't handle these bits being set, but the IOMMU can't handle paged-out
or CoW memory either (because there's no way to cause the peripheral to
retry a DMA that faulted).

So can we just say that any VM that's going to use paging, sharing
or access can't have a unified p2m and IOMMU table? That's a bit ugly
since the admin will have to make the choice, but it seems like the
hardware is forcing us into it.

Maybe the sensible choice is to default to _not_ sharing p2m and IOMMU
tables, and have that be something an expert user can turn on (and which
disables the other features)?

Should we have a hypervisor-level interlock between PCI passthrough and
paging/sharing/access anyway? Doing both is unlikely to lead to
happiness.

Tim.

> Right now, without any considerations for paging, we're storing the p2m
> type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro,
> p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the
> course. Given your statement above, and table 14 in the IOMMU manual, how
> is this even working? Or is it not working?
>
> Would a violation of these rules cause a VMEXIT_SHUTDOWN?
>
> Thanks a lot for the info!
> Andres
>
> > [...]
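A minimal sketch of the kind of interlock suggested above, assuming the
existing need_iommu(d) flag is a good enough proxy for "this domain has
passthrough devices" (illustrative only, placed alongside the other
checks in the mem_event paging-enable path; not a committed change):

    /* Sketch: refuse to enable paging for a domain that relies on the
     * IOMMU.  A peripheral cannot retry a faulted DMA, so paged-out (or
     * shared/CoW) gfns and device passthrough do not mix, and a shared
     * p2m/IOMMU table cannot carry the paging types at all. */
    rc = -EXDEV;
    if ( need_iommu(d) )
        break;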
> At 08:06 -0800 on 15 Feb (1329293205), Andres Lagar-Cavilla wrote: >> > Where will p2m access bit be stored? Please note that bit 52 - bit 58 >> > for pte must be zero for p2m_ram_rw pages. For iommu pte, only bit 63 >> > and bit 1 - bit 8 are free to use if PR bit = 1. >> >> Wei, >> >> Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the >> current convention? >> >> I propose limiting p2m type to bits 52-55, and storing p2m access on >> bits >> 56-58. So, p2m_ram_rw pages with a non-zero access would break the "must >> be zero" requirement/convention. > > Eh, let''s step back a bit from this. The problem is that the IOMMU > can''t handle these bits being set, but the IOMMU can''t handle paged-out > or CoW memory either (because there''s no way to cause the peripheral to > retry a DMA that faulted). > > So can we just say that any VM that''s going to use paging, sharing > or access can''t have a unified p2m and IOMMU table? That''s a bit ugly > since the admin will have to make the choice, but it seems like the > hardware is forcing us into it. > > Maybe the sensible choice is to default to _not_ sharing p2m and IOMMU > tables, and have thet be something an expert user can turn on (and which > disables the other features)? > > Should we have a hypervisor-level interlock between PCI passthrough and > paging/sharing/access anyway? Doing both is unlikely to lead to > happiness.That''s basically my thinking, these are mutually exclusive conditions. And I''m curious as to whether that is causing the triple fault that''s killing my domains. You seem to imply that p2m+IOMMU unification is turned on by default -- that would crash PoD domains on AMD, wouldn''t it? Andres> > Tim. > >> Right now, without any considerations for paging, we''re storing the p2m >> type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro, >> p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the >> course. Given your statement above, and table 14 in the IOMMU manual, >> how >> is this even working? Or is it not working? >> >> Would a violation of these rules cause a VMEXIT_SHUTDOWN? >> >> Thanks a lot for the info! >> Andres >> > >> > Thanks, >> > Wei >> > >> >> An alternative to enabling mem_access on AMD processors will be to >> limit >> >> the access types to 3 bits. This would force disabling support for >> two >> >> modes. I''m inclined to disable two out of X, W and WX. I don''t think >> >> they >> >> make sense without R permissions. >> >> Thanks! >> >> Andres >> >> >> >>> >> >>> Thanks, >> >>> Wei >> >>> >> >>>> >> >>>> Finally, the triple fault. Maybe I''m missing something obvious. >> >>>> Comments >> >>>> welcome. >> >>>> >> >>>> Patch inline below, thanks! >> >>>> Andres >> >>>> >> >>>> Enable AMD support for paging. 
>> >>>> >> >>>> Signed-off-by: Andres Lagar-Cavilla<andres@lagarcavilla.org> >> >>>> Signed-off-by: Adin Scannell<adin@scannell.ca> >> >>>> >> >>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c >> >>>> --- a/xen/arch/x86/mm/mem_event.c >> >>>> +++ b/xen/arch/x86/mm/mem_event.c >> >>>> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x >> >>>> if ( !hap_enabled(d) ) >> >>>> break; >> >>>> >> >>>> - /* Currently only EPT is supported */ >> >>>> - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) >> >>>> - break; >> >>>> - >> >>>> rc = -EXDEV; >> >>>> /* Disallow paging in a PoD guest */ >> >>>> if ( p2m->pod.entry_count ) >> >>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c >> >>>> --- a/xen/arch/x86/mm/p2m-pt.c >> >>>> +++ b/xen/arch/x86/mm/p2m-pt.c >> >>>> @@ -53,6 +53,20 @@ >> >>>> #define P2M_BASE_FLAGS \ >> >>>> (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | >> _PAGE_ACCESSED) >> >>>> >> >>>> +#ifdef __x86_64__ >> >>>> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The >> >>>> 0xff..ff >> >>>> + * value tramples over the higher-order bits used for flags (NX, >> >>>> p2mt, >> >>>> + * etc.) This happens for paging entries. Thus we do this >> clip/unclip >> >>>> + * juggle for l1 entries only (no paging superpages!) */ >> >>>> +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ >> >>>> +#define clipped_mfn(mfn) ((mfn)& ((1UL<< EFF_MFN_WIDTH) - >> 1)) >> >>>> +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? >> \ >> >>>> + INVALID_MFN : (mfn)) >> >>>> +#else >> >>>> +#define clipped_mfn(mfn) (mfn) >> >>>> +#define unclip_mfn(mfn) (mfn) >> >>>> +#endif /* __x86_64__ */ >> >>>> + >> >>>> static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) >> >>>> { >> >>>> unsigned long flags; >> >>>> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p >> >>>> case p2m_invalid: >> >>>> case p2m_mmio_dm: >> >>>> case p2m_populate_on_demand: >> >>>> + case p2m_ram_paging_out: >> >>>> + case p2m_ram_paged: >> >>>> + case p2m_ram_paging_in: >> >>>> default: >> >>>> return flags; >> >>>> case p2m_ram_ro: >> >>>> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m >> >>>> shift, max)) ) >> >>>> return 0; >> >>>> >> >>>> - /* PoD: Not present doesn''t imply empty. */ >> >>>> + /* PoD/paging: Not present doesn''t imply empty. 
*/ >> >>>> if ( !l1e_get_flags(*p2m_entry) ) >> >>>> { >> >>>> struct page_info *pg; >> >>>> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un >> >>>> 0, L1_PAGETABLE_ENTRIES); >> >>>> ASSERT(p2m_entry); >> >>>> >> >>>> - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) >> >>>> - entry_content = l1e_from_pfn(mfn_x(mfn), >> >>>> + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || >> >>>> + (p2mt == p2m_ram_paged) || (p2mt =>> p2m_ram_paging_in) ) >> >>>> + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), >> >>>> p2m_type_to_flags(p2mt, >> >>>> mfn)); >> >>>> else >> >>>> entry_content = l1e_empty(); >> >>>> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un >> >>>> if ( entry_content.l1 != 0 ) >> >>>> { >> >>>> p2m_add_iommu_flags(&entry_content, 0, >> >>>> iommu_pte_flags); >> >>>> - old_mfn = l1e_get_pfn(*p2m_entry); >> >>>> + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); >> >>>> } >> >>>> /* level 1 entry */ >> >>>> p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, >> >>>> entry_content, 1); >> >>>> @@ -615,11 +633,12 @@ pod_retry_l1: >> >>>> sizeof(l1e)); >> >>>> >> >>>> if ( ret == 0 ) { >> >>>> + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); >> >>>> p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); >> >>>> - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || >> !p2m_is_ram(p2mt)); >> >>>> + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || >> >>>> + (l1e_mfn == INVALID_MFN&& p2m_is_paging(p2mt)) >> ); >> >>>> >> >>>> - if ( p2m_flags_to_type(l1e_get_flags(l1e)) >> >>>> - == p2m_populate_on_demand ) >> >>>> + if ( p2mt == p2m_populate_on_demand ) >> >>>> { >> >>>> /* The read has succeeded, so we know that the >> mapping >> >>>> * exits at this point. */ >> >>>> @@ -641,7 +660,7 @@ pod_retry_l1: >> >>>> } >> >>>> >> >>>> if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) >> >>>> - mfn = _mfn(l1e_get_pfn(l1e)); >> >>>> + mfn = _mfn(l1e_mfn); >> >>>> else >> >>>> /* XXX see above */ >> >>>> p2mt = p2m_mmio_dm; >> >>>> @@ -783,18 +802,26 @@ pod_retry_l2: >> >>>> pod_retry_l1: >> >>>> if ( (l1e_get_flags(*l1e)& _PAGE_PRESENT) == 0 ) >> >>>> { >> >>>> + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); >> >>>> /* PoD: Try to populate */ >> >>>> - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) =>> >>>> p2m_populate_on_demand ) >> >>>> + if ( l1t == p2m_populate_on_demand ) >> >>>> { >> >>>> if ( q != p2m_query ) { >> >>>> if ( !p2m_pod_demand_populate(p2m, gfn, >> >>>> PAGE_ORDER_4K, >> >>>> q) ) >> >>>> goto pod_retry_l1; >> >>>> } else >> >>>> *t = p2m_populate_on_demand; >> >>>> + } else { >> >>>> + if ( p2m_is_paging(l1t) ) >> >>>> + { >> >>>> + *t = l1t; >> >>>> + /* No need to unclip due to check below */ >> >>>> + mfn = _mfn(l1e_get_pfn(*l1e)); >> >>>> + } >> >>>> } >> >>>> >> >>>> unmap_domain_page(l1e); >> >>>> - return _mfn(INVALID_MFN); >> >>>> + return (l1t == p2m_ram_paging_out) ? 
mfn : >> _mfn(INVALID_MFN); >> >>>> } >> >>>> mfn = _mfn(l1e_get_pfn(*l1e)); >> >>>> *t = p2m_flags_to_type(l1e_get_flags(*l1e)); >> >>>> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc >> >>>> flags = l1e_get_flags(l1e[i1]); >> >>>> if ( p2m_flags_to_type(flags) != ot ) >> >>>> continue; >> >>>> - mfn = l1e_get_pfn(l1e[i1]); >> >>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >> >>>> gfn = i1 + (i2 + (i3 >> >>>> #if CONFIG_PAGING_LEVELS>= 4 >> >>>> + (i4 * L3_PAGETABLE_ENTRIES) >> >>>> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc >> >>>> * L2_PAGETABLE_ENTRIES) * >> >>>> L1_PAGETABLE_ENTRIES; >> >>>> /* create a new 1le entry with the new type >> */ >> >>>> flags = p2m_type_to_flags(nt, _mfn(mfn)); >> >>>> - l1e_content = l1e_from_pfn(mfn, flags); >> >>>> + l1e_content = l1e_from_pfn(clipped_mfn(mfn), >> >>>> flags); >> >>>> p2m->write_p2m_entry(p2m, gfn,&l1e[i1], >> >>>> l1mfn, l1e_content, 1); >> >>>> } >> >>>> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >> >>>> entry_count++; >> >>>> continue; >> >>>> } >> >>>> - mfn = l1e_get_pfn(l1e[i1]); >> >>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >> >>>> ASSERT(mfn_valid(_mfn(mfn))); >> >>>> m2pfn = get_gpfn_from_mfn(mfn); >> >>>> if ( m2pfn != gfn&& >> >>>> >> >>>> >> >>>> _______________________________________________ >> >>>> Xen-devel mailing list >> >>>> Xen-devel@lists.xensource.com >> >>>> http://lists.xensource.com/xen-devel >> >>>> >> >>> >> >>> >> >>> >> >> >> >> >> >> >> > >> > >> > >> >> >
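As a purely illustrative aside, the split Andres proposes in the message
above (p2m type in bits 52-55, p2m access in bits 56-58 of the NPT entry)
could be expressed as follows. The macro names are invented for this
sketch and are not existing Xen symbols:

/* Illustration only: carve the 7 software bits into a 4-bit type field
 * (bits 52-55) and a 3-bit access field (bits 56-58). */
#define NPT_SW_SHIFT         52
#define NPT_TYPE_BITS        4                        /* bits 52-55 */
#define NPT_ACCESS_BITS      3                        /* bits 56-58 */
#define NPT_TYPE_MASK        ((1UL << NPT_TYPE_BITS) - 1)
#define NPT_ACCESS_MASK      ((1UL << NPT_ACCESS_BITS) - 1)

#define npt_pack_sw(t, a)                                          \
    ((((unsigned long)((t) & NPT_TYPE_MASK)) |                     \
      (((unsigned long)((a) & NPT_ACCESS_MASK)) << NPT_TYPE_BITS)) \
     << NPT_SW_SHIFT)
#define npt_unpack_type(e)    (((e) >> NPT_SW_SHIFT) & NPT_TYPE_MASK)
#define npt_unpack_access(e)  (((e) >> (NPT_SW_SHIFT + NPT_TYPE_BITS)) & \
                               NPT_ACCESS_MASK)

Four type bits cap the p2m type at 16 values and three access bits at 8,
which is why the thread also discusses dropping some of the X/W/WX access
modes.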
At 09:09 -0800 on 15 Feb (1329296951), Andres Lagar-Cavilla wrote:
> That's basically my thinking, these are mutually exclusive conditions.
> And I'm curious as to whether that is causing the triple fault that's
> killing my domains.

I doubt it - at least not directly. Have you passed through a PCI[e] card
to them?

> You seem to imply that p2m+IOMMU unification is turned on by default --
> that would crash PoD domains on AMD, wouldn't it?

Only PoD + passthrough, if you happened to DMA to a non-populated address.
But the same is true on Intel (though not all Intel h/w has compatible EPT
and VT-D tables).

Cheers,

Tim.
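A minimal sketch of the hypervisor-level interlock Tim asks about:
refuse paging/sharing/access for a domain that has passthrough devices.
The helper name is invented; need_iommu() is assumed to be the existing
predicate for "this domain has devices assigned":

/* Sketch only, not a tested patch: callers in mem_event_domctl()
 * could bail out with -EXDEV when this returns true. */
static int mem_event_conflicts_with_passthrough(struct domain *d)
{
    /* A device cannot retry a DMA that hit a paged-out or CoW gfn,
     * so paging/sharing/access and passthrough should not mix. */
    return need_iommu(d);
}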
> -----Original Message----- > From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Andres Lagar-Cavilla > Sent: Wednesday, February 15, 2012 3:05 AM > To: xen-devel@lists.xensource.com > Cc: olaf@aepfle.de; keir.xen@gmail.com; tim@xen.org; JBeulich@suse.com; adin@gridcentric.ca > Subject: [Xen-devel] RFC: AMD support for paging > > We started hashing out some AMD support for mem_paging and mem_access. > Right now my VMs boot, page out a bit, and then die on an HVM triple > fault. > > Most importantly, I want to get somebody from AMD to comment/help out on > this. It feels like we''re inches away from enabling support for this very > nice feature. I''m not sure who exactly on AMD monitors the list for these > kinds of things. It''d be great to have you on board! > > For starters, the changes to the p2m code are relatively mild, but it''d be > great if somebody spots a red flag. > > Another issue: comments indicate that bits 59-62 in NPT entries are used > for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59) > would give us enough space to support mem_access. Right now we only have 7 > bits available for Xen flags and that is not enough for paging and access. > Is bit 59 effectively reserved? > > Finally, the triple fault. Maybe I''m missing something obvious. Comments > welcome. > > Patch inline below, thanks! > Andres > > Enable AMD support for paging. > > Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> > Signed-off-by: Adin Scannell <adin@scannell.ca> > > diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c > --- a/xen/arch/x86/mm/mem_event.c > +++ b/xen/arch/x86/mm/mem_event.c > @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x > if ( !hap_enabled(d) ) > break; > > - /* Currently only EPT is supported */ > - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) > - break; > - > rc = -EXDEV; > /* Disallow paging in a PoD guest */ > if ( p2m->pod.entry_count ) > diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c > --- a/xen/arch/x86/mm/p2m-pt.c > +++ b/xen/arch/x86/mm/p2m-pt.c > @@ -53,6 +53,20 @@ > #define P2M_BASE_FLAGS \ > (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) > > +#ifdef __x86_64__ > +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The 0xff..ff > + * value tramples over the higher-order bits used for flags (NX, p2mt, > + * etc.) This happens for paging entries. Thus we do this clip/unclip > + * juggle for l1 entries only (no paging superpages!) */ > +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ > +#define clipped_mfn(mfn) ((mfn) & ((1UL << EFF_MFN_WIDTH) - 1)) > +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? \ > + INVALID_MFN : (mfn)) > +#else > +#define clipped_mfn(mfn) (mfn) > +#define unclip_mfn(mfn) (mfn) > +#endif /* __x86_64__ */ > + > static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) > { > unsigned long flags; > @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p > case p2m_invalid: > case p2m_mmio_dm: > case p2m_populate_on_demand: > + case p2m_ram_paging_out: > + case p2m_ram_paged: > + case p2m_ram_paging_in: > default: > return flags; > case p2m_ram_ro: > @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m > shift, max)) ) > return 0; > > - /* PoD: Not present doesn''t imply empty. */ > + /* PoD/paging: Not present doesn''t imply empty. 
*/ > if ( !l1e_get_flags(*p2m_entry) ) > { > struct page_info *pg; > @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un > 0, L1_PAGETABLE_ENTRIES); > ASSERT(p2m_entry); > > - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) > - entry_content = l1e_from_pfn(mfn_x(mfn), > + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || > + (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) ) > + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), > p2m_type_to_flags(p2mt, mfn)); > else > entry_content = l1e_empty(); > @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un > if ( entry_content.l1 != 0 ) > { > p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags); > - old_mfn = l1e_get_pfn(*p2m_entry); > + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); > } > /* level 1 entry */ > p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, > entry_content, 1); > @@ -615,11 +633,12 @@ pod_retry_l1: > sizeof(l1e)); > > if ( ret == 0 ) { > + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); > p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); > - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); > + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || > + (l1e_mfn == INVALID_MFN && p2m_is_paging(p2mt)) ); > > - if ( p2m_flags_to_type(l1e_get_flags(l1e)) > - == p2m_populate_on_demand ) > + if ( p2mt == p2m_populate_on_demand ) > { > /* The read has succeeded, so we know that the mapping > * exits at this point. */ > @@ -641,7 +660,7 @@ pod_retry_l1: > } > > if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) > - mfn = _mfn(l1e_get_pfn(l1e)); > + mfn = _mfn(l1e_mfn); > else > /* XXX see above */ > p2mt = p2m_mmio_dm; > @@ -783,18 +802,26 @@ pod_retry_l2: > pod_retry_l1: > if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) > { > + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); > /* PoD: Try to populate */ > - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) => p2m_populate_on_demand ) > + if ( l1t == p2m_populate_on_demand ) > { > if ( q != p2m_query ) { > if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) ) > goto pod_retry_l1; > } else > *t = p2m_populate_on_demand; > + } else { > + if ( p2m_is_paging(l1t) ) > + { > + *t = l1t; > + /* No need to unclip due to check below */ > + mfn = _mfn(l1e_get_pfn(*l1e)); > + } > } > > unmap_domain_page(l1e); > - return _mfn(INVALID_MFN); > + return (l1t == p2m_ram_paging_out) ? mfn : _mfn(INVALID_MFN);~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This means when p2mt is p2m_ram_paging_in, the mfn must be INVALID_MFN; But in p2m_type_to_flags,when p2m_ram_paging_in or p2m_ram_paged, the _PAGE_PRESENT is not set. 
My idea shows like this:
In p2m_gfn_to_mfn function:

@@ -1370,6 +1379,7 @@
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     l2_pgentry_t *l2e;
     l1_pgentry_t *l1e;
+    unsigned long flags;

     ASSERT(paging_mode_translate(d));

@@ -1455,7 +1465,8 @@
     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
 pod_retry_l1:
-    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    flags = l1e_get_flags(*l1e);
+    if ( ((flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(p2m_flags_to_type(flags))) )
     {
         /* PoD: Try to populate */
         if ( p2m_flags_to_type(l1e_get_flags(*l1e)) ==
              p2m_populate_on_demand )
         {
             if ( q != p2m_query ) {
                 if ( !p2m_pod_check_and_populate(p2m, gfn,
                                                  (l1_pgentry_t *)l1e,
                                                  PAGE_ORDER_4K, q) )
                     goto pod_retry_l1;
             } else
                 *t = p2m_populate_on_demand;
         }

         unmap_domain_page(l1e);
         return _mfn(INVALID_MFN);
     }
     mfn = _mfn(l1e_get_pfn(*l1e));
     *t = p2m_flags_to_type(l1e_get_flags(*l1e));
     unmap_domain_page(l1e);

> } > mfn = _mfn(l1e_get_pfn(*l1e)); > *t = p2m_flags_to_type(l1e_get_flags(*l1e)); > @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc > flags = l1e_get_flags(l1e[i1]); > if ( p2m_flags_to_type(flags) != ot ) > continue; > - mfn = l1e_get_pfn(l1e[i1]); > + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); > gfn = i1 + (i2 + (i3 > #if CONFIG_PAGING_LEVELS >= 4 > + (i4 * L3_PAGETABLE_ENTRIES) > @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc > * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; > /* create a new 1le entry with the new type */ > flags = p2m_type_to_flags(nt, _mfn(mfn)); > - l1e_content = l1e_from_pfn(mfn, flags); > + l1e_content = l1e_from_pfn(clipped_mfn(mfn), flags); > p2m->write_p2m_entry(p2m, gfn, &l1e[i1], > l1mfn, l1e_content, 1); > } > @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain > entry_count++; > continue; > } > - mfn = l1e_get_pfn(l1e[i1]); > + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); > ASSERT(mfn_valid(_mfn(mfn))); > m2pfn = get_gpfn_from_mfn(mfn); > if ( m2pfn != gfn && > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
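For readability, the check Wei adds above boils down to the following
predicate. The helper name is invented here, but p2m_is_paging() and
p2m_flags_to_type() are the predicates already used elsewhere in this
thread:

/* A not-present l1 entry is only "really empty" if it is not in one
 * of the paging states: p2m_ram_paging_out / p2m_ram_paged /
 * p2m_ram_paging_in entries also have PR clear, but still carry a
 * meaningful type (and, for paging_out, a usable mfn). */
static inline int l1e_really_empty(l1_pgentry_t l1e)
{
    unsigned long flags = l1e_get_flags(l1e);

    return ((flags & _PAGE_PRESENT) == 0) &&
           !p2m_is_paging(p2m_flags_to_type(flags));
}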
> > >> -----Original Message----- >> From: xen-devel-bounces@lists.xensource.com >> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Andres >> Lagar-Cavilla >> Sent: Wednesday, February 15, 2012 3:05 AM >> To: xen-devel@lists.xensource.com >> Cc: olaf@aepfle.de; keir.xen@gmail.com; tim@xen.org; JBeulich@suse.com; >> adin@gridcentric.ca >> Subject: [Xen-devel] RFC: AMD support for paging >> >> We started hashing out some AMD support for mem_paging and mem_access. >> Right now my VMs boot, page out a bit, and then die on an HVM triple >> fault. >> >> Most importantly, I want to get somebody from AMD to comment/help out on >> this. It feels like we''re inches away from enabling support for this >> very >> nice feature. I''m not sure who exactly on AMD monitors the list for >> these >> kinds of things. It''d be great to have you on board! >> >> For starters, the changes to the p2m code are relatively mild, but it''d >> be >> great if somebody spots a red flag. >> >> Another issue: comments indicate that bits 59-62 in NPT entries are used >> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit >> (59) >> would give us enough space to support mem_access. Right now we only have >> 7 >> bits available for Xen flags and that is not enough for paging and >> access. >> Is bit 59 effectively reserved? >> >> Finally, the triple fault. Maybe I''m missing something obvious. Comments >> welcome. >> >> Patch inline below, thanks! >> Andres >> >> Enable AMD support for paging. >> >> Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> >> Signed-off-by: Adin Scannell <adin@scannell.ca> >> >> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c >> --- a/xen/arch/x86/mm/mem_event.c >> +++ b/xen/arch/x86/mm/mem_event.c >> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x >> if ( !hap_enabled(d) ) >> break; >> >> - /* Currently only EPT is supported */ >> - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) >> - break; >> - >> rc = -EXDEV; >> /* Disallow paging in a PoD guest */ >> if ( p2m->pod.entry_count ) >> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c >> --- a/xen/arch/x86/mm/p2m-pt.c >> +++ b/xen/arch/x86/mm/p2m-pt.c >> @@ -53,6 +53,20 @@ >> #define P2M_BASE_FLAGS \ >> (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) >> >> +#ifdef __x86_64__ >> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The >> 0xff..ff >> + * value tramples over the higher-order bits used for flags (NX, p2mt, >> + * etc.) This happens for paging entries. Thus we do this clip/unclip >> + * juggle for l1 entries only (no paging superpages!) */ >> +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ >> +#define clipped_mfn(mfn) ((mfn) & ((1UL << EFF_MFN_WIDTH) - 1)) >> +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? \ >> + INVALID_MFN : (mfn)) >> +#else >> +#define clipped_mfn(mfn) (mfn) >> +#define unclip_mfn(mfn) (mfn) >> +#endif /* __x86_64__ */ >> + >> static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) >> { >> unsigned long flags; >> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p >> case p2m_invalid: >> case p2m_mmio_dm: >> case p2m_populate_on_demand: >> + case p2m_ram_paging_out: >> + case p2m_ram_paged: >> + case p2m_ram_paging_in: >> default: >> return flags; >> case p2m_ram_ro: >> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m >> shift, max)) ) >> return 0; >> >> - /* PoD: Not present doesn''t imply empty. */ >> + /* PoD/paging: Not present doesn''t imply empty. 
*/ >> if ( !l1e_get_flags(*p2m_entry) ) >> { >> struct page_info *pg; >> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un >> 0, L1_PAGETABLE_ENTRIES); >> ASSERT(p2m_entry); >> >> - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) >> - entry_content = l1e_from_pfn(mfn_x(mfn), >> + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || >> + (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) ) >> + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), >> p2m_type_to_flags(p2mt, mfn)); >> else >> entry_content = l1e_empty(); >> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un >> if ( entry_content.l1 != 0 ) >> { >> p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags); >> - old_mfn = l1e_get_pfn(*p2m_entry); >> + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); >> } >> /* level 1 entry */ >> p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, >> entry_content, 1); >> @@ -615,11 +633,12 @@ pod_retry_l1: >> sizeof(l1e)); >> >> if ( ret == 0 ) { >> + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); >> p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); >> - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); >> + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || >> + (l1e_mfn == INVALID_MFN && p2m_is_paging(p2mt)) ); >> >> - if ( p2m_flags_to_type(l1e_get_flags(l1e)) >> - == p2m_populate_on_demand ) >> + if ( p2mt == p2m_populate_on_demand ) >> { >> /* The read has succeeded, so we know that the mapping >> * exits at this point. */ >> @@ -641,7 +660,7 @@ pod_retry_l1: >> } >> >> if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) >> - mfn = _mfn(l1e_get_pfn(l1e)); >> + mfn = _mfn(l1e_mfn); >> else >> /* XXX see above */ >> p2mt = p2m_mmio_dm; >> @@ -783,18 +802,26 @@ pod_retry_l2: >> pod_retry_l1: >> if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) >> { >> + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); >> /* PoD: Try to populate */ >> - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) =>> p2m_populate_on_demand ) >> + if ( l1t == p2m_populate_on_demand ) >> { >> if ( q != p2m_query ) { >> if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, >> q) ) >> goto pod_retry_l1; >> } else >> *t = p2m_populate_on_demand; >> + } else { >> + if ( p2m_is_paging(l1t) ) >> + { >> + *t = l1t; >> + /* No need to unclip due to check below */ >> + mfn = _mfn(l1e_get_pfn(*l1e)); >> + } >> } >> >> unmap_domain_page(l1e); >> - return _mfn(INVALID_MFN); >> + return (l1t == p2m_ram_paging_out) ? mfn : _mfn(INVALID_MFN); > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > This means when p2mt is p2m_ram_paging_in, the mfn must > be INVALID_MFN; > But in p2m_type_to_flags,when p2m_ram_paging_in or > p2m_ram_paged, the _PAGE_PRESENT is not set.Your patch below looks reasonable. I''ll give it a try. 
Andres> > My idea shows like this: > In p2m_gfn_to_mfn function: > @@ -1370,6 +1379,7 @@ > paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; > l2_pgentry_t *l2e; > l1_pgentry_t *l1e; > + unsigned long flags; > > ASSERT(paging_mode_translate(d)); > > @@ -1455,7 +1465,8 @@ > l1e = map_domain_page(mfn_x(mfn)); > l1e += l1_table_offset(addr); > pod_retry_l1: > - if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) > + flags = l1e_get_flags(*l1e); > + if ((( flags & _PAGE_PRESENT) == 0) && > (!p2m_is_paging(p2m_flags_to_type(flags)))) > { > /* PoD: Try to populate */ > if ( p2m_flags_to_type(l1e_get_flags(*l1e)) => p2m_populate_on_demand ) > { > if ( q != p2m_query ) { > if ( !p2m_pod_check_and_populate(p2m, gfn, > (l1_pgentry_t *)l1e, > PAGE_ORDER_4K, q) ) > goto pod_retry_l1; > } else > *t = p2m_populate_on_demand; > } > > unmap_domain_page(l1e); > return _mfn(INVALID_MFN); > } > mfn = _mfn(l1e_get_pfn(*l1e)); > *t = p2m_flags_to_type(l1e_get_flags(*l1e)); > unmap_domain_page(l1e); > > > > >> } >> mfn = _mfn(l1e_get_pfn(*l1e)); >> *t = p2m_flags_to_type(l1e_get_flags(*l1e)); >> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc >> flags = l1e_get_flags(l1e[i1]); >> if ( p2m_flags_to_type(flags) != ot ) >> continue; >> - mfn = l1e_get_pfn(l1e[i1]); >> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >> gfn = i1 + (i2 + (i3 >> #if CONFIG_PAGING_LEVELS >= 4 >> + (i4 * L3_PAGETABLE_ENTRIES) >> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc >> * L2_PAGETABLE_ENTRIES) * >> L1_PAGETABLE_ENTRIES; >> /* create a new 1le entry with the new type */ >> flags = p2m_type_to_flags(nt, _mfn(mfn)); >> - l1e_content = l1e_from_pfn(mfn, flags); >> + l1e_content = l1e_from_pfn(clipped_mfn(mfn), >> flags); >> p2m->write_p2m_entry(p2m, gfn, &l1e[i1], >> l1mfn, l1e_content, 1); >> } >> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >> entry_count++; >> continue; >> } >> - mfn = l1e_get_pfn(l1e[i1]); >> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >> ASSERT(mfn_valid(_mfn(mfn))); >> m2pfn = get_gpfn_from_mfn(mfn); >> if ( m2pfn != gfn && >> >> >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xensource.com >> http://lists.xensource.com/xen-devel > >
On 02/15/2012 05:06 PM, Andres Lagar-Cavilla wrote:>> On 02/15/2012 04:14 PM, Andres Lagar-Cavilla wrote: >>>> On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote: >>>>> We started hashing out some AMD support for mem_paging and mem_access. >>>>> Right now my VMs boot, page out a bit, and then die on an HVM triple >>>>> fault. >>>>> >>>>> Most importantly, I want to get somebody from AMD to comment/help out >>>>> on >>>>> this. It feels like we''re inches away from enabling support for this >>>>> very >>>>> nice feature. I''m not sure who exactly on AMD monitors the list for >>>>> these >>>>> kinds of things. It''d be great to have you on board! >>>>> >>>>> For starters, the changes to the p2m code are relatively mild, but >>>>> it''d >>>>> be >>>>> great if somebody spots a red flag. >>>>> >>>>> Another issue: comments indicate that bits 59-62 in NPT entries are >>>>> used >>>>> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit >>>>> (59) >>>>> would give us enough space to support mem_access. Right now we only >>>>> have >>>>> 7 >>>>> bits available for Xen flags and that is not enough for paging and >>>>> access. >>>>> Is bit 59 effectively reserved? >>>> >>>> Hi >>>> bit 59 is used by iommu hardware for ATS response. In most cases, for >>>> p2m_ram_rw pages, U bit must be 0. But maybe for other page types that >>>> are not potentially used by DMA, this bit could be non-zero. I could >>>> tested it on my iommu machines if you had some patches that use U bits. >>> >>> Hi Wei, thanks for pitching in! I assume you''re talking about table 14 >>> (and fig 9) in http://support.amd.com/us/Processor_TechDocs/48882.pdf >> >> Yes, indeed. >> >>> I don''t think this will work. The p2m access value is arbitrary, and >>> independent of the p2m type. So bit 59 is out of reach and we''re stuck >>> with 7 bits for Xen use. Thanks for the clarification. >> >> Where will p2m access bit be stored? Please note that bit 52 - bit 58 >> for pte must be zero for p2m_ram_rw pages. For iommu pte, only bit 63 >> and bit 1 - bit 8 are free to use if PR bit = 1. > > Wei, > > Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the > current convention? > > I propose limiting p2m type to bits 52-55, and storing p2m access on bits > 56-58. So, p2m_ram_rw pages with a non-zero access would break the "must > be zero" requirement/convention. > > Right now, without any considerations for paging, we''re storing the p2m > type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro, > p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the > course. Given your statement above, and table 14 in the IOMMU manual, how > is this even working? Or is it not working?It works because only p2m_ram_rw (which is 0) pages suppose to be used by DMA. But, indeed, if iommu tries to access pages with non-zero types, like p2m_ram_ro,,it will have io page faults. It seems that we don''t have use cases like this. If mem access bits are independent to p2m types, I think we might have a few solutions here: 1) reverse iommu pt sharing for amd iommu totally. 2) disable iommu pt sharing when xen paging enabled. I am open for both of them. Thanks, Wei> Would a violation of these rules cause a VMEXIT_SHUTDOWN? > > Thanks a lot for the info! > Andres >> >> Thanks, >> Wei >> >>> An alternative to enabling mem_access on AMD processors will be to limit >>> the access types to 3 bits. This would force disabling support for two >>> modes. I''m inclined to disable two out of X, W and WX. 
I don''t think >>> they >>> make sense without R permissions. >>> Thanks! >>> Andres >>> >>>> >>>> Thanks, >>>> Wei >>>> >>>>> >>>>> Finally, the triple fault. Maybe I''m missing something obvious. >>>>> Comments >>>>> welcome. >>>>> >>>>> Patch inline below, thanks! >>>>> Andres >>>>> >>>>> Enable AMD support for paging. >>>>> >>>>> Signed-off-by: Andres Lagar-Cavilla<andres@lagarcavilla.org> >>>>> Signed-off-by: Adin Scannell<adin@scannell.ca> >>>>> >>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c >>>>> --- a/xen/arch/x86/mm/mem_event.c >>>>> +++ b/xen/arch/x86/mm/mem_event.c >>>>> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x >>>>> if ( !hap_enabled(d) ) >>>>> break; >>>>> >>>>> - /* Currently only EPT is supported */ >>>>> - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) >>>>> - break; >>>>> - >>>>> rc = -EXDEV; >>>>> /* Disallow paging in a PoD guest */ >>>>> if ( p2m->pod.entry_count ) >>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c >>>>> --- a/xen/arch/x86/mm/p2m-pt.c >>>>> +++ b/xen/arch/x86/mm/p2m-pt.c >>>>> @@ -53,6 +53,20 @@ >>>>> #define P2M_BASE_FLAGS \ >>>>> (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) >>>>> >>>>> +#ifdef __x86_64__ >>>>> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The >>>>> 0xff..ff >>>>> + * value tramples over the higher-order bits used for flags (NX, >>>>> p2mt, >>>>> + * etc.) This happens for paging entries. Thus we do this clip/unclip >>>>> + * juggle for l1 entries only (no paging superpages!) */ >>>>> +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ >>>>> +#define clipped_mfn(mfn) ((mfn)& ((1UL<< EFF_MFN_WIDTH) - 1)) >>>>> +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? \ >>>>> + INVALID_MFN : (mfn)) >>>>> +#else >>>>> +#define clipped_mfn(mfn) (mfn) >>>>> +#define unclip_mfn(mfn) (mfn) >>>>> +#endif /* __x86_64__ */ >>>>> + >>>>> static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) >>>>> { >>>>> unsigned long flags; >>>>> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p >>>>> case p2m_invalid: >>>>> case p2m_mmio_dm: >>>>> case p2m_populate_on_demand: >>>>> + case p2m_ram_paging_out: >>>>> + case p2m_ram_paged: >>>>> + case p2m_ram_paging_in: >>>>> default: >>>>> return flags; >>>>> case p2m_ram_ro: >>>>> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m >>>>> shift, max)) ) >>>>> return 0; >>>>> >>>>> - /* PoD: Not present doesn''t imply empty. */ >>>>> + /* PoD/paging: Not present doesn''t imply empty. 
*/ >>>>> if ( !l1e_get_flags(*p2m_entry) ) >>>>> { >>>>> struct page_info *pg; >>>>> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>> 0, L1_PAGETABLE_ENTRIES); >>>>> ASSERT(p2m_entry); >>>>> >>>>> - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) >>>>> - entry_content = l1e_from_pfn(mfn_x(mfn), >>>>> + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || >>>>> + (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) ) >>>>> + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), >>>>> p2m_type_to_flags(p2mt, >>>>> mfn)); >>>>> else >>>>> entry_content = l1e_empty(); >>>>> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>> if ( entry_content.l1 != 0 ) >>>>> { >>>>> p2m_add_iommu_flags(&entry_content, 0, >>>>> iommu_pte_flags); >>>>> - old_mfn = l1e_get_pfn(*p2m_entry); >>>>> + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); >>>>> } >>>>> /* level 1 entry */ >>>>> p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, >>>>> entry_content, 1); >>>>> @@ -615,11 +633,12 @@ pod_retry_l1: >>>>> sizeof(l1e)); >>>>> >>>>> if ( ret == 0 ) { >>>>> + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); >>>>> p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); >>>>> - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); >>>>> + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || >>>>> + (l1e_mfn == INVALID_MFN&& p2m_is_paging(p2mt)) ); >>>>> >>>>> - if ( p2m_flags_to_type(l1e_get_flags(l1e)) >>>>> - == p2m_populate_on_demand ) >>>>> + if ( p2mt == p2m_populate_on_demand ) >>>>> { >>>>> /* The read has succeeded, so we know that the mapping >>>>> * exits at this point. */ >>>>> @@ -641,7 +660,7 @@ pod_retry_l1: >>>>> } >>>>> >>>>> if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) >>>>> - mfn = _mfn(l1e_get_pfn(l1e)); >>>>> + mfn = _mfn(l1e_mfn); >>>>> else >>>>> /* XXX see above */ >>>>> p2mt = p2m_mmio_dm; >>>>> @@ -783,18 +802,26 @@ pod_retry_l2: >>>>> pod_retry_l1: >>>>> if ( (l1e_get_flags(*l1e)& _PAGE_PRESENT) == 0 ) >>>>> { >>>>> + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>> /* PoD: Try to populate */ >>>>> - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) =>>>>> p2m_populate_on_demand ) >>>>> + if ( l1t == p2m_populate_on_demand ) >>>>> { >>>>> if ( q != p2m_query ) { >>>>> if ( !p2m_pod_demand_populate(p2m, gfn, >>>>> PAGE_ORDER_4K, >>>>> q) ) >>>>> goto pod_retry_l1; >>>>> } else >>>>> *t = p2m_populate_on_demand; >>>>> + } else { >>>>> + if ( p2m_is_paging(l1t) ) >>>>> + { >>>>> + *t = l1t; >>>>> + /* No need to unclip due to check below */ >>>>> + mfn = _mfn(l1e_get_pfn(*l1e)); >>>>> + } >>>>> } >>>>> >>>>> unmap_domain_page(l1e); >>>>> - return _mfn(INVALID_MFN); >>>>> + return (l1t == p2m_ram_paging_out) ? 
mfn : _mfn(INVALID_MFN); >>>>> } >>>>> mfn = _mfn(l1e_get_pfn(*l1e)); >>>>> *t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc >>>>> flags = l1e_get_flags(l1e[i1]); >>>>> if ( p2m_flags_to_type(flags) != ot ) >>>>> continue; >>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>> gfn = i1 + (i2 + (i3 >>>>> #if CONFIG_PAGING_LEVELS>= 4 >>>>> + (i4 * L3_PAGETABLE_ENTRIES) >>>>> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc >>>>> * L2_PAGETABLE_ENTRIES) * >>>>> L1_PAGETABLE_ENTRIES; >>>>> /* create a new 1le entry with the new type */ >>>>> flags = p2m_type_to_flags(nt, _mfn(mfn)); >>>>> - l1e_content = l1e_from_pfn(mfn, flags); >>>>> + l1e_content = l1e_from_pfn(clipped_mfn(mfn), >>>>> flags); >>>>> p2m->write_p2m_entry(p2m, gfn,&l1e[i1], >>>>> l1mfn, l1e_content, 1); >>>>> } >>>>> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >>>>> entry_count++; >>>>> continue; >>>>> } >>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>> ASSERT(mfn_valid(_mfn(mfn))); >>>>> m2pfn = get_gpfn_from_mfn(mfn); >>>>> if ( m2pfn != gfn&& >>>>> >>>>> >>>>> _______________________________________________ >>>>> Xen-devel mailing list >>>>> Xen-devel@lists.xensource.com >>>>> http://lists.xensource.com/xen-devel >>>>> >>>> >>>> >>>> >>> >>> >>> >> >> >> > > >
> On 02/15/2012 05:06 PM, Andres Lagar-Cavilla wrote: >>> On 02/15/2012 04:14 PM, Andres Lagar-Cavilla wrote: >>>>> On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote: >>>>>> We started hashing out some AMD support for mem_paging and >>>>>> mem_access. >>>>>> Right now my VMs boot, page out a bit, and then die on an HVM triple >>>>>> fault. >>>>>> >>>>>> Most importantly, I want to get somebody from AMD to comment/help >>>>>> out >>>>>> on >>>>>> this. It feels like we''re inches away from enabling support for this >>>>>> very >>>>>> nice feature. I''m not sure who exactly on AMD monitors the list for >>>>>> these >>>>>> kinds of things. It''d be great to have you on board! >>>>>> >>>>>> For starters, the changes to the p2m code are relatively mild, but >>>>>> it''d >>>>>> be >>>>>> great if somebody spots a red flag. >>>>>> >>>>>> Another issue: comments indicate that bits 59-62 in NPT entries are >>>>>> used >>>>>> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit >>>>>> (59) >>>>>> would give us enough space to support mem_access. Right now we only >>>>>> have >>>>>> 7 >>>>>> bits available for Xen flags and that is not enough for paging and >>>>>> access. >>>>>> Is bit 59 effectively reserved? >>>>> >>>>> Hi >>>>> bit 59 is used by iommu hardware for ATS response. In most cases, for >>>>> p2m_ram_rw pages, U bit must be 0. But maybe for other page types >>>>> that >>>>> are not potentially used by DMA, this bit could be non-zero. I could >>>>> tested it on my iommu machines if you had some patches that use U >>>>> bits. >>>> >>>> Hi Wei, thanks for pitching in! I assume you''re talking about table 14 >>>> (and fig 9) in http://support.amd.com/us/Processor_TechDocs/48882.pdf >>> >>> Yes, indeed. >>> >>>> I don''t think this will work. The p2m access value is arbitrary, and >>>> independent of the p2m type. So bit 59 is out of reach and we''re stuck >>>> with 7 bits for Xen use. Thanks for the clarification. >>> >>> Where will p2m access bit be stored? Please note that bit 52 - bit 58 >>> for pte must be zero for p2m_ram_rw pages. For iommu pte, only bit 63 >>> and bit 1 - bit 8 are free to use if PR bit = 1. >> >> Wei, >> >> Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the >> current convention? >> >> I propose limiting p2m type to bits 52-55, and storing p2m access on >> bits >> 56-58. So, p2m_ram_rw pages with a non-zero access would break the "must >> be zero" requirement/convention. >> >> Right now, without any considerations for paging, we''re storing the p2m >> type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro, >> p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the >> course. Given your statement above, and table 14 in the IOMMU manual, >> how >> is this even working? Or is it not working? > > It works because only p2m_ram_rw (which is 0) pages suppose to be used > by DMA. But, indeed, if iommu tries to access pages with non-zero types, > like p2m_ram_ro,,it will have io page faults. It seems that we don''t > have use cases like this. >I guess no one unleashed log dirty on a domain doing DMA. Rightly so.> If mem access bits are independent to p2m types, I think we might have a > few solutions here: > 1) reverse iommu pt sharing for amd iommu totally. > 2) disable iommu pt sharing when xen paging enabled. > > I am open for both of them.Great. When I get back to this, I''ll look into making the two feature sets mutually exclusive. And then we can enjoy full mem_access on AMD platforms. 
Thanks, Andres> > Thanks, > > Wei > > >> Would a violation of these rules cause a VMEXIT_SHUTDOWN? >> >> Thanks a lot for the info! >> Andres >>> >>> Thanks, >>> Wei >>> >>>> An alternative to enabling mem_access on AMD processors will be to >>>> limit >>>> the access types to 3 bits. This would force disabling support for two >>>> modes. I''m inclined to disable two out of X, W and WX. I don''t think >>>> they >>>> make sense without R permissions. >>>> Thanks! >>>> Andres >>>> >>>>> >>>>> Thanks, >>>>> Wei >>>>> >>>>>> >>>>>> Finally, the triple fault. Maybe I''m missing something obvious. >>>>>> Comments >>>>>> welcome. >>>>>> >>>>>> Patch inline below, thanks! >>>>>> Andres >>>>>> >>>>>> Enable AMD support for paging. >>>>>> >>>>>> Signed-off-by: Andres Lagar-Cavilla<andres@lagarcavilla.org> >>>>>> Signed-off-by: Adin Scannell<adin@scannell.ca> >>>>>> >>>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c >>>>>> --- a/xen/arch/x86/mm/mem_event.c >>>>>> +++ b/xen/arch/x86/mm/mem_event.c >>>>>> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x >>>>>> if ( !hap_enabled(d) ) >>>>>> break; >>>>>> >>>>>> - /* Currently only EPT is supported */ >>>>>> - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) >>>>>> - break; >>>>>> - >>>>>> rc = -EXDEV; >>>>>> /* Disallow paging in a PoD guest */ >>>>>> if ( p2m->pod.entry_count ) >>>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c >>>>>> --- a/xen/arch/x86/mm/p2m-pt.c >>>>>> +++ b/xen/arch/x86/mm/p2m-pt.c >>>>>> @@ -53,6 +53,20 @@ >>>>>> #define P2M_BASE_FLAGS \ >>>>>> (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | >>>>>> _PAGE_ACCESSED) >>>>>> >>>>>> +#ifdef __x86_64__ >>>>>> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The >>>>>> 0xff..ff >>>>>> + * value tramples over the higher-order bits used for flags (NX, >>>>>> p2mt, >>>>>> + * etc.) This happens for paging entries. Thus we do this >>>>>> clip/unclip >>>>>> + * juggle for l1 entries only (no paging superpages!) */ >>>>>> +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ >>>>>> +#define clipped_mfn(mfn) ((mfn)& ((1UL<< EFF_MFN_WIDTH) - >>>>>> 1)) >>>>>> +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? >>>>>> \ >>>>>> + INVALID_MFN : (mfn)) >>>>>> +#else >>>>>> +#define clipped_mfn(mfn) (mfn) >>>>>> +#define unclip_mfn(mfn) (mfn) >>>>>> +#endif /* __x86_64__ */ >>>>>> + >>>>>> static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) >>>>>> { >>>>>> unsigned long flags; >>>>>> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p >>>>>> case p2m_invalid: >>>>>> case p2m_mmio_dm: >>>>>> case p2m_populate_on_demand: >>>>>> + case p2m_ram_paging_out: >>>>>> + case p2m_ram_paged: >>>>>> + case p2m_ram_paging_in: >>>>>> default: >>>>>> return flags; >>>>>> case p2m_ram_ro: >>>>>> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m >>>>>> shift, max)) ) >>>>>> return 0; >>>>>> >>>>>> - /* PoD: Not present doesn''t imply empty. */ >>>>>> + /* PoD/paging: Not present doesn''t imply empty. 
*/ >>>>>> if ( !l1e_get_flags(*p2m_entry) ) >>>>>> { >>>>>> struct page_info *pg; >>>>>> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>>> 0, L1_PAGETABLE_ENTRIES); >>>>>> ASSERT(p2m_entry); >>>>>> >>>>>> - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) >>>>>> - entry_content = l1e_from_pfn(mfn_x(mfn), >>>>>> + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || >>>>>> + (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) >>>>>> ) >>>>>> + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), >>>>>> p2m_type_to_flags(p2mt, >>>>>> mfn)); >>>>>> else >>>>>> entry_content = l1e_empty(); >>>>>> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>>> if ( entry_content.l1 != 0 ) >>>>>> { >>>>>> p2m_add_iommu_flags(&entry_content, 0, >>>>>> iommu_pte_flags); >>>>>> - old_mfn = l1e_get_pfn(*p2m_entry); >>>>>> + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); >>>>>> } >>>>>> /* level 1 entry */ >>>>>> p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, >>>>>> entry_content, 1); >>>>>> @@ -615,11 +633,12 @@ pod_retry_l1: >>>>>> sizeof(l1e)); >>>>>> >>>>>> if ( ret == 0 ) { >>>>>> + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); >>>>>> p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); >>>>>> - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || >>>>>> !p2m_is_ram(p2mt)); >>>>>> + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || >>>>>> + (l1e_mfn == INVALID_MFN&& p2m_is_paging(p2mt)) >>>>>> ); >>>>>> >>>>>> - if ( p2m_flags_to_type(l1e_get_flags(l1e)) >>>>>> - == p2m_populate_on_demand ) >>>>>> + if ( p2mt == p2m_populate_on_demand ) >>>>>> { >>>>>> /* The read has succeeded, so we know that the >>>>>> mapping >>>>>> * exits at this point. */ >>>>>> @@ -641,7 +660,7 @@ pod_retry_l1: >>>>>> } >>>>>> >>>>>> if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) >>>>>> - mfn = _mfn(l1e_get_pfn(l1e)); >>>>>> + mfn = _mfn(l1e_mfn); >>>>>> else >>>>>> /* XXX see above */ >>>>>> p2mt = p2m_mmio_dm; >>>>>> @@ -783,18 +802,26 @@ pod_retry_l2: >>>>>> pod_retry_l1: >>>>>> if ( (l1e_get_flags(*l1e)& _PAGE_PRESENT) == 0 ) >>>>>> { >>>>>> + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>>> /* PoD: Try to populate */ >>>>>> - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) =>>>>>> p2m_populate_on_demand ) >>>>>> + if ( l1t == p2m_populate_on_demand ) >>>>>> { >>>>>> if ( q != p2m_query ) { >>>>>> if ( !p2m_pod_demand_populate(p2m, gfn, >>>>>> PAGE_ORDER_4K, >>>>>> q) ) >>>>>> goto pod_retry_l1; >>>>>> } else >>>>>> *t = p2m_populate_on_demand; >>>>>> + } else { >>>>>> + if ( p2m_is_paging(l1t) ) >>>>>> + { >>>>>> + *t = l1t; >>>>>> + /* No need to unclip due to check below */ >>>>>> + mfn = _mfn(l1e_get_pfn(*l1e)); >>>>>> + } >>>>>> } >>>>>> >>>>>> unmap_domain_page(l1e); >>>>>> - return _mfn(INVALID_MFN); >>>>>> + return (l1t == p2m_ram_paging_out) ? 
mfn : >>>>>> _mfn(INVALID_MFN); >>>>>> } >>>>>> mfn = _mfn(l1e_get_pfn(*l1e)); >>>>>> *t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>>> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc >>>>>> flags = l1e_get_flags(l1e[i1]); >>>>>> if ( p2m_flags_to_type(flags) != ot ) >>>>>> continue; >>>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>>> gfn = i1 + (i2 + (i3 >>>>>> #if CONFIG_PAGING_LEVELS>= 4 >>>>>> + (i4 * L3_PAGETABLE_ENTRIES) >>>>>> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc >>>>>> * L2_PAGETABLE_ENTRIES) * >>>>>> L1_PAGETABLE_ENTRIES; >>>>>> /* create a new 1le entry with the new type >>>>>> */ >>>>>> flags = p2m_type_to_flags(nt, _mfn(mfn)); >>>>>> - l1e_content = l1e_from_pfn(mfn, flags); >>>>>> + l1e_content = l1e_from_pfn(clipped_mfn(mfn), >>>>>> flags); >>>>>> p2m->write_p2m_entry(p2m, gfn,&l1e[i1], >>>>>> l1mfn, l1e_content, 1); >>>>>> } >>>>>> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >>>>>> entry_count++; >>>>>> continue; >>>>>> } >>>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>>> ASSERT(mfn_valid(_mfn(mfn))); >>>>>> m2pfn = get_gpfn_from_mfn(mfn); >>>>>> if ( m2pfn != gfn&& >>>>>> >>>>>> >>>>>> _______________________________________________ >>>>>> Xen-devel mailing list >>>>>> Xen-devel@lists.xensource.com >>>>>> http://lists.xensource.com/xen-devel >>>>>> >>>>> >>>>> >>>>> >>>> >>>> >>>> >>> >>> >>> >> >> >> > > >
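A rough sketch of what "mutually exclusive" could look like at enable
time; iommu_hap_pt_share is assumed to be the existing global toggle for
sharing the p2m with the IOMMU, and the helper name is invented:

/* Sketch only: to be called from the paging/access enable paths of
 * mem_event_domctl() before flipping the feature on. */
static int mem_event_check_iommu_sharing(struct domain *d)
{
    if ( !hap_enabled(d) )
        return -ENODEV;

    /* Paging/access rewrite bits 52-58 of the NPT entries, which a
     * shared IOMMU table cannot tolerate; refuse in that case. */
    if ( iommu_hap_pt_share )
        return -EXDEV;

    return 0;
}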
Am 15.02.2012 18:02, schrieb Tim Deegan:
> At 08:06 -0800 on 15 Feb (1329293205), Andres Lagar-Cavilla wrote:
>>> Where will p2m access bit be stored? Please note that bit 52 - bit 58
>>> for pte must be zero for p2m_ram_rw pages. For iommu pte, only bit 63
>>> and bit 1 - bit 8 are free to use if PR bit = 1.
>>
>> Wei,
>>
>> Why *must* bits 52-58 be zero for p2m_ram_rw pages? Or is it just the
>> current convention?
>>
>> I propose limiting p2m type to bits 52-55, and storing p2m access on
>> bits 56-58. So, p2m_ram_rw pages with a non-zero access would break the
>> "must be zero" requirement/convention.
>
> Eh, let's step back a bit from this. The problem is that the IOMMU
> can't handle these bits being set, but the IOMMU can't handle paged-out
> or CoW memory either (because there's no way to cause the peripheral to
> retry a DMA that faulted).
>
> So can we just say that any VM that's going to use paging, sharing
> or access can't have a unified p2m and IOMMU table? That's a bit ugly
> since the admin will have to make the choice, but it seems like the
> hardware is forcing us into it.

Seems true. Although IOMMUv2 supports demand paging, it only works for
guest OS process context, and we also need support from the device side
to do that.

> Maybe the sensible choice is to default to _not_ sharing p2m and IOMMU
> tables, and have that be something an expert user can turn on (and which
> disables the other features)?

That looks good to me.

> Should we have a hypervisor-level interlock between PCI passthrough and
> paging/sharing/access anyway? Doing both is unlikely to lead to
> happiness.

I think we should. DMA might result in miserable errors when paging
happens.

Thanks
Wei

> Tim.
>
>> Right now, without any considerations for paging, we're storing the p2m
>> type in bits 52-58 when PR=1. That p2m type can be non zero. p2m_ram_ro,
>> p2m_mmio_dm, p2m_logdirty, p2m_populate_on_demand are all par for the
>> course. Given your statement above, and table 14 in the IOMMU manual,
>> how is this even working? Or is it not working?
>>
>> Would a violation of these rules cause a VMEXIT_SHUTDOWN?
>>
>> Thanks a lot for the info!
>> Andres
>>>
>>> Thanks,
>>> Wei
>>>
>>>> An alternative to enabling mem_access on AMD processors will be to
>>>> limit the access types to 3 bits. This would force disabling support
>>>> for two modes. I'm inclined to disable two out of X, W and WX. I
>>>> don't think they make sense without R permissions.
>>>> Thanks!
>>>> Andres
>>>>
>>>>>
>>>>> Thanks,
>>>>> Wei
>>>>>
>>>>>>
>>>>>> Finally, the triple fault. Maybe I'm missing something obvious.
>>>>>> Comments welcome.
>>>>>>
>>>>>> Patch inline below, thanks!
>>>>>> Andres
>>>>>>
>>>>>> Enable AMD support for paging.
>>>>>> >>>>>> Signed-off-by: Andres Lagar-Cavilla<andres@lagarcavilla.org> >>>>>> Signed-off-by: Adin Scannell<adin@scannell.ca> >>>>>> >>>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c >>>>>> --- a/xen/arch/x86/mm/mem_event.c >>>>>> +++ b/xen/arch/x86/mm/mem_event.c >>>>>> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x >>>>>> if ( !hap_enabled(d) ) >>>>>> break; >>>>>> >>>>>> - /* Currently only EPT is supported */ >>>>>> - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) >>>>>> - break; >>>>>> - >>>>>> rc = -EXDEV; >>>>>> /* Disallow paging in a PoD guest */ >>>>>> if ( p2m->pod.entry_count ) >>>>>> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c >>>>>> --- a/xen/arch/x86/mm/p2m-pt.c >>>>>> +++ b/xen/arch/x86/mm/p2m-pt.c >>>>>> @@ -53,6 +53,20 @@ >>>>>> #define P2M_BASE_FLAGS \ >>>>>> (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) >>>>>> >>>>>> +#ifdef __x86_64__ >>>>>> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The >>>>>> 0xff..ff >>>>>> + * value tramples over the higher-order bits used for flags (NX, >>>>>> p2mt, >>>>>> + * etc.) This happens for paging entries. Thus we do this clip/unclip >>>>>> + * juggle for l1 entries only (no paging superpages!) */ >>>>>> +#define EFF_MFN_WIDTH (PADDR_BITS-PAGE_SHIFT) /* 40 bits */ >>>>>> +#define clipped_mfn(mfn) ((mfn)& ((1UL<< EFF_MFN_WIDTH) - 1)) >>>>>> +#define unclip_mfn(mfn) ((clipped_mfn((mfn)) == INVALID_MFN) ? \ >>>>>> + INVALID_MFN : (mfn)) >>>>>> +#else >>>>>> +#define clipped_mfn(mfn) (mfn) >>>>>> +#define unclip_mfn(mfn) (mfn) >>>>>> +#endif /* __x86_64__ */ >>>>>> + >>>>>> static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) >>>>>> { >>>>>> unsigned long flags; >>>>>> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p >>>>>> case p2m_invalid: >>>>>> case p2m_mmio_dm: >>>>>> case p2m_populate_on_demand: >>>>>> + case p2m_ram_paging_out: >>>>>> + case p2m_ram_paged: >>>>>> + case p2m_ram_paging_in: >>>>>> default: >>>>>> return flags; >>>>>> case p2m_ram_ro: >>>>>> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m >>>>>> shift, max)) ) >>>>>> return 0; >>>>>> >>>>>> - /* PoD: Not present doesn''t imply empty. */ >>>>>> + /* PoD/paging: Not present doesn''t imply empty. 
*/ >>>>>> if ( !l1e_get_flags(*p2m_entry) ) >>>>>> { >>>>>> struct page_info *pg; >>>>>> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>>> 0, L1_PAGETABLE_ENTRIES); >>>>>> ASSERT(p2m_entry); >>>>>> >>>>>> - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) >>>>>> - entry_content = l1e_from_pfn(mfn_x(mfn), >>>>>> + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || >>>>>> + (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) ) >>>>>> + entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)), >>>>>> p2m_type_to_flags(p2mt, >>>>>> mfn)); >>>>>> else >>>>>> entry_content = l1e_empty(); >>>>>> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un >>>>>> if ( entry_content.l1 != 0 ) >>>>>> { >>>>>> p2m_add_iommu_flags(&entry_content, 0, >>>>>> iommu_pte_flags); >>>>>> - old_mfn = l1e_get_pfn(*p2m_entry); >>>>>> + old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry)); >>>>>> } >>>>>> /* level 1 entry */ >>>>>> p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, >>>>>> entry_content, 1); >>>>>> @@ -615,11 +633,12 @@ pod_retry_l1: >>>>>> sizeof(l1e)); >>>>>> >>>>>> if ( ret == 0 ) { >>>>>> + unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e)); >>>>>> p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); >>>>>> - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); >>>>>> + ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) || >>>>>> + (l1e_mfn == INVALID_MFN&& p2m_is_paging(p2mt)) ); >>>>>> >>>>>> - if ( p2m_flags_to_type(l1e_get_flags(l1e)) >>>>>> - == p2m_populate_on_demand ) >>>>>> + if ( p2mt == p2m_populate_on_demand ) >>>>>> { >>>>>> /* The read has succeeded, so we know that the mapping >>>>>> * exits at this point. */ >>>>>> @@ -641,7 +660,7 @@ pod_retry_l1: >>>>>> } >>>>>> >>>>>> if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) ) >>>>>> - mfn = _mfn(l1e_get_pfn(l1e)); >>>>>> + mfn = _mfn(l1e_mfn); >>>>>> else >>>>>> /* XXX see above */ >>>>>> p2mt = p2m_mmio_dm; >>>>>> @@ -783,18 +802,26 @@ pod_retry_l2: >>>>>> pod_retry_l1: >>>>>> if ( (l1e_get_flags(*l1e)& _PAGE_PRESENT) == 0 ) >>>>>> { >>>>>> + p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>>> /* PoD: Try to populate */ >>>>>> - if ( p2m_flags_to_type(l1e_get_flags(*l1e)) =>>>>>> p2m_populate_on_demand ) >>>>>> + if ( l1t == p2m_populate_on_demand ) >>>>>> { >>>>>> if ( q != p2m_query ) { >>>>>> if ( !p2m_pod_demand_populate(p2m, gfn, >>>>>> PAGE_ORDER_4K, >>>>>> q) ) >>>>>> goto pod_retry_l1; >>>>>> } else >>>>>> *t = p2m_populate_on_demand; >>>>>> + } else { >>>>>> + if ( p2m_is_paging(l1t) ) >>>>>> + { >>>>>> + *t = l1t; >>>>>> + /* No need to unclip due to check below */ >>>>>> + mfn = _mfn(l1e_get_pfn(*l1e)); >>>>>> + } >>>>>> } >>>>>> >>>>>> unmap_domain_page(l1e); >>>>>> - return _mfn(INVALID_MFN); >>>>>> + return (l1t == p2m_ram_paging_out) ? 
mfn : _mfn(INVALID_MFN); >>>>>> } >>>>>> mfn = _mfn(l1e_get_pfn(*l1e)); >>>>>> *t = p2m_flags_to_type(l1e_get_flags(*l1e)); >>>>>> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc >>>>>> flags = l1e_get_flags(l1e[i1]); >>>>>> if ( p2m_flags_to_type(flags) != ot ) >>>>>> continue; >>>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>>> gfn = i1 + (i2 + (i3 >>>>>> #if CONFIG_PAGING_LEVELS>= 4 >>>>>> + (i4 * L3_PAGETABLE_ENTRIES) >>>>>> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc >>>>>> * L2_PAGETABLE_ENTRIES) * >>>>>> L1_PAGETABLE_ENTRIES; >>>>>> /* create a new 1le entry with the new type */ >>>>>> flags = p2m_type_to_flags(nt, _mfn(mfn)); >>>>>> - l1e_content = l1e_from_pfn(mfn, flags); >>>>>> + l1e_content = l1e_from_pfn(clipped_mfn(mfn), >>>>>> flags); >>>>>> p2m->write_p2m_entry(p2m, gfn,&l1e[i1], >>>>>> l1mfn, l1e_content, 1); >>>>>> } >>>>>> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain >>>>>> entry_count++; >>>>>> continue; >>>>>> } >>>>>> - mfn = l1e_get_pfn(l1e[i1]); >>>>>> + mfn = unclip_mfn(l1e_get_pfn(l1e[i1])); >>>>>> ASSERT(mfn_valid(_mfn(mfn))); >>>>>> m2pfn = get_gpfn_from_mfn(mfn); >>>>>> if ( m2pfn != gfn&& >>>>>> >>>>>> >>>>>> _______________________________________________ >>>>>> Xen-devel mailing list >>>>>> Xen-devel@lists.xensource.com >>>>>> http://lists.xensource.com/xen-devel >>>>>> >>>>> >>>>> >>>>> >>>> >>>> >>>> >>> >>> >>> >> >> >