thr3ads.net - Xen devel - [Xen-devel] [PATCH] X86 MCE: Add SRAR handler [Sep 2011]

If this information is useful, please help other people find it:
Share via:

Liu, Jinsong

2011-Sep-29 15:20 UTC

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler

X86 MCE: Add SRAR handler

Currently Intel SDM add 2 kinds of MCE SRAR errors:
1). Data Load error, error code = 0x134
2). Instruction Fetch error, error code = 0x150
This patch add handler to these new SRAR errors.
It based on existed mce infrastructure, add code to handle SRAR specific error.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>

diff -r 8d6edc3d26d2 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sat Aug 20 23:53:02 2011 +0800
@@ -37,6 +37,14 @@ static int __read_mostly nr_intel_ext_ms
  */
 #define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
 #define INTEL_SRAO_L3_EWB    0x17A
+
+/* 
+ * Currently Intel SDM define 2 kinds of srar errors:
+ * 1). Data Load error, error code = 0x134
+ * 2). Instruction Fetch error, error code = 0x150
+ */
+#define INTEL_SRAR_DATA_LOAD	0x134
+#define INTEL_SRAR_INSTR_FETCH	0x150
 
 /* Thermal Hanlding */
 #ifdef CONFIG_X86_MCE_THERMAL
@@ -255,7 +263,7 @@ static enum mce_result mce_action(struct
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &bank_result);
+                handlers[i].recovery_handler(&binfo, &bank_result,
regs);
                 if (worst_result < bank_result)
                     worst_result = bank_result;
                 break;
@@ -621,7 +629,8 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -718,6 +727,32 @@ vmce_failed:
     }
 }
 
+static int intel_srar_check(uint64_t status)
+{
+    return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
+}
+
+static void intel_srar_dhandler(
+             struct mca_binfo *binfo,
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
+{
+    uint64_t status = binfo->mib->mc_status;
+
+    /* For unknown srar error code, reset system */
+    *result = MCER_RESET;
+
+    switch ( status & INTEL_MCCOD_MASK )
+    {
+    case INTEL_SRAR_DATA_LOAD:
+    case INTEL_SRAR_INSTR_FETCH:
+        intel_memerr_dhandler(binfo, result, regs);
+        break;
+    default:
+        break;
+    }
+}
+
 static int intel_srao_check(uint64_t status)
 {
     return ( intel_check_mce_type(status) == intel_mce_ucr_srao );
@@ -725,7 +760,8 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
 
@@ -738,7 +774,7 @@ static void intel_srao_dhandler(
         {
         case INTEL_SRAO_MEM_SCRUB:
         case INTEL_SRAO_L3_EWB:
-            intel_memerr_dhandler(binfo, result);
+            intel_memerr_dhandler(binfo, result, regs);
             break;
         default:
             break;
@@ -753,14 +789,15 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs * regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
 
     type = intel_check_mce_type(status);
 
-    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+    if (type == intel_mce_fatal)
         *result = MCER_RESET;
     else
         *result = MCER_CONTINUE;
@@ -768,12 +805,14 @@ static void intel_default_mce_dhandler(
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
     {intel_srao_check, intel_srao_dhandler},
+    {intel_srar_check, intel_srar_dhandler},
     {intel_default_check, intel_default_mce_dhandler}
 };
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
 
     switch (type)
     {
-    /* Panic if no handler for SRAR error */
     case intel_mce_ucr_srar:
+        if ( !guest_mode(regs) )
+            *result = MCER_RESET;
+        else
+            *result = MCER_CONTINUE;
+        break;
     case intel_mce_fatal:
         *result = MCER_RESET;
         break;
@@ -958,10 +1001,8 @@ static int intel_recoverable_scan(u64 st
     /* SRAR error */
     else if ( ser_support && !(status & MCi_STATUS_OVER) 
                 && !(status & MCi_STATUS_PCC) && (status
& MCi_STATUS_S)
-                && (status & MCi_STATUS_AR) ) {
-        mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined
currently.\n");
-        return 0;
-    }
+                && (status & MCi_STATUS_AR) && (status
& MCi_STATUS_EN) )
+        return 1;
     /* SRAO error */
     else if (ser_support && !(status & MCi_STATUS_PCC)
                 && (status & MCi_STATUS_S) && !(status
& MCi_STATUS_AR)
diff -r 8d6edc3d26d2 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sat Aug 20 23:53:02 2011 +0800
@@ -151,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    enum mce_result *result);
+                    enum mce_result *result, struct cpu_user_regs *regs);
 };
 
 /* Global variables */

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Sep-29 15:42 UTC

head link

Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 29.09.11 at 17:20, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> @@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
>  
>      switch (type)
>      {
> -    /* Panic if no handler for SRAR error */
>      case intel_mce_ucr_srar:
> +        if ( !guest_mode(regs) )
> +            *result = MCER_RESET;
> +        else
> +            *result = MCER_CONTINUE;
> +        break;
>      case intel_mce_fatal:
>          *result = MCER_RESET;
>          break;
Using the stack based registers for any decision in an MCE handler
seems bogus to me - without knowing that the error occurred
synchronously, the result is meaningless. Unfortunately I wasn''t
able to spot - throughout your patch - what SRAR actually stands
for, and the manual is no help in that respect either. It does state,
however, that EIPV in three of four cases would be clear for these,
so using the registers on stack is likely wrong here.

This made me look at the current source, and there I see in
mce_urgent_action()

    if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
        return -1;

which I think should say ... _EIPV and use || instead. Thoughts?

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2011-Sep-30 02:51 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Thursday, September 29, 2011 11:42 PM
> To: Liu, Jinsong
> Cc: Keir Fraser; Jiang, Yunhong; xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
> 
> >>> On 29.09.11 at 17:20, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> > @@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
> >
> >      switch (type)
> >      {
> > -    /* Panic if no handler for SRAR error */
> >      case intel_mce_ucr_srar:
> > +        if ( !guest_mode(regs) )
> > +            *result = MCER_RESET;
> > +        else
> > +            *result = MCER_CONTINUE;
> > +        break;
> >      case intel_mce_fatal:
> >          *result = MCER_RESET;
> >          break;
> 
> Using the stack based registers for any decision in an MCE handler
> seems bogus to me - without knowing that the error occurred
> synchronously, the result is meaningless. Unfortunately I wasn''t
I think the usage of stack in MCE handler should be case by case. 
For example, it''s ok to use it at data load instruction since the EIPV
is
valid for it.
For the instruction load, not that sure and I will check it internally.

But agree that we should not do this depends on the error type (like SRAR),
but should depends on the specific error code.
> able to spot - throughout your patch - what SRAR actually stands
> for, and the manual is no help in that respect either. It does state,
> however, that EIPV in three of four cases would be clear for these,
> so using the registers on stack is likely wrong here.
> 
> This made me look at the current source, and there I see in
> mce_urgent_action()
> 
>     if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
>         return -1;
> 
> which I think should say ... _EIPV and use || instead. Thoughts?
I think this code means, if the error happens in hypervisor mode (i.e.
!guest_mode()), and RIPV indicate the RIP in stack can''t be restarted,
we
have to panic.

Thanks
--jyh> 
> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Sep-30 07:25 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com> wrote:
> 
>> -----Original Message-----
>> From: Jan Beulich [mailto:JBeulich@suse.com] 
>> Sent: Thursday, September 29, 2011 11:42 PM
>> To: Liu, Jinsong
>> Cc: Keir Fraser; Jiang, Yunhong; xen-devel@lists.xensource.com 
>> Subject: Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>> 
>> >>> On 29.09.11 at 17:20, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> > @@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
>> >
>> >      switch (type)
>> >      {
>> > -    /* Panic if no handler for SRAR error */
>> >      case intel_mce_ucr_srar:
>> > +        if ( !guest_mode(regs) )
>> > +            *result = MCER_RESET;
>> > +        else
>> > +            *result = MCER_CONTINUE;
>> > +        break;
>> >      case intel_mce_fatal:
>> >          *result = MCER_RESET;
>> >          break;
>> 
>> Using the stack based registers for any decision in an MCE handler
>> seems bogus to me - without knowing that the error occurred
>> synchronously, the result is meaningless. Unfortunately I
wasn''t
> 
> I think the usage of stack in MCE handler should be case by case. 
> For example, it''s ok to use it at data load instruction since the
EIPV is
> valid for it.
According to the table in the manual, this is only the case on the local
thread.
> For the instruction load, not that sure and I will check it internally.
> 
> But agree that we should not do this depends on the error type (like SRAR),
> but should depends on the specific error code.
> 
>> able to spot - throughout your patch - what SRAR actually stands
>> for, and the manual is no help in that respect either. It does state,
>> however, that EIPV in three of four cases would be clear for these,
>> so using the registers on stack is likely wrong here.
>> 
>> This made me look at the current source, and there I see in
>> mce_urgent_action()
>> 
>>     if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
>>         return -1;
>> 
>> which I think should say ... _EIPV and use || instead. Thoughts?
> 
> I think this code means, if the error happens in hypervisor mode (i.e.
> !guest_mode()), and RIPV indicate the RIP in stack can''t be
restarted, we
> have to panic.
Then the guest_mode() check still lacks an extra check of EIPV, like

     if ( !(gstatus & MCG_STATUS_RIPV) &&
          (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
         return -1;

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Sep-30 07:44 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>> wrote: 
> 
>> 
>>> -----Original Message-----
>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>> Sent: Thursday, September 29, 2011 11:42 PM
>>> To: Liu, Jinsong
>>> Cc: Keir Fraser; Jiang, Yunhong; xen-devel@lists.xensource.com
>>> Subject: Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>>> 
>>>>>> On 29.09.11 at 17:20, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>> wrote: 
>>>> @@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
>>>> 
>>>>      switch (type)
>>>>      {
>>>> -    /* Panic if no handler for SRAR error */
>>>>      case intel_mce_ucr_srar:
>>>> +        if ( !guest_mode(regs) )
>>>> +            *result = MCER_RESET;
>>>> +        else
>>>> +            *result = MCER_CONTINUE;
>>>> +        break;
>>>>      case intel_mce_fatal:
>>>>          *result = MCER_RESET;
>>>>          break;
>>> 
>>> Using the stack based registers for any decision in an MCE handler
>>> seems bogus to me - without knowing that the error occurred
>>> synchronously, the result is meaningless. Unfortunately I
wasn''t
>> 
>> I think the usage of stack in MCE handler should be case by case.
>> For example, it''s ok to use it at data load instruction since
the
>> EIPV is valid for it.
> 
> According to the table in the manual, this is only the case on the
> local thread.
> 
OK, we can remove srar check at mce isr ''uhandler''.
so at mce isr, the check
	if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs)) 
		return -1;
is a total insurance for the error which triggered at hypervisor while cannot
restart from RIP.

>> For the instruction load, not that sure and I will check it
>> internally. 
>> 
>> But agree that we should not do this depends on the error type (like
>> SRAR), but should depends on the specific error code.
>> 
>>> able to spot - throughout your patch - what SRAR actually stands
>>> for, and the manual is no help in that respect either. It does
>>> state, however, that EIPV in three of four cases would be clear for
>>> these, so using the registers on stack is likely wrong here.
>>> 
>>> This made me look at the current source, and there I see in
>>> mce_urgent_action() 
>>> 
>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>> return -1; 
>>> 
>>> which I think should say ... _EIPV and use || instead. Thoughts?
>> 
>> I think this code means, if the error happens in hypervisor mode
>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack can''t
be
>> restarted, we have to panic.
> 
> Then the guest_mode() check still lacks an extra check of EIPV, like
> 
>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>          return -1;
> 
That would be overkilled.
Considering instruction fetch error occur at guest context, hypervisor deliver
to guest to handle the error is perfer, not panic all system.

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Sep-30 07:57 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 30.09.11 at 09:44, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>> wrote: 
>>>> This made me look at the current source, and there I see in
>>>> mce_urgent_action() 
>>>> 
>>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>>> return -1; 
>>>> 
>>>> which I think should say ... _EIPV and use || instead.
Thoughts?
>>> 
>>> I think this code means, if the error happens in hypervisor mode
>>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack
can''t be
>>> restarted, we have to panic.
>> 
>> Then the guest_mode() check still lacks an extra check of EIPV, like
>> 
>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>>          return -1;
>> 
> 
> That would be overkilled.
> Considering instruction fetch error occur at guest context, hypervisor 
> deliver to guest to handle the error is perfer, not panic all system.
Even if it was hypervisor code that got prefetched while still
executing guest code (which ought to be possible at least
across a syscall/sysenter instruction)?

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Sep-30 08:21 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 30.09.11 at 09:44, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>>> wrote:
>>>>> This made me look at the current source, and there I see in
>>>>> mce_urgent_action() 
>>>>> 
>>>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>>>> return -1; 
>>>>> 
>>>>> which I think should say ... _EIPV and use || instead.
Thoughts?
>>>> 
>>>> I think this code means, if the error happens in hypervisor
mode
>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack
can''t be
>>>> restarted, we have to panic.
>>> 
>>> Then the guest_mode() check still lacks an extra check of EIPV,
like
>>> 
>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>>> return -1; 
>>> 
>> 
>> That would be overkilled.
>> Considering instruction fetch error occur at guest context,
>> hypervisor deliver to guest to handle the error is perfer, not panic
>> all system. 
> 
> Even if it was hypervisor code that got prefetched while still
> executing guest code (which ought to be possible at least
> across a syscall/sysenter instruction)?
> 
Executing guest code will not satisfy the check
if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
	return -1;
so it would not panic system.

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Sep-30 08:44 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 30.09.11 at 09:44, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>>> Jan Beulich wrote:
>>>>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>>>> wrote:
>>>>>> This made me look at the current source, and there I
see in
>>>>>> mce_urgent_action() 
>>>>>> 
>>>>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>>>>> return -1; 
>>>>>> 
>>>>>> which I think should say ... _EIPV and use || instead.
Thoughts?
>>>>> 
>>>>> I think this code means, if the error happens in hypervisor
mode
>>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack
can''t be
>>>>> restarted, we have to panic.
>>>> 
>>>> Then the guest_mode() check still lacks an extra check of EIPV,
like
>>>> 
>>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>>           (!(gstatus & MCG_STATUS_EIPV) ||
!guest_mode(regs)))
>>>> return -1; 
>>>> 
>>> 
>>> That would be overkilled.
>>> Considering instruction fetch error occur at guest context,
>>> hypervisor deliver to guest to handle the error is perfer, not
panic
>>> all system. 
>> 
>> Even if it was hypervisor code that got prefetched while still
>> executing guest code (which ought to be possible at least
>> across a syscall/sysenter instruction)?
>> 
> 
> Executing guest code will not satisfy the check
> if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
> 	return -1;
> so it would not panic system.
Exactly. But it should when the prefetch was to hypervisor code.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Sep-30 09:42 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>>>>> On 30.09.11 at 09:44, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>> wrote: 
>>>> Jan Beulich wrote:
>>>>>>>> On 30.09.11 at 04:51, "Jiang,
Yunhong"
>>>>>>>> <yunhong.jiang@intel.com> wrote:
>>>>>>> This made me look at the current source, and there
I see in
>>>>>>> mce_urgent_action() 
>>>>>>> 
>>>>>>>     if ( !(gstatus & MCG_STATUS_RIPV)
&& !guest_mode(regs))
>>>>>>> return -1; 
>>>>>>> 
>>>>>>> which I think should say ... _EIPV and use ||
instead. Thoughts?
>>>>>> 
>>>>>> I think this code means, if the error happens in
hypervisor mode
>>>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in
stack can''t be
>>>>>> restarted, we have to panic.
>>>>> 
>>>>> Then the guest_mode() check still lacks an extra check of
EIPV,
>>>>> like 
>>>>> 
>>>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>>>           (!(gstatus & MCG_STATUS_EIPV) ||
!guest_mode(regs)))
>>>>> return -1; 
>>>>> 
>>>> 
>>>> That would be overkilled.
>>>> Considering instruction fetch error occur at guest context,
>>>> hypervisor deliver to guest to handle the error is perfer, not
>>>> panic all system.
>>> 
>>> Even if it was hypervisor code that got prefetched while still
>>> executing guest code (which ought to be possible at least
>>> across a syscall/sysenter instruction)?
>>> 
>> 
>> Executing guest code will not satisfy the check
>> if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs)) 
return -1;
>> so it would not panic system.
> 
> Exactly. But it should when the prefetch was to hypervisor code.
> 
Wouldn''t processor refresh instruction prefetch queue under such case?

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Sep-30 10:05 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>>> Executing guest code will not satisfy the check
>>> if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
return -1;
>>> so it would not panic system.
>> 
>> Exactly. But it should when the prefetch was to hypervisor code.
>> 
> 
> Wouldn''t processor refresh instruction prefetch queue under such
case?
That''s a question that you are better positioned to answer than me.
But the SRAR errors being a sub-category of uncorrected errors I
would think it can''t be that simple.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Sep-30 10:11 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>>>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>> wrote: 
>>>> Executing guest code will not satisfy the check
>>>> if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs)) 	return -1;
>>>> so it would not panic system.
>>> 
>>> Exactly. But it should when the prefetch was to hypervisor code.
>>> 
>> 
>> Wouldn''t processor refresh instruction prefetch queue under
such
>> case? 
> 
> That''s a question that you are better positioned to answer than
me.
> But the SRAR errors being a sub-category of uncorrected errors I
> would think it can''t be that simple.
> 
Hmm, I will check this question internally first.
BTW, we would have 7 days holiday (1/10 ~ 7/10), so email reply maybe some slow.

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Sep-30 11:47 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Liu, Jinsong wrote:> Jan Beulich wrote:
>>>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>> wrote: 
>>> Jan Beulich wrote:
>>>>>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>>> wrote:
>>>>> Executing guest code will not satisfy the check
>>>>> if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs)) 	return
>>>>> -1; so it would not panic system.
>>>> 
>>>> Exactly. But it should when the prefetch was to hypervisor
code.
>>>> 
>>> 
>>> Wouldn''t processor refresh instruction prefetch queue
under such
>>> case?
>> 
>> That''s a question that you are better positioned to answer
than me.
>> But the SRAR errors being a sub-category of uncorrected errors I
>> would think it can''t be that simple.
>> 
> 
> Hmm, I will check this question internally first.
> BTW, we would have 7 days holiday (1/10 ~ 7/10), so email reply maybe
> some slow. 
> 
> Thanks,
> Jinsong
Ah, just think our talking context: the prefetched instruction would have been
flushed since we now at mce exception context.
So I think no need to overkill here, just let guest handle it --> who own,
who take.

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Oct-06 18:40 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Liu, Jinsong wrote:> Liu, Jinsong wrote:
>> Jan Beulich wrote:
>>>>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>> wrote:
>>>> Jan Beulich wrote:
>>>>>>>> On 30.09.11 at 10:21, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>>>> wrote:
>>>>>> Executing guest code will not satisfy the check
>>>>>> if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs)) 	return
>>>>>> -1; so it would not panic system.
>>>>> 
>>>>> Exactly. But it should when the prefetch was to hypervisor
code.
>>>>> 
>>>> 
>>>> Wouldn''t processor refresh instruction prefetch queue
under such
>>>> case?
>>> 
>>> That''s a question that you are better positioned to answer
than me.
>>> But the SRAR errors being a sub-category of uncorrected errors I
>>> would think it can''t be that simple.
>>> 
>> 
>> Hmm, I will check this question internally first.
>> BTW, we would have 7 days holiday (1/10 ~ 7/10), so email reply
>> maybe some slow. 
>> 
>> Thanks,
>> Jinsong
> 
> Ah, just think our talking context: the prefetched instruction would
> have been flushed since we now at mce exception context. So I think
> no need to overkill here, just let guest handle it --> who own, who
> take.  
> 
> Thanks,
> Jinsong
Jan,

Do you think following is OK?

if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs)) 
    return -1;

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Oct-07 07:42 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 06.10.11 at 20:40, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Liu, Jinsong wrote:
>> Liu, Jinsong wrote:
>>> Jan Beulich wrote:
>>>>>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com>
>>>>>>> wrote:
>>>>> Jan Beulich wrote:
>>>>>>>>> On 30.09.11 at 10:21, "Liu,
Jinsong" <jinsong.liu@intel.com>
>>>>>>>>> wrote:
>>>>>>> Executing guest code will not satisfy the check
>>>>>>> if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs)) 	return
>>>>>>> -1; so it would not panic system.
>>>>>> 
>>>>>> Exactly. But it should when the prefetch was to
hypervisor code.
>>>>>> 
>>>>> 
>>>>> Wouldn''t processor refresh instruction prefetch
queue under such
>>>>> case?
>>>> 
>>>> That''s a question that you are better positioned to
answer than me.
>>>> But the SRAR errors being a sub-category of uncorrected errors
I
>>>> would think it can''t be that simple.
>>>> 
>>> 
>>> Hmm, I will check this question internally first.
>>> BTW, we would have 7 days holiday (1/10 ~ 7/10), so email reply
>>> maybe some slow. 
>>> 
>>> Thanks,
>>> Jinsong
>> 
>> Ah, just think our talking context: the prefetched instruction would
>> have been flushed since we now at mce exception context. So I think
>> no need to overkill here, just let guest handle it --> who own, who
>> take.  
>> 
>> Thanks,
>> Jinsong
> 
> Jan,
> 
> Do you think following is OK?
> 
> if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs)) 
>     return -1;
That''s what we have currently, and as I said earlier I don''t
think using
the result of guest_mode() for any decision is valid when the EIPV bit
is clear.

Jan
> Thanks,
> Jinsong



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2011-Oct-08 08:29 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Friday, September 30, 2011 3:25 PM
> To: Liu, Jinsong; Jiang, Yunhong
> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
> 
> >>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
wrote:> 
> >
> >> -----Original Message-----
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: Thursday, September 29, 2011 11:42 PM
> >> To: Liu, Jinsong
> >> Cc: Keir Fraser; Jiang, Yunhong; xen-devel@lists.xensource.com
> >> Subject: Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
> >>
> >> >>> On 29.09.11 at 17:20, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> >> > @@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
> >> >
> >> >      switch (type)
> >> >      {
> >> > -    /* Panic if no handler for SRAR error */
> >> >      case intel_mce_ucr_srar:
> >> > +        if ( !guest_mode(regs) )
> >> > +            *result = MCER_RESET;
> >> > +        else
> >> > +            *result = MCER_CONTINUE;
> >> > +        break;
> >> >      case intel_mce_fatal:
> >> >          *result = MCER_RESET;
> >> >          break;
> >>
> >> Using the stack based registers for any decision in an MCE handler
> >> seems bogus to me - without knowing that the error occurred
> >> synchronously, the result is meaningless. Unfortunately I
wasn''t
> >
> > I think the usage of stack in MCE handler should be case by case.
> > For example, it''s ok to use it at data load instruction since
the EIPV
is> > valid for it.
> 
> According to the table in the manual, this is only the case on the local
> thread.
Sorry for long delay response because of china holiday
.
That''s about the so-called "affected processors". For the
un-affected
processor, it should not be impact.
But current patch does not handle the share bank situation correctly and
need some changes. According to the SDM, only affected processor should
clear the bank, un-affected process should not clear it, current patch does
not handle such situation.
> 
> > For the instruction load, not that sure and I will check it
internally.
> >
> > But agree that we should not do this depends on the error type (like
SRAR),> > but should depends on the specific error code.
> >
> >> able to spot - throughout your patch - what SRAR actually stands
> >> for, and the manual is no help in that respect either. It does
state,
> >> however, that EIPV in three of four cases would be clear for
these,
> >> so using the registers on stack is likely wrong here.
> >>
> >> This made me look at the current source, and there I see in
> >> mce_urgent_action()
> >>
> >>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
> >>         return -1;
> >>
> >> which I think should say ... _EIPV and use || instead. Thoughts?
> >
> > I think this code means, if the error happens in hypervisor mode (i.e.
> > !guest_mode()), and RIPV indicate the RIP in stack can''t be
restarted,
we> > have to panic.
> 
> Then the guest_mode() check still lacks an extra check of EIPV, like
> 
>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>          return -1;
> 
The RIPV is not related to the EIPV. RIPV means the context saved in the
stack can''t be restarted anymore. According to the SDM, RIPV means
"execution can be restarted reliably at the instruction pointed to by the
instruction pointer pushed on the stack". It''s not about error
happened
synchronously or asynchronously. The point is, if the program is running in
hypervisor context, and RIPV tells us that the program can''t be
restarted,
we can''t do anything but panic, because we can''t switch
context while we are
in xen. So this code have nothing to do with EIPV. 

If Xen is pre-emptible and can be switched to a new context, possibly
sometimes (like in shadow handling) we can switch to a new context and
don''t
need such detection, but at least currently we don''t want to involve so
complex handling. 

The boundary condition of syscall/sysret is something interesting. If a
Instruction Fetch Error happens in hypervisor''s syscall entry point, it
will
cause system panic because in the end, we should find it''s a xen page
broken
by checking the MCi_ADDR MSR. And if guest''s sysret rip is broken, it
will
overkill xen hypervisor for a guest error. But possibly the handling result
is acceptable considering the low possibility and there is no potential data
pollution.
 
Thanks
--jyh> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2011-Oct-08 09:05 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Friday, October 07, 2011 3:43 PM
> To: Liu, Jinsong; Jiang, Yunhong
> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
> 
> >>> On 06.10.11 at 20:40, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> > Liu, Jinsong wrote:
> >> Liu, Jinsong wrote:
> >>> Jan Beulich wrote:
> >>>>>>> On 30.09.11 at 11:42, "Liu, Jinsong"
<jinsong.liu@intel.com>
> >>>>>>> wrote:
> >>>>> Jan Beulich wrote:
> >>>>>>>>> On 30.09.11 at 10:21, "Liu,
Jinsong" <jinsong.liu@intel.com>
> >>>>>>>>> wrote:
> >>>>>>> Executing guest code will not satisfy the
check
> >>>>>>> if ( !(gstatus & MCG_STATUS_RIPV)
&& !guest_mode(regs))
> 	return
> >>>>>>> -1; so it would not panic system.
> >>>>>>
> >>>>>> Exactly. But it should when the prefetch was to
hypervisor code.
> >>>>>>
> >>>>>
> >>>>> Wouldn''t processor refresh instruction
prefetch queue under such
> >>>>> case?
> >>>>
> >>>> That''s a question that you are better positioned
to answer than me.
> >>>> But the SRAR errors being a sub-category of uncorrected
errors I
> >>>> would think it can''t be that simple.
> >>>>
> >>>
> >>> Hmm, I will check this question internally first.
> >>> BTW, we would have 7 days holiday (1/10 ~ 7/10), so email
reply
> >>> maybe some slow.
> >>>
> >>> Thanks,
> >>> Jinsong
> >>
> >> Ah, just think our talking context: the prefetched instruction
would
> >> have been flushed since we now at mce exception context. So I
think
> >> no need to overkill here, just let guest handle it --> who own,
who
> >> take.
> >>
> >> Thanks,
> >> Jinsong
> >
> > Jan,
> >
> > Do you think following is OK?
> >
> > if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
> >     return -1;
> 
> That''s what we have currently, and as I said earlier I
don''t think using
> the result of guest_mode() for any decision is valid when the EIPV bit
> is clear.
guest_mode() should be ok to be used together with RIPV, since RIPV is
talking about the RIP in the stack.

Thanks
--jyh
> 
> Jan
> 
> > Thanks,
> > Jinsong
> 
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Oct-10 08:05 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 08.10.11 at 10:29, "Jiang, Yunhong"
<yunhong.jiang@intel.com> wrote:
> 
>> -----Original Message-----
>> From: Jan Beulich [mailto:JBeulich@suse.com] 
>> Sent: Friday, September 30, 2011 3:25 PM
>> To: Liu, Jinsong; Jiang, Yunhong
>> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com 
>> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>> 
>> >>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
> wrote:
>> 
>> >
>> >> -----Original Message-----
>> >> From: Jan Beulich [mailto:JBeulich@suse.com] 
>> >> This made me look at the current source, and there I see in
>> >> mce_urgent_action()
>> >>
>> >>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>> >>         return -1;
>> >>
>> >> which I think should say ... _EIPV and use || instead.
Thoughts?
>> >
>> > I think this code means, if the error happens in hypervisor mode
(i.e.
>> > !guest_mode()), and RIPV indicate the RIP in stack can''t
be restarted,
> we
>> > have to panic.
>> 
>> Then the guest_mode() check still lacks an extra check of EIPV, like
>> 
>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>>          return -1;
>> 
> 
> The RIPV is not related to the EIPV. RIPV means the context saved in the
> stack can''t be restarted anymore. According to the SDM, RIPV means
> "execution can be restarted reliably at the instruction pointed to by
the
> instruction pointer pushed on the stack". It''s not about
error happened
> synchronously or asynchronously. The point is, if the program is running in
> hypervisor context, and RIPV tells us that the program can''t be
restarted,
> we can''t do anything but panic, because we can''t switch
context while we are
> in xen. So this code have nothing to do with EIPV. 
I continue to disagree (including the statement in your other response):
RIPV only tells us whether we can resume, not in which context the error
occurred. EIPV tells us whether, by looking at the saved registers, we
can determine the context that the error occurred in. Since with !RIPV
we have to determine in what context the error occurred in order to
decide whether to panic or just kill a guest, we can''t ignore EIPV (and
if
it''s not set we have to assume the worst case, since even if the
registers indicate guest mode the error may have occurred in
hypervisor context or accessing hypervisor structures [consider e.g. a
data load error during a GDT access]).

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Oct-11 08:15 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 08.10.11 at 10:29, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>> wrote: 
> 
>> 
>>> -----Original Message-----
>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>> Sent: Friday, September 30, 2011 3:25 PM
>>> To: Liu, Jinsong; Jiang, Yunhong
>>> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com
>>> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>>> 
>>>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>>> wrote: 
>>> 
>>>> 
>>>>> -----Original Message-----
>>>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>>>> This made me look at the current source, and there I see in
>>>>> mce_urgent_action() 
>>>>> 
>>>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>>>> return -1; 
>>>>> 
>>>>> which I think should say ... _EIPV and use || instead.
Thoughts?
>>>> 
>>>> I think this code means, if the error happens in hypervisor
mode
>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack
can''t be
>>>> restarted, we have to panic.
>>> 
>>> Then the guest_mode() check still lacks an extra check of EIPV,
like
>>> 
>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>           (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)))
>>> return -1; 
>>> 
>> 
>> The RIPV is not related to the EIPV. RIPV means the context saved in
>> the stack can''t be restarted anymore. According to the SDM,
RIPV
>> means "execution can be restarted reliably at the instruction
>> pointed to by the instruction pointer pushed on the stack".
It''s not
>> about error happened synchronously or asynchronously. The point is,
>> if the program is running in hypervisor context, and RIPV tells us
>> that the program can''t be restarted, we can''t do
anything but panic,
>> because we can''t switch context while we are in xen. So this
code
>> have nothing to do with EIPV. 
> 
> I continue to disagree (including the statement in your other
> response): RIPV only tells us whether we can resume, not in which
> context the error occurred. EIPV tells us whether, by looking at the
> saved registers, we can determine the context that the error occurred
> in. Since with !RIPV 
> we have to determine in what context the error occurred in order to
> decide whether to panic or just kill a guest, we can''t ignore EIPV
> (and if it''s not set we have to assume the worst case, since even
if
> the registers indicate guest mode the error may have occurred in
> hypervisor context or accessing hypervisor structures [consider e.g. a
> data load error during a GDT access]).
> 
> Jan
Yes, I agree EIPV=0 may indicate async error, but I think your solution
*overkilled* most cases (i.e. the real guest instruction fetch error).

Our idea is,
  * xen mce would flush prefetched instruction so we can delay handle it until
if real need;
  * a h/w error will not disappear, but if it was not being *consumed*,
it''s OK for system keep going (like SRAO error which do not need s/w
handle immediately);

Suppose an async instruction fetch error (RIVP=EIVP=0), triggered at guest
context but instruction prefetch hypervisor context. The scenario is,
  * at xen mce, the prefetched instruction has been flushed. xen mce handler
needn''t panic, instead it mark the page as broken page, then trigger
vmce to guest;
  * guest may kill app, kernel thread, guest itself, or whatever;

The error is still an error, w/ 2 possibilities in the future:
  1. it may not be consumed as an SRAR error, system keep going, h/w mechanism
may detect a SRAO error (i.e. memroy scrub) at some time point and handled then;
  2. it may be consumed at some time point and a SRAR error triggered again. At
this time,
   1). if srar occurred at hypervisor context, xen will panic. or, 
   2). if srar occurred at guest context, xen kill the guest as a malicious one
(as what the 2nd patch do), and move the page to broken page list;

Considering the rare possibility of the above case, I think it''s
acceptable to handle it in this way.
Thoughts?

Thanks,
Jinsong
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Oct-11 08:42 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 11.10.11 at 10:15, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 08.10.11 at 10:29, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>> wrote: 
>> 
>>> 
>>>> -----Original Message-----
>>>> From: Jan Beulich [mailto:JBeulich@suse.com] 
>>>> Sent: Friday, September 30, 2011 3:25 PM
>>>> To: Liu, Jinsong; Jiang, Yunhong
>>>> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com 
>>>> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>>>> 
>>>>>>> On 30.09.11 at 04:51, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>>>> wrote: 
>>>> 
>>>>> 
>>>>>> -----Original Message-----
>>>>>> From: Jan Beulich [mailto:JBeulich@suse.com] 
>>>>>> This made me look at the current source, and there I
see in
>>>>>> mce_urgent_action() 
>>>>>> 
>>>>>>     if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>>>>>> return -1; 
>>>>>> 
>>>>>> which I think should say ... _EIPV and use || instead.
Thoughts?
>>>>> 
>>>>> I think this code means, if the error happens in hypervisor
mode
>>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in stack
can''t be
>>>>> restarted, we have to panic.
>>>> 
>>>> Then the guest_mode() check still lacks an extra check of EIPV,
like
>>>> 
>>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>>           (!(gstatus & MCG_STATUS_EIPV) ||
!guest_mode(regs)))
>>>> return -1; 
>>>> 
>>> 
>>> The RIPV is not related to the EIPV. RIPV means the context saved
in
>>> the stack can''t be restarted anymore. According to the
SDM, RIPV
>>> means "execution can be restarted reliably at the instruction
>>> pointed to by the instruction pointer pushed on the stack".
It''s not
>>> about error happened synchronously or asynchronously. The point is,
>>> if the program is running in hypervisor context, and RIPV tells us
>>> that the program can''t be restarted, we can''t do
anything but panic,
>>> because we can''t switch context while we are in xen. So
this code
>>> have nothing to do with EIPV. 
>> 
>> I continue to disagree (including the statement in your other
>> response): RIPV only tells us whether we can resume, not in which
>> context the error occurred. EIPV tells us whether, by looking at the
>> saved registers, we can determine the context that the error occurred
>> in. Since with !RIPV 
>> we have to determine in what context the error occurred in order to
>> decide whether to panic or just kill a guest, we can''t ignore
EIPV
>> (and if it''s not set we have to assume the worst case, since
even if
>> the registers indicate guest mode the error may have occurred in
>> hypervisor context or accessing hypervisor structures [consider e.g. a
>> data load error during a GDT access]).
>> 
>> Jan
> 
> Yes, I agree EIPV=0 may indicate async error, but I think your solution 
> *overkilled* most cases (i.e. the real guest instruction fetch error).
> 
> Our idea is,
>   * xen mce would flush prefetched instruction so we can delay handle it 
> until if real need;
>   * a h/w error will not disappear, but if it was not being *consumed*,
it''s
> OK for system keep going (like SRAO error which do not need s/w handle 
> immediately);
> 
> Suppose an async instruction fetch error (RIVP=EIVP=0), triggered at guest 
> context but instruction prefetch hypervisor context. The scenario is,
>   * at xen mce, the prefetched instruction has been flushed. xen mce
handler
> needn''t panic, instead it mark the page as broken page, then
trigger vmce to
> guest;
If the prefetch was from Xen space (only in guest context), delivering a
vMCE to the guest is pointless (and perhaps confusing to the guest).
>   * guest may kill app, kernel thread, guest itself, or whatever;
> 
> The error is still an error, w/ 2 possibilities in the future:
>   1. it may not be consumed as an SRAR error, system keep going, h/w 
> mechanism may detect a SRAO error (i.e. memroy scrub) at some time point
and
> handled then;
>   2. it may be consumed at some time point and a SRAR error triggered
again.
> At this time,
>    1). if srar occurred at hypervisor context, xen will panic. or, 
>    2). if srar occurred at guest context, xen kill the guest as a malicious
> one (as what the 2nd patch do), and move the page to broken page list;
> 
> Considering the rare possibility of the above case, I think it''s
acceptable
> to handle it in this way.
> Thoughts?
You''re only discussing instruction fetches (which can be discarded),
but
you''re not covering the other example I gave (GDT access from guest
context - just like this is a ring-0 operations from the paging unit''s
pov,
this ought to be an out-of-context operation from MCE''s perspective).

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Oct-11 09:51 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 11.10.11 at 10:15, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>>>>> On 08.10.11 at 10:29, "Jiang, Yunhong"
<yunhong.jiang@intel.com>
>>>>>> wrote:
>>> 
>>>> 
>>>>> -----Original Message-----
>>>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>>>> Sent: Friday, September 30, 2011 3:25 PM
>>>>> To: Liu, Jinsong; Jiang, Yunhong
>>>>> Cc: keir.xen@gmail.com; xen-devel@lists.xensource.com
>>>>> Subject: RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
>>>>> 
>>>>>>>> On 30.09.11 at 04:51, "Jiang,
Yunhong"
>>>>>>>> <yunhong.jiang@intel.com> wrote:
>>>>> 
>>>>>> 
>>>>>>> -----Original Message-----
>>>>>>> From: Jan Beulich [mailto:JBeulich@suse.com]
>>>>>>> This made me look at the current source, and there
I see in
>>>>>>> mce_urgent_action() 
>>>>>>> 
>>>>>>>     if ( !(gstatus & MCG_STATUS_RIPV)
&& !guest_mode(regs))
>>>>>>> return -1; 
>>>>>>> 
>>>>>>> which I think should say ... _EIPV and use ||
instead. Thoughts?
>>>>>> 
>>>>>> I think this code means, if the error happens in
hypervisor mode
>>>>>> (i.e. !guest_mode()), and RIPV indicate the RIP in
stack can''t be
>>>>>> restarted, we have to panic.
>>>>> 
>>>>> Then the guest_mode() check still lacks an extra check of
EIPV,
>>>>> like 
>>>>> 
>>>>>      if ( !(gstatus & MCG_STATUS_RIPV) &&
>>>>>           (!(gstatus & MCG_STATUS_EIPV) ||
!guest_mode(regs)))
>>>>> return -1; 
>>>>> 
>>>> 
>>>> The RIPV is not related to the EIPV. RIPV means the context
saved
>>>> in the stack can''t be restarted anymore. According to
the SDM, RIPV
>>>> means "execution can be restarted reliably at the
instruction
>>>> pointed to by the instruction pointer pushed on the
stack". It''s
>>>> not about error happened synchronously or asynchronously. The
>>>> point is, if the program is running in hypervisor context, and
>>>> RIPV tells us that the program can''t be restarted, we
can''t do
>>>> anything but panic, because we can''t switch context
while we are
>>>> in xen. So this code have nothing to do with EIPV.
>>> 
>>> I continue to disagree (including the statement in your other
>>> response): RIPV only tells us whether we can resume, not in which
>>> context the error occurred. EIPV tells us whether, by looking at
the
>>> saved registers, we can determine the context that the error
>>> occurred in. Since with !RIPV we have to determine in what context
>>> the error occurred in order to decide whether to panic or just kill
>>> a guest, we can''t ignore EIPV (and if it''s not
set we have to
>>> assume the worst case, since even if the registers indicate guest
>>> mode the error may have occurred in hypervisor context or accessing
>>> hypervisor structures [consider e.g. a data load error during a GDT
>>> access]). 
>>> 
>>> Jan
>> 
>> Yes, I agree EIPV=0 may indicate async error, but I think your
>> solution *overkilled* most cases (i.e. the real guest instruction
>> fetch error). 
>> 
>> Our idea is,
>>   * xen mce would flush prefetched instruction so we can delay
>> handle it until if real need; 
>>   * a h/w error will not disappear, but if it was not being
>> *consumed*, it''s OK for system keep going (like SRAO error
which do
>> not need s/w handle immediately); 
>> 
>> Suppose an async instruction fetch error (RIVP=EIVP=0), triggered at
>> guest context but instruction prefetch hypervisor context. The
>>   scenario is, * at xen mce, the prefetched instruction has been
>> flushed. xen mce handler needn''t panic, instead it mark the
page as
>> broken page, then trigger vmce to guest;
> 
> If the prefetch was from Xen space (only in guest context),
> delivering a vMCE to the guest is pointless (and perhaps confusing to
> the guest). 
> 
Yes, exactly. how about delay handle it as:
* at mce isr
	if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
		xen panic;
* at mce softirq
	if ( (srar error) && (EIPV ==0) && (broken page owned by
hypervisor) )
		xen panic;
>>   * guest may kill app, kernel thread, guest itself, or whatever;
>> 
>> The error is still an error, w/ 2 possibilities in the future:
>>   1. it may not be consumed as an SRAR error, system keep going, h/w
>> mechanism may detect a SRAO error (i.e. memroy scrub) at some time
>> point and handled then; 
>>   2. it may be consumed at some time point and a SRAR error
>>    triggered again. At this time, 1). if srar occurred at hypervisor
>>    context, xen will panic. or, 2). if srar occurred at guest
>> context, xen kill the guest as a malicious one (as what the 2nd
>> patch do), and move the page to broken page list; 
>> 
>> Considering the rare possibility of the above case, I think
it''s
>> acceptable to handle it in this way. Thoughts?
> 
> You''re only discussing instruction fetches (which can be
discarded),
> but you''re not covering the other example I gave (GDT access from
> guest context - just like this is a ring-0 operations from the paging
> unit''s pov, this ought to be an out-of-context operation from
MCE''s
> perspective). 
> 
> Jan
That would be data load error (EIPV=1), a sync error.

Thanks,
Jinsong


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Oct-11 10:13 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 11.10.11 at 11:51, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>> If the prefetch was from Xen space (only in guest context),
>> delivering a vMCE to the guest is pointless (and perhaps confusing to
>> the guest). 
>> 
> 
> Yes, exactly. how about delay handle it as:
> * at mce isr
> 	if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
> 		xen panic;
> * at mce softirq
> 	if ( (srar error) && (EIPV ==0) && (broken page owned by
hypervisor) )
> 		xen panic;
Possible, but I''m not convinced.
>>>   * guest may kill app, kernel thread, guest itself, or whatever;
>>> 
>>> The error is still an error, w/ 2 possibilities in the future:
>>>   1. it may not be consumed as an SRAR error, system keep going,
h/w
>>> mechanism may detect a SRAO error (i.e. memroy scrub) at some time
>>> point and handled then; 
>>>   2. it may be consumed at some time point and a SRAR error
>>>    triggered again. At this time, 1). if srar occurred at
hypervisor
>>>    context, xen will panic. or, 2). if srar occurred at guest
>>> context, xen kill the guest as a malicious one (as what the 2nd
>>> patch do), and move the page to broken page list; 
>>> 
>>> Considering the rare possibility of the above case, I think
it''s
>>> acceptable to handle it in this way. Thoughts?
>> 
>> You''re only discussing instruction fetches (which can be
discarded),
>> but you''re not covering the other example I gave (GDT access
from
>> guest context - just like this is a ring-0 operations from the paging
>> unit''s pov, this ought to be an out-of-context operation from
MCE''s
>> perspective). 
> 
> That would be data load error (EIPV=1), a sync error.
If indeed implemented that way in hardware, that would make the
handling ambiguous: A GDT access must not (unconditionally) be
attributed to the (pv) guest, as it is not a problem the guest can
(necessarily) deal with (considering the split page ownership of
what constitutes the GDT under Xen, the guest should only be
accountable for the non-reserved part of the GDT, the rest should
be attributed back to Xen).

The same would go for (perhaps speculative) page table walks.

Furthermore, data prefetching is possible too - how would a problem
there get reported?

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Oct-11 11:58 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan Beulich wrote:>>>> On 11.10.11 at 11:51, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>> If the prefetch was from Xen space (only in guest context),
>>> delivering a vMCE to the guest is pointless (and perhaps confusing
>>> to the guest). 
>>> 
>> 
>> Yes, exactly. how about delay handle it as:
>> * at mce isr
>> 	if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs)) 	
xen panic;
>> * at mce softirq
>> 	if ( (srar error) && (EIPV ==0) && (broken page owned
by
>> 		hypervisor) ) xen panic;
> 
> Possible, but I''m not convinced.
> 
>>>>   * guest may kill app, kernel thread, guest itself, or
whatever;
>>>> 
>>>> The error is still an error, w/ 2 possibilities in the future:
>>>>   1. it may not be consumed as an SRAR error, system keep
going,
>>>> h/w mechanism may detect a SRAO error (i.e. memroy scrub) at
some
>>>> time point and handled then; 
>>>>   2. it may be consumed at some time point and a SRAR error
>>>>    triggered again. At this time, 1). if srar occurred at
>>>>    hypervisor context, xen will panic. or, 2). if srar occurred
at
>>>> guest 
>>>> context, xen kill the guest as a malicious one (as what the 2nd
>>>> patch do), and move the page to broken page list;
>>>> 
>>>> Considering the rare possibility of the above case, I think
it''s
>>>> acceptable to handle it in this way. Thoughts?
>>> 
>>> You''re only discussing instruction fetches (which can be
discarded),
>>> but you''re not covering the other example I gave (GDT
access from
>>> guest context - just like this is a ring-0 operations from the
>>> paging unit''s pov, this ought to be an out-of-context
operation
>>> from MCE''s perspective).
>> 
>> That would be data load error (EIPV=1), a sync error.
> 
> If indeed implemented that way in hardware, that would make the
> handling ambiguous: A GDT access must not (unconditionally) be
> attributed to the (pv) guest, as it is not a problem the guest can
> (necessarily) deal with (considering the split page ownership of
> what constitutes the GDT under Xen, the guest should only be
> accountable for the non-reserved part of the GDT, the rest should
> be attributed back to Xen).
> 
> The same would go for (perhaps speculative) page table walks.
> 
Seems not ambiguous here: who own, who take.
If error caused by hypervisor access broken page, xen panic;
If error caused by guest access, guest would handle it (I guess normally kill
itself);
If guest maliciously access again, it would be killed by hypervisor.
> Furthermore, data prefetching is possible too - how would a problem
> there get reported?
> 
It may be reported as unkown error, or nothing, but not as srar data load error
w/ EIPV=1.

Thanks,
Jinsong

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Oct-11 12:51 UTC

head link

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

>>> On 11.10.11 at 13:58, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 11.10.11 at 11:51, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
>>> That would be data load error (EIPV=1), a sync error.
>> 
>> If indeed implemented that way in hardware, that would make the
>> handling ambiguous: A GDT access must not (unconditionally) be
>> attributed to the (pv) guest, as it is not a problem the guest can
>> (necessarily) deal with (considering the split page ownership of
>> what constitutes the GDT under Xen, the guest should only be
>> accountable for the non-reserved part of the GDT, the rest should
>> be attributed back to Xen).
>> 
>> The same would go for (perhaps speculative) page table walks.
>> 
> 
> Seems not ambiguous here: who own, who take.
> If error caused by hypervisor access broken page, xen panic;
> If error caused by guest access, guest would handle it (I guess normally 
> kill itself);
If a guest accesses the hypervisor part of the GDT or page tables, or
some other shared data structure owned by the hypervisor (like the
M2P table), its handler may get utterly confused by being presented
an address it doesn''t own and knows nothing about (i.e. in violation
of your "who own, who take").

And even from a theoretical pov, a hypervisor should panic if one of
its data structures got corrupted, no matter whether that was due to
its own or a guest''s access. Delaying the panic here will only lead to
the situation becoming worse. (The same would go for a "normal"
kernel: If an application causes an MCE due to e.g. a GDT access, it
shouldn''t be just the application that gets killed. Of course, with the
interesting GDT descriptors all being located close to each other,
there''s little chance the kernel would be able to handle that, but
that''s an implementation aspect of the kernel, not something that
should matter to the hardware design.)

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Liu, Jinsong

2011-Oct-21 20:25 UTC

head link

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Jan,

I update a little for my former patch, as attached.
For my former patch, you mainly have 2 concerns (list below). I double check xen
mce code, w/ my opinion append:

Concern 1: for SRAR IFU error, since RIPV=EIPV=0, it maybe an async error which
occur at guest but root from hypervisor.
[Jinsong]: 
    Yes, but EIPV didn''t tell us where the error root from
(it''s just a hint, warning us async possibility).
    It no need to overkill xen at mce isr, instead, at mce softirq we can find
out error root location and then handle accordingly:
    * at mce isr:
            /* a total insurance */
            /* if error is async, we delay handle it at mce softirq */
            if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
                return -1;
    * at mce softirq:
            /* detect error location by bank->mc_addr */
            /* handle different page OWNER cases at intel_memerr_dhandler() and
offline_page() */
            /* who own, who take */
            if (error page owner is guest)
                trigger vmce to guest;
            else
                panic xen;


Concern 2: If a guest accesses the hypervisor part of the GDT or page tables, or
some other shared data structure owned by the hypervisor (like the M2P table),
its handler may get utterly confused by being presented an address it
doesn''t own and knows nothing about.
[Jinsong]: for such cases, page owner would be dom_xen/dom_cow or NULL, but not
guest --> it would be handled at hypervisor, not triggering vmce to guest
--> who own, who take.


Thanks,
Jinsong

============================X86 MCE: Add SRAR handler

Currently Intel SDM add 2 kinds of MCE SRAR errors:
1). Data Load error, error code = 0x134
2). Instruction Fetch error, error code = 0x150
This patch add handler to these new SRAR errors.
It based on existed mce infrastructure, add code to handle SRAR specific error.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>

diff -r 1515484353c6 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Thu Oct 13 10:09:28 2011 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sat Oct 22 01:40:41 2011 +0800
@@ -37,6 +37,14 @@ static int __read_mostly nr_intel_ext_ms
  */
 #define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
 #define INTEL_SRAO_L3_EWB    0x17A
+
+/* 
+ * Currently Intel SDM define 2 kinds of srar errors:
+ * 1). Data Load error, error code = 0x134
+ * 2). Instruction Fetch error, error code = 0x150
+ */
+#define INTEL_SRAR_DATA_LOAD	0x134
+#define INTEL_SRAR_INSTR_FETCH	0x150
 
 /* Thermal Hanlding */
 #ifdef CONFIG_X86_MCE_THERMAL
@@ -256,7 +264,7 @@ static enum mce_result mce_action(struct
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &bank_result);
+                handlers[i].recovery_handler(&binfo, &bank_result,
regs);
                 if (worst_result < bank_result)
                     worst_result = bank_result;
                 break;
@@ -622,7 +630,8 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -721,6 +730,32 @@ vmce_failed:
     }
 }
 
+static int intel_srar_check(uint64_t status)
+{
+    return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
+}
+
+static void intel_srar_dhandler(
+             struct mca_binfo *binfo,
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
+{
+    uint64_t status = binfo->mib->mc_status;
+
+    /* For unknown srar error code, reset system */
+    *result = MCER_RESET;
+
+    switch ( status & INTEL_MCCOD_MASK )
+    {
+    case INTEL_SRAR_DATA_LOAD:
+    case INTEL_SRAR_INSTR_FETCH:
+        intel_memerr_dhandler(binfo, result, regs);
+        break;
+    default:
+        break;
+    }
+}
+
 static int intel_srao_check(uint64_t status)
 {
     return ( intel_check_mce_type(status) == intel_mce_ucr_srao );
@@ -728,7 +763,8 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
 
@@ -741,7 +777,7 @@ static void intel_srao_dhandler(
         {
         case INTEL_SRAO_MEM_SCRUB:
         case INTEL_SRAO_L3_EWB:
-            intel_memerr_dhandler(binfo, result);
+            intel_memerr_dhandler(binfo, result, regs);
             break;
         default:
             break;
@@ -756,14 +792,15 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs * regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
 
     type = intel_check_mce_type(status);
 
-    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+    if (type == intel_mce_fatal)
         *result = MCER_RESET;
     else
         *result = MCER_CONTINUE;
@@ -771,12 +808,14 @@ static void intel_default_mce_dhandler(
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
     {intel_srao_check, intel_srao_dhandler},
+    {intel_srar_check, intel_srar_dhandler},
     {intel_default_check, intel_default_mce_dhandler}
 };
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -785,8 +824,6 @@ static void intel_default_mce_uhandler(
 
     switch (type)
     {
-    /* Panic if no handler for SRAR error */
-    case intel_mce_ucr_srar:
     case intel_mce_fatal:
         *result = MCER_RESET;
         break;
@@ -961,10 +998,8 @@ static int intel_recoverable_scan(u64 st
     /* SRAR error */
     else if ( ser_support && !(status & MCi_STATUS_OVER) 
                 && !(status & MCi_STATUS_PCC) && (status
& MCi_STATUS_S)
-                && (status & MCi_STATUS_AR) ) {
-        mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined
currently.\n");
-        return 0;
-    }
+                && (status & MCi_STATUS_AR) && (status
& MCi_STATUS_EN) )
+        return 1;
     /* SRAO error */
     else if (ser_support && !(status & MCi_STATUS_PCC)
                 && (status & MCi_STATUS_S) && !(status
& MCi_STATUS_AR)
diff -r 1515484353c6 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Thu Oct 13 10:09:28 2011 +0200
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sat Oct 22 01:40:41 2011 +0800
@@ -151,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    enum mce_result *result);
+                    enum mce_result *result, struct cpu_user_regs *regs);
 };
 
 /* Global variables */

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2011-Nov-02 09:57 UTC

head link

[Xen-devel] Re: [PATCH] X86 MCE: Add SRAR handler

>>> On 21.10.11 at 22:25, "Liu, Jinsong"
<jinsong.liu@intel.com> wrote:
> I update a little for my former patch, as attached.
> For my former patch, you mainly have 2 concerns (list below). I double
check
> xen mce code, w/ my opinion append:
Those concerns were really just triggered by the patch, not directly
related to it. The patch itself looks okay to me.
> Concern 1: for SRAR IFU error, since RIPV=EIPV=0, it maybe an async error 
> which occur at guest but root from hypervisor.
> [Jinsong]: 
>     Yes, but EIPV didn''t tell us where the error root from
(it''s just a
> hint, warning us async possibility). 
>     It no need to overkill xen at mce isr, instead, at mce softirq we can 
> find out error root location and then handle accordingly:
>     * at mce isr:
>             /* a total insurance */
>             /* if error is async, we delay handle it at mce softirq */
>             if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
>                 return -1;
I continue to think that guest_mode() must not be used without
EIPV, no matter what the purpose of the use.
>     * at mce softirq:
>             /* detect error location by bank->mc_addr */
>             /* handle different page OWNER cases at intel_memerr_dhandler()
> and offline_page() */
>             /* who own, who take */
>             if (error page owner is guest)
>                 trigger vmce to guest;
>             else
>                 panic xen;
That part is certainly fine.
> Concern 2: If a guest accesses the hypervisor part of the GDT or page 
> tables, or some other shared data structure owned by the hypervisor (like
the
> M2P table), its handler may get utterly confused by being presented an 
> address it doesn''t own and knows nothing about.
> [Jinsong]: for such cases, page owner would be dom_xen/dom_cow or NULL, but
> not guest --> it would be handled at hypervisor, not triggering vmce to
guest -->
> who own, who take.
That latter part of your explanation is fine too, but with the caveat that
the bogus use of guest_mode() above may have an overall effect on the
behavior.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2011-Nov-08 23:59 UTC

head link

[Xen-devel] RE: [PATCH] X86 MCE: Add SRAR handler

> > Concern 1: for SRAR IFU error, since RIPV=EIPV=0, it maybe an async
error> > which occur at guest but root from hypervisor.
> > [Jinsong]:
> >     Yes, but EIPV didn''t tell us where the error root from
(it''s just a
> > hint, warning us async possibility).
> >     It no need to overkill xen at mce isr, instead, at mce softirq we
can> > find out error root location and then handle accordingly:
> >     * at mce isr:
> >             /* a total insurance */
> >             /* if error is async, we delay handle it at mce softirq */
> >             if ( !(gstatus & MCG_STATUS_RIPV) &&
!guest_mode(regs))
> >                 return -1;
> 
> I continue to think that guest_mode() must not be used without
> EIPV, no matter what the purpose of the use.
Jan, as explained before, this is purely a sanity check to make sure we can
go to next level error handling, and the check for guest_mode have nothing
to do with where the error happen, but just make sure that if the context in
the stack is for hypervisor mode, and RIPV told us that we can''t switch
context, then we can''t do anything more but just panic since
it''s complexity
to pre-emption/ context switch in hypervisor mode.

Thanks
--jyh



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Xen devel - Sep 2011 - [PATCH] X86 MCE: Add SRAR handler

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler

Re: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

RE: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler

[Xen-devel] Re: [PATCH] X86 MCE: Add SRAR handler

[Xen-devel] RE: [PATCH] X86 MCE: Add SRAR handler