Joerg Roedel
2021-Jun-10 09:11 UTC
[PATCH v4 3/6] x86/sev-es: Split up runtime #VC handler for correct state tracking
From: Joerg Roedel <jroedel at suse.de>
Split up the #VC handler code into a from-user and a from-kernel part.
This allows clean and correct state tracking, as the #VC handler needs
to enter NMI-state when raised from kernel mode and plain IRQ state when
raised from user-mode.
Fixes: 62441a1fb532 ("x86/sev-es: Correctly track IRQ states in runtime #VC
handler")
Suggested-by: Peter Zijlstra <peterz at infradead.org>
Signed-off-by: Joerg Roedel <jroedel at suse.de>
---
arch/x86/kernel/sev.c | 118 ++++++++++++++++++++++++------------------
1 file changed, 68 insertions(+), 50 deletions(-)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 2a922d1b03c8..475bbc1b3547 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1326,43 +1326,14 @@ static __always_inline bool on_vc_fallback_stack(struct
pt_regs *regs)
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp <
__this_cpu_ist_top_va(VC2));
}
-/*
- * Main #VC exception handler. It is called when the entry code was able to
- * switch off the IST to a safe kernel stack.
- *
- * With the current implementation it is always possible to switch to a safe
- * stack because #VC exceptions only happen at known places, like intercepted
- * instructions or accesses to MMIO areas/IO ports. They can also happen with
- * code instrumentation when the hypervisor intercepts #DB, but the critical
- * paths are forbidden to be instrumented, so #DB exceptions currently also
- * only happen in safe places.
- */
-DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long
error_code)
{
- irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
unsigned long flags;
struct ghcb *ghcb;
-
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
- vc_handle_trap_db(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
-
- /*
- * This is invoked through an interrupt gate, so IRQs are disabled. The
- * code below might walk page-tables for user or kernel addresses, so
- * keep the IRQs disabled to protect us against concurrent TLB flushes.
- */
+ bool ret = true;
ghcb = sev_es_get_ghcb(&state, &flags);
@@ -1382,15 +1353,18 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
case ES_UNSUPPORTED:
pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP:
0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx
IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP:
0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
@@ -1406,24 +1380,16 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
BUG();
}
-out:
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
+ return ret;
+}
- return;
+static void vc_handle_from_kernel(struct pt_regs *regs, unsigned long
error_code)
+{
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
-fail:
- if (user_mode(regs)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- } else {
- pr_emerg("PANIC: Unhandled #VC exception in kernel space
(result=%d)\n",
- result);
+ instrumentation_begin();
+ if (!vc_raw_handle_exception(regs, error_code)) {
/* Show some debug info */
show_regs(regs);
@@ -1434,7 +1400,59 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
panic("Returned from Terminate-Request to Hypervisor\n");
}
- goto out;
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static void vc_handle_from_user(struct pt_regs *regs, unsigned long error_code)
+{
+ irqentry_state_t irq_state = irqentry_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit(regs, irq_state);
+}
+/*
+ * Main #VC exception handler. It is called when the entry code was able to
+ * switch off the IST to a safe kernel stack.
+ *
+ * With the current implementation it is always possible to switch to a safe
+ * stack because #VC exceptions only happen at known places, like intercepted
+ * instructions or accesses to MMIO areas/IO ports. They can also happen with
+ * code instrumentation when the hypervisor intercepts #DB, but the critical
+ * paths are forbidden to be instrumented, so #DB exceptions currently also
+ * only happen in safe places.
+ */
+DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
+ vc_handle_trap_db(regs);
+ return;
+ }
+
+ /*
+ * This is invoked through an interrupt gate, so IRQs are disabled. The
+ * code below might walk page-tables for user or kernel addresses, so
+ * keep the IRQs disabled to protect us against concurrent TLB flushes.
+ */
+
+ if (user_mode(regs))
+ vc_handle_from_user(regs, error_code);
+ else
+ vc_handle_from_kernel(regs, error_code);
}
/* This handler runs on the #VC fall-back stack. It can cause further #VC
exceptions */
--
2.31.1
Peter Zijlstra
2021-Jun-10 10:19 UTC
[PATCH v4 3/6] x86/sev-es: Split up runtime #VC handler for correct state tracking
Bah, I suppose the trouble is that this SEV crap requires PARAVIRT? I should really get around to fixing noinstr validation with PARAVIRT on :-( On Thu, Jun 10, 2021 at 11:11:38AM +0200, Joerg Roedel wrote:> +static void vc_handle_from_kernel(struct pt_regs *regs, unsigned long error_code)static noinstr ...> +{ > + irqentry_state_t irq_state = irqentry_nmi_enter(regs); > > + instrumentation_begin(); > > + if (!vc_raw_handle_exception(regs, error_code)) { > /* Show some debug info */ > show_regs(regs); > > @@ -1434,7 +1400,59 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) > panic("Returned from Terminate-Request to Hypervisor\n"); > } > > + instrumentation_end(); > + irqentry_nmi_exit(regs, irq_state); > +} > + > +static void vc_handle_from_user(struct pt_regs *regs, unsigned long error_code)static noinstr ...> +{ > + irqentry_state_t irq_state = irqentry_enter(regs); > + > + instrumentation_begin(); > + > + if (!vc_raw_handle_exception(regs, error_code)) { > + /* > + * Do not kill the machine if user-space triggered the > + * exception. Send SIGBUS instead and let user-space deal with > + * it. > + */ > + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); > + } > + > + instrumentation_end(); > + irqentry_exit(regs, irq_state); > +}+ linebreak> +/* > + * Main #VC exception handler. It is called when the entry code was able to > + * switch off the IST to a safe kernel stack. > + * > + * With the current implementation it is always possible to switch to a safe > + * stack because #VC exceptions only happen at known places, like intercepted > + * instructions or accesses to MMIO areas/IO ports. They can also happen with > + * code instrumentation when the hypervisor intercepts #DB, but the critical > + * paths are forbidden to be instrumented, so #DB exceptions currently also > + * only happen in safe places. > + */ > +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) > +{ > + /* > + * Handle #DB before calling into !noinstr code to avoid recursive #DB. > + */ > + if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { > + vc_handle_trap_db(regs); > + return; > + } > + > + /* > + * This is invoked through an interrupt gate, so IRQs are disabled. The > + * code below might walk page-tables for user or kernel addresses, so > + * keep the IRQs disabled to protect us against concurrent TLB flushes. > + */ > + > + if (user_mode(regs)) > + vc_handle_from_user(regs, error_code); > + else > + vc_handle_from_kernel(regs, error_code); > }#DB and MCE use idtentry_mce_db and split out in asm. When I look at idtentry_vc, it appears to me that VC_SAFE_STACK already implies from-user, or am I reading that wrong? Ah, it appears you're muddling things up again by then also calling safe_stack_ from exc_. How about you don't do that and have exc_ call your new from_kernel function, then we know that safe_stack_ is always from-user. Then also maybe do: s/VS_SAFE_STACK/VC_USER/ s/safe_stack_/noist_/ to match all the others (#DB/MCE). Also, AFAICT, you don't actually need DEFINE_IDTENTRY_VC_IST, it doesn't have an ASM counterpart. So then you end up with something like: DEFINE_IDTENTRY_VC(exc_vc) { if (unlikely(on_vc_fallback_stack(regs))) { instrumentation_begin(); panic("boohooo\n"); instrumentation_end(); } vc_from_kernel(); } DEFINE_IDTENTRY_VC_USER(exc_vc) { vc_from_user(); } Which is, I'm thinking, much simpler, no?