public inbox for linux-arm-kernel@lists.infradead.org 
 help / color / mirror / Atom feed
From: Kiryl Shutsemau <kirill@shutemov•name>
To: Catalin Marinas <catalin.marinas@arm•com>,
	Will Deacon <will@kernel•org>, James Morse <james.morse@arm•com>
Cc: Mark Rutland <mark.rutland@arm•com>,
	Marc Zyngier <maz@kernel•org>,
	Doug Anderson <dianders@chromium•org>,
	Petr Mladek <pmladek@suse•com>,
	Thomas Gleixner <tglx@linutronix•de>,
	Andrew Morton <akpm@linux-foundation•org>,
	Baoquan He <bhe@redhat•com>, Puranjay Mohan <puranjay@kernel•org>,
	Usama Arif <usama.arif@linux•dev>,
	Breno Leitao <leitao@debian•org>,
	Julien Thierry <julien.thierry.kdev@gmail•com>,
	Lecopzer Chen <lecopzer.chen@mediatek•com>,
	Sumit Garg <sumit.garg@kernel•org>,
	kernel-team@meta•com, kexec@lists•infradead.org,
	linux-arm-kernel@lists•infradead.org,
	linux-kernel@vger•kernel.org,
	"Kiryl Shutsemau (Meta)" <kas@kernel•org>
Subject: [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI
Date: Wed,  3 Jun 2026 15:36:35 +0100	[thread overview]
Message-ID: <54cb99db3c981dc39eb3031aff5caeaadb09e8b9.1780496779.git.kas@kernel.org> (raw)
In-Reply-To: <cover.1780496779.git.kas@kernel.org>

From: "Kiryl Shutsemau (Meta)" <kas@kernel•org>

Add SDEI as the final rung after the normal stop IPI (and the pseudo-NMI
IPI, if enabled): signal event 0 at the CPUs still online; the event
handler runs crash_save_cpu() on the wedged context and parks the CPU.
SDEI only ever touches CPUs the normal path couldn't reach.

SDEI is last because a CPU parked in the handler never completes the
event, so it is less recoverable -- a cost paid only when nothing else
worked.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel•org>
---
 arch/arm64/include/asm/nmi.h |   6 ++
 arch/arm64/kernel/smp.c      |  24 ++++++
 drivers/firmware/Kconfig     |   1 +
 drivers/firmware/sdei_nmi.c  | 137 ++++++++++++++++++++++++++++++++++-
 4 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
index ccdb75692e9d..e3edfb24fc08 100644
--- a/arch/arm64/include/asm/nmi.h
+++ b/arch/arm64/include/asm/nmi.h
@@ -13,12 +13,18 @@
  */
 #ifdef CONFIG_ARM_SDEI_NMI
 bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+bool sdei_nmi_crash_smp_send_stop(void);
 #else
 static inline bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
 						      int exclude_cpu)
 {
 	return false;
 }
+
+static inline bool sdei_nmi_crash_smp_send_stop(void)
+{
+	return false;
+}
 #endif
 
 #endif /* __ASM_NMI_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 656b8417af72..386ddd526b48 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1288,8 +1288,32 @@ void crash_smp_send_stop(void)
 		return;
 	crash_stop = 1;
 
+	/*
+	 * Stop the normal way first: IPI_CPU_STOP escalating to a pseudo-NMI
+	 * IPI. Every CPU that responds saves its state via crash_save_cpu()
+	 * and parks in cpu_park_loop() with its online bit cleared -- the
+	 * standard kdump stop, identical to a kernel without SDEI. Crucially
+	 * those CPUs stay in a clean, potentially-reusable state.
+	 */
 	smp_send_stop();
 
+	/*
+	 * Whatever is still online didn't respond -- typically a CPU wedged
+	 * with interrupts masked. The plain IPI can't reach it, and a fleet
+	 * that declines the pseudo-NMI hot-path cost has no NMI IPI to
+	 * escalate to. Hit only the survivors with the SDEI cross-CPU NMI
+	 * (no-op if SDEI isn't active, or if everything already stopped):
+	 * firmware delivers out of EL3 regardless of PSTATE.DAIF, and the
+	 * handler captures crash_save_cpu() state from the wedged context
+	 * before parking the CPU.
+	 *
+	 * SDEI is deliberately last: an SDEI-stopped CPU never completes its
+	 * event (it parks inside the handler, so EL3 retains its dispatch
+	 * slot until reset), which is strictly less recoverable than a normal
+	 * stop. We pay that only for CPUs that left no other way to reach them.
+	 */
+	sdei_nmi_crash_smp_send_stop();
+
 	sdei_handler_abort();
 }
 
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 552eff7b9bc3..84aead609406 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -49,6 +49,7 @@ config ARM_SDEI_NMI
 	      hung-task auxiliary dumps)
 	    - the hardlockup watchdog backend, when HARDLOCKUP_DETECTOR is
 	      also enabled
+	    - crash_smp_send_stop()              (panic / kdump path)
 
 	  The driver registers a handler for the SDEI software-signalled
 	  event (event 0) and reaches a target CPU by signalling it with
diff --git a/drivers/firmware/sdei_nmi.c b/drivers/firmware/sdei_nmi.c
index 51e220d4083d..ad8fbb1c90a6 100644
--- a/drivers/firmware/sdei_nmi.c
+++ b/drivers/firmware/sdei_nmi.c
@@ -29,6 +29,11 @@
  *     hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
  *     dumps all reach interrupt-masked CPUs.
  *
+ *   - sdei_nmi_crash_smp_send_stop() — called from arm64's
+ *     crash_smp_send_stop(); the panic/kdump last resort for CPUs that
+ *     didn't answer the normal stop IPI, capturing the wedged context
+ *     into the vmcore before parking the CPU.
+ *
  *   - the hardlockup-detector backend (watchdog_hardlockup_enable/
  *     disable/probe()), when CONFIG_HARDLOCKUP_DETECTOR is also on.
  *     ARM_SDEI_NMI selects HAVE_HARDLOCKUP_DETECTOR_ARCH, so the
@@ -50,11 +55,15 @@
 #define pr_fmt(fmt) "sdei_nmi: " fmt
 
 #include <linux/arm_sdei.h>
+#include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/err.h>
 #include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/nmi.h>
 #include <linux/percpu-defs.h>
 #include <linux/perf_event.h>
@@ -72,8 +81,66 @@ static bool sdei_nmi_available;
 
 #define SDEI_NMI_EVENT			0
 
+/*
+ * Crash-stop dispatch lives on the same SDEI event 0 as everything else.
+ * The requesting CPU sets sdei_nmi_crash_stop_requested for each target
+ * before signalling event 0; the target's handler clears it, saves crash
+ * state, marks itself offline, sets sdei_nmi_crash_stop_acked so the
+ * requester knows the target is down, and only then parks (forever).
+ *
+ * Using a per-CPU flag rather than a separate SDEI event avoids needing
+ * extra registrations from firmware. The SDEI_EVENT_SIGNAL SMC is itself
+ * a write barrier, so a WRITE_ONCE() before the signal is sufficient
+ * ordering against the handler's READ_ONCE() on the target.
+ */
+static DEFINE_PER_CPU(unsigned long, sdei_nmi_crash_stop_requested);
+static DEFINE_PER_CPU(unsigned long, sdei_nmi_crash_stop_acked);
+
 static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
 {
+	int cpu = smp_processor_id();
+
+	if (READ_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested))) {
+		WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested), 0);
+
+		/*
+		 * Capture the wedged context for kdump while pt_regs still
+		 * points at the interrupted PC. This is the main motivation
+		 * for using SDEI here: the plain IPI stop path can't reach an
+		 * interrupt-masked CPU (and the fleet declines pseudo-NMI to
+		 * keep the IRQ-mask hot path cheap), so crash_save_cpu() for
+		 * that CPU would otherwise record nothing useful.
+		 */
+		crash_save_cpu(regs, cpu);
+		set_cpu_online(cpu, false);
+
+		/* publish the crash state/offline before the requester sees the ack */
+		smp_wmb();
+		WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_acked), 1);
+
+		/*
+		 * Park forever from within the SDEI handler. We deliberately
+		 * do NOT issue SDEI_EVENT_COMPLETE: the framework's return
+		 * path restores firmware's saved interrupted context, which
+		 * would land the CPU back wherever it was running (often
+		 * do_idle, which then notices cpu_is_offline=true and BUGs
+		 * at cpuhp_report_idle_dead). Returning the modified pt_regs
+		 * doesn't help -- arch/arm64/kernel/sdei.c::do_sdei_event
+		 * only honours a PC override via its IRQ-state heuristic
+		 * and otherwise hands EL3 its own saved-context slot back.
+		 *
+		 * Trade-off: EL3 firmware retains ~one saved-context slot
+		 * per parked CPU until the next hardware reset (~hundreds of
+		 * bytes per CPU). The CPU itself is parked in cpu_park_loop
+		 * just as IPI_CPU_STOP would park it, but with its SDEI event
+		 * never completed -- which is why crash_smp_send_stop() treats
+		 * this as less recoverable than a normal stop and tries it
+		 * only after the IPI / pseudo-NMI paths have failed.
+		 */
+		cpu_park_loop();
+		/* unreachable */
+	}
+
 	/*
 	 * Both consumers no-op on a CPU that wasn't actually requested:
 	 * nmi_cpu_backtrace() unless this CPU's bit is set in the global
@@ -84,7 +151,7 @@ static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
 	 */
 	nmi_cpu_backtrace(regs);
 #ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
-	watchdog_hardlockup_check(smp_processor_id(), regs);
+	watchdog_hardlockup_check(cpu, regs);
 #endif
 	return SDEI_EV_HANDLED;
 }
@@ -133,6 +200,74 @@ bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
 	return true;
 }
 
+/*
+ * Last-resort half of arm64's crash_smp_send_stop() (see
+ * arch/arm64/kernel/smp.c). The caller runs the normal IPI / pseudo-NMI
+ * stop first; whatever is left in cpu_online_mask by the time we're
+ * called are the CPUs that didn't respond -- wedged with interrupts
+ * masked, unreachable by those paths. We snapshot that residual mask,
+ * set each survivor's per-CPU crash-stop request flag, signal event 0
+ * at it, and poll for acks. The handler captures crash_save_cpu() state
+ * and parks the CPU (without completing the SDEI event, see
+ * sdei_nmi_handler()).
+ *
+ * Because SDEI-stopped CPUs are less recoverable than normally-stopped
+ * ones, this is intentionally the fallback, not the first choice -- it
+ * only ever runs against CPUs the normal path already gave up on.
+ *
+ * Returns true when SDEI was active and this path ran (even if some CPU
+ * failed to ack within the timeout, or there were no survivors to stop);
+ * false when SDEI isn't active, leaving the caller's normal-path result
+ * as the final word.
+ */
+bool sdei_nmi_crash_smp_send_stop(void)
+{
+	unsigned int this_cpu, cpu, remaining = 0;
+	unsigned long timeout;
+	cpumask_t mask;
+
+	if (!sdei_nmi_available)
+		return false;
+
+	this_cpu = smp_processor_id();
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(this_cpu, &mask);
+	if (cpumask_empty(&mask))
+		return true;
+
+	for_each_cpu(cpu, &mask) {
+		WRITE_ONCE(per_cpu(sdei_nmi_crash_stop_acked, cpu), 0);
+		WRITE_ONCE(per_cpu(sdei_nmi_crash_stop_requested, cpu), 1);
+	}
+	/* Order the flag stores before the SMCs that raise the event. */
+	smp_wmb();
+
+	for_each_cpu(cpu, &mask)
+		sdei_nmi_fire(cpu);
+
+	/*
+	 * Poll up to ~100ms: the existing pseudo-NMI stop wait (10ms) plus
+	 * headroom for slow firmware. The acquire load pairs with the
+	 * handler's smp_wmb() so its crash state/offline bit are visible.
+	 */
+	timeout = USEC_PER_MSEC * 100;
+	while (timeout--) {
+		remaining = 0;
+		for_each_cpu(cpu, &mask)
+			if (!smp_load_acquire(&per_cpu(sdei_nmi_crash_stop_acked, cpu)))
+				remaining++;
+		if (!remaining)
+			break;
+		udelay(1);
+	}
+
+	if (remaining)
+		pr_warn("crash_stop: %u CPU(s) did not ack within 100ms\n",
+			remaining);
+
+	return true;
+}
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
 
 /*
-- 
2.54.0



      parent reply	other threads:[~2026-06-03 14:36 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-03 14:36 [PATCH 0/4] arm64: cross-CPU NMI via SDEI Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 1/4] firmware: arm_sdei: add SDEI_EVENT_SIGNAL support Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64 Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog Kiryl Shutsemau
2026-06-03 14:36 ` Kiryl Shutsemau [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=54cb99db3c981dc39eb3031aff5caeaadb09e8b9.1780496779.git.kas@kernel.org \
    --to=kirill@shutemov$(echo .)name \
    --cc=akpm@linux-foundation$(echo .)org \
    --cc=bhe@redhat$(echo .)com \
    --cc=catalin.marinas@arm$(echo .)com \
    --cc=dianders@chromium$(echo .)org \
    --cc=james.morse@arm$(echo .)com \
    --cc=julien.thierry.kdev@gmail$(echo .)com \
    --cc=kas@kernel$(echo .)org \
    --cc=kernel-team@meta$(echo .)com \
    --cc=kexec@lists$(echo .)infradead.org \
    --cc=lecopzer.chen@mediatek$(echo .)com \
    --cc=leitao@debian$(echo .)org \
    --cc=linux-arm-kernel@lists$(echo .)infradead.org \
    --cc=linux-kernel@vger$(echo .)kernel.org \
    --cc=mark.rutland@arm$(echo .)com \
    --cc=maz@kernel$(echo .)org \
    --cc=pmladek@suse$(echo .)com \
    --cc=puranjay@kernel$(echo .)org \
    --cc=sumit.garg@kernel$(echo .)org \
    --cc=tglx@linutronix$(echo .)de \
    --cc=usama.arif@linux$(echo .)dev \
    --cc=will@kernel$(echo .)org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox