public inbox for linux-next@vger.kernel.org 
 help / color / mirror / Atom feed
From: Yazen Ghannam <yazen.ghannam@amd•com>
To: Bert Karwatzki <spasswolf@web•de>
Cc: Nikolay Borisov <nik.borisov@suse•com>,
	Borislav Petkov <bp@alien8•de>, Tony Luck <tony.luck@intel•com>,
	linux-kernel@vger•kernel.org, linux-next@vger•kernel.org,
	linux-edac@vger•kernel.org, linux-acpi@vger•kernel.org,
	x86@kernel•org, rafael@kernel•org, qiuxu.zhuo@intel•com,
	Smita.KoralahalliChannabasappa@amd•com
Subject: Re: spurious mce Hardware Error messages in next-20250912
Date: Thu, 9 Oct 2025 09:20:55 -0400	[thread overview]
Message-ID: <20251009132055.GA472268@yaz-khff2.amd.com> (raw)
In-Reply-To: <67c7de1011ea7b8863051889ee2a41512fb0e044.camel@web.de>

On Fri, Sep 19, 2025 at 12:07:15AM +0200, Bert Karwatzki wrote:
> Am Donnerstag, dem 18.09.2025 um 17:00 -0400 schrieb Yazen Ghannam:
> 

[...]

> 
> [  333.337523] [      C0] mce: DEBUG: CPU0 Bank:11 Status:0x8724aa0800000000
> [  333.337532] [      C0] mce: DEBUG: CPU0 Bank:14 Status:0x8724a98800000000

Thanks Bert for gathering the data.

We still don't have a system that shows this behavior. But I was able to
simulate it by manually writing the register values.

Can you please try the patch below?

This adds additional checks to ignore invalid values. And it addresses
feedback from Nikolay about clearing status registers later.

If this works for you, then I can squash this into another revision of
the patch set.

Thanks,
Yazen


From 11cdf1e18faa343c1786f6ac47f663937252c4d1 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd•com>
Date: Mon, 22 Sep 2025 20:26:06 +0000
Subject: [PATCH] x86/mce: Rework DFR handling flow

Add a flag to poll for Deferred errors similar to MCP_UC for
uncorrectable errors. This will do checks specific to deferred errors
and fall back to common UC/CE checks otherwise.

Also, clear the MCA_DESTAT register at the end of the handler rather
than the beginning.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd•com>
---
 arch/x86/include/asm/mce.h     |  1 +
 arch/x86/kernel/cpu/mce/amd.c  | 13 ++++++++----
 arch/x86/kernel/cpu/mce/core.c | 36 ++++++++++++++++++++--------------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1cfbfff0be3f..9652fc11860d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -299,6 +299,7 @@ enum mcp_flags {
 	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
 	MCP_UC		= BIT(1),	/* log uncorrected errors */
 	MCP_QUEUE_LOG	= BIT(2),	/* only queue to genpool */
+	MCP_DFR		= BIT(3),	/* log deferred errors */
 };
 
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9b746080351f..83fad4503b1c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -839,7 +839,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
 /* APIC interrupt handler for deferred errors */
 static void amd_deferred_error_interrupt(void)
 {
-	machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+	machine_check_poll(MCP_TIMESTAMP | MCP_DFR, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
 }
 
 void mce_amd_handle_storm(unsigned int bank, bool on)
@@ -865,10 +865,15 @@ void amd_clear_bank(struct mce *m)
 {
 	amd_reset_thr_limit(m->bank);
 
-	if (m->kflags & MCE_CHECK_DFR_REGS)
+	/* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+	if (m->status & MCI_STATUS_DEFERRED)
 		mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
-	else
-		mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+
+	/* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+	if (m->kflags & MCE_CHECK_DFR_REGS)
+		return;
+
+	mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e2d51609d2cb..960efee4be3e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -731,27 +731,26 @@ static bool smca_should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *
 	struct mce *m = &err->m;
 
 	/*
-	 * If this is a deferred error found in MCA_STATUS, then clear
-	 * the redundant data from the MCA_DESTAT register.
+	 * If the MCA_STATUS register has a deferred error, then continue using it as
+	 * the status register.
+	 *
+	 * MCA_DESTAT will be cleared at the end of the handler.
 	 */
-	if (m->status & MCI_STATUS_VAL) {
-		if (m->status & MCI_STATUS_DEFERRED)
-			mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
-
+	if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED))
 		return true;
-	}
 
 	/*
-	 * If the MCA_DESTAT register has valid data, then use
-	 * it as the status register.
+	 * If the MCA_DESTAT register has a deferred error, then use it instead.
+	 *
+	 * MCA_STATUS will not be cleared at the end of the handler.
 	 */
 	m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+	if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+		m->kflags |= MCE_CHECK_DFR_REGS;
+		return true;
+	}
 
-	if (!(m->status & MCI_STATUS_VAL))
-		return false;
-
-	m->kflags |= MCE_CHECK_DFR_REGS;
-	return true;
+	return false;
 }
 
 /*
@@ -780,13 +779,17 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
 {
 	struct mce *m = &err->m;
 
-	if (mce_flags.smca)
+	if (flags & MCP_DFR)
 		return smca_should_log_poll_error(flags, err);
 
 	/* If this entry is not valid, ignore it. */
 	if (!(m->status & MCI_STATUS_VAL))
 		return false;
 
+	/* Ignore deferred errors if not looking for them (MCP_DFR not set). */
+	if (m->status & MCI_STATUS_DEFERRED)
+		return false;
+
 	/*
 	 * If we are logging everything (at CPU online) or this
 	 * is a corrected error, then we must log it.
@@ -1924,6 +1927,9 @@ static void __mcheck_cpu_init_prepare_banks(void)
 
 		bitmap_fill(all_banks, MAX_NR_BANKS);
 		machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+
+		if (mce_flags.smca)
+			machine_check_poll(MCP_DFR | MCP_QUEUE_LOG, &all_banks);
 	}
 
 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
-- 
2.51.0


  reply	other threads:[~2025-10-09 13:21 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-15  1:00 spurious mce Hardware Error messages in next-20250912 Bert Karwatzki
2025-09-15 17:55 ` Yazen Ghannam
2025-09-15 21:03   ` Bert Karwatzki
2025-09-15 21:43     ` Bert Karwatzki
2025-09-16  9:10       ` Borislav Petkov
2025-09-16 14:07         ` Yazen Ghannam
2025-09-16 20:27           ` Bert Karwatzki
2025-09-17  7:13             ` Bert Karwatzki
2025-09-17 14:41               ` Yazen Ghannam
2025-09-17 15:33                 ` Bert Karwatzki
2025-09-17 19:26                   ` Yazen Ghannam
2025-09-17 21:15                     ` Yazen Ghannam
2025-09-17 22:01                       ` Bert Karwatzki
2025-09-18 10:20                     ` Nikolay Borisov
2025-09-18 21:00                       ` Yazen Ghannam
2025-09-18 21:04                         ` Luck, Tony
2025-09-18 21:14                           ` Yazen Ghannam
2025-09-18 22:07                         ` Bert Karwatzki
2025-10-09 13:20                           ` Yazen Ghannam [this message]
2026-02-12 12:50                             ` spurious (?) mce Hardware Error messages in v6.19 Bert Karwatzki
2026-02-13 12:45                               ` Bert Karwatzki
2026-02-16 20:25                               ` Yazen Ghannam
2026-02-19 14:33                                 ` Yazen Ghannam
2026-02-19 15:43                                   ` Bert Karwatzki
2026-02-20 16:49                                     ` Mario Limonciello
2026-02-20 18:24                                       ` Bert Karwatzki
2026-02-23 21:53                                         ` Yazen Ghannam
2026-04-03 14:05                                           ` Borislav Petkov
2026-04-05  8:47                                             ` Bert Karwatzki
2026-04-05 10:46                                               ` Borislav Petkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251009132055.GA472268@yaz-khff2.amd.com \
    --to=yazen.ghannam@amd$(echo .)com \
    --cc=Smita.KoralahalliChannabasappa@amd$(echo .)com \
    --cc=bp@alien8$(echo .)de \
    --cc=linux-acpi@vger$(echo .)kernel.org \
    --cc=linux-edac@vger$(echo .)kernel.org \
    --cc=linux-kernel@vger$(echo .)kernel.org \
    --cc=linux-next@vger$(echo .)kernel.org \
    --cc=nik.borisov@suse$(echo .)com \
    --cc=qiuxu.zhuo@intel$(echo .)com \
    --cc=rafael@kernel$(echo .)org \
    --cc=spasswolf@web$(echo .)de \
    --cc=tony.luck@intel$(echo .)com \
    --cc=x86@kernel$(echo .)org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox