Implement the Hardware Error Exception trap handler for synchronous hardware error handling on the RISC-V architecture. This enables the OS to receive hardware error notifications from firmware through the standardized ACPI HEST (Hardware Error Source Table) interface. The implementation includes: - A new exception vector entry for the Hardware Error Exception - A trap handler (do_trap_hardware_error) that processes hardware errors in both kernel mode (currently panics) and user mode (delivers SIGBUS) - Integration with APEI GHES (Generic Hardware Error Source) to report hardware errors from firmware This change enables RISC-V systems with ACPI to handle synchronous hardware errors in a firmware-first manner. Signed-off-by: Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx> --- arch/riscv/include/asm/acpi.h | 2 ++ arch/riscv/kernel/acpi.c | 55 +++++++++++++++++++++++++++++++++++ arch/riscv/kernel/entry.S | 4 +++ arch/riscv/kernel/traps.c | 19 ++++++++++++ 4 files changed, 80 insertions(+) diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h index 0c599452ef48..ae861885b97d 100644 --- a/arch/riscv/include/asm/acpi.h +++ b/arch/riscv/include/asm/acpi.h @@ -91,6 +91,7 @@ int acpi_get_riscv_isa(struct acpi_table_header *table, void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size, u32 *cboz_size, u32 *cbop_size); +int apei_claim_hee(struct pt_regs *regs); #else static inline void acpi_init_rintc_map(void) { } static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu) @@ -108,6 +109,7 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size, u32 *cboz_size, u32 *cbop_size) { } +static inline int apei_claim_hee(struct pt_regs *regs) { return -ENOENT; } #endif /* CONFIG_ACPI */ #ifdef CONFIG_ACPI_NUMA diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 3f6d5a6789e8..928f9474bfee 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -20,6 +20,11 @@ #include <linux/of_fdt.h> #include 
<linux/pci.h>
 #include <linux/serial_core.h>
+#include <linux/efi.h>
+#include <linux/irq_work.h>
+#include <linux/nmi.h>
+#include <acpi/ghes.h>
+#include <asm/csr.h>
 
 int acpi_noirq = 1; /* skip ACPI IRQ initialization */
 int acpi_disabled = 1;
@@ -334,3 +339,53 @@ int raw_pci_write(unsigned int domain, unsigned int bus,
 }
 
 #endif /* CONFIG_PCI */
+
+/*
+ * Claim a Hardware Error Exception as a firmware-first notification.
+ * @regs is the trap frame, or NULL when called from process context.
+ *
+ * Returns 0 once the deferred APEI work has run, -EINPROGRESS if it is
+ * still queued, otherwise the ghes_notify_hee() error (-ENOENT w/o GHES).
+ */
+int apei_claim_hee(struct pt_regs *regs)
+{
+	int err = -ENOENT;
+	bool return_to_irqs_enabled;
+	unsigned long flags;
+
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+		return err;
+
+	/* Mask IRQs; @flags keeps the caller's interrupt state */
+	local_irq_save(flags);
+	return_to_irqs_enabled = !irqs_disabled_flags(flags);
+
+	/* A trap clears SIE; the interrupted context's state is in PIE */
+	if (regs)
+		return_to_irqs_enabled = (regs->status & SR_PIE) != 0;
+
+	/*
+	 * HEE can interrupt other operations, handle as NMI-like context
+	 * to ensure proper APEI processing
+	 */
+	nmi_enter();
+	err = ghes_notify_hee();
+	nmi_exit();
+
+	/*
+	 * APEI NMI-like notifications are deferred to irq_work. Unless we
+	 * interrupted irqs-masked code, run it now, with IRQs still masked.
+	 */
+	if (!err) {
+		if (return_to_irqs_enabled) {
+			irq_work_run();
+		} else {
+			pr_warn_ratelimited("APEI work queued but not completed\n");
+			err = -EINPROGRESS;
+		}
+	}
+
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL(apei_claim_hee);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 3a0ec6fd5956..1cbefe934d84 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -459,6 +459,10 @@ SYM_DATA_START_LOCAL(excp_vect_table)
 	RISCV_PTR do_page_fault /* load page fault */
 	RISCV_PTR do_trap_unknown
 	RISCV_PTR do_page_fault /* store page fault */
+	RISCV_PTR do_trap_unknown /* 16: double trap, not handled */
+	RISCV_PTR do_trap_unknown /* 17: reserved */
+	RISCV_PTR do_trap_unknown /* 18: software check, not handled */
+	RISCV_PTR do_trap_hardware_error /* Hardware Error */
 SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end)
 
 #ifndef CONFIG_MMU
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 80230de167de..48f1ea1e03e6 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -22,6 +22,7 @@
 #include <linux/irq.h>
 #include <linux/kexec.h>
 #include <linux/entry-common.h>
+#include <linux/acpi.h>
 
 #include <asm/asm-prototypes.h>
 #include <asm/bug.h>
@@ -442,3 +443,21 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
 	wait_for_interrupt();
 }
 #endif
+
+asmlinkage __visible __trap_section void do_trap_hardware_error(struct pt_regs *regs)
+{
+	if (user_mode(regs)) {
+		irqentry_enter_from_user_mode(regs);
+		/* SIGBUS unless APEI claimed and fully processed the error */
+		if (apei_claim_hee(regs))
+			do_trap_error(regs, SIGBUS, BUS_OBJERR, regs->badaddr, "Hardware Error");
+
+		irqentry_exit_to_user_mode(regs);
+	} else {
+		irqentry_state_t state = irqentry_nmi_enter(regs);
+
+		/* No recovery is attempted for errors taken in kernel mode */
+		die(regs, "Hardware Error");
+		irqentry_nmi_exit(regs, state);
+	}
+}
-- 
2.43.7