Skip to content

Commit

Permalink
panic, x86: Fix re-entrance problem due to panic on NMI
Browse files Browse the repository at this point in the history
If panic on NMI happens just after panic() on the same CPU, panic() is
recursively called. Kernel stalls, as a result, after failing to acquire
panic_lock.

To avoid this problem, don't call panic() in NMI context if we've
already entered panic().

For that, introduce nmi_panic() macro to reduce code duplication. In
the case of panic on NMI, don't return from NMI handlers if another CPU
already panicked.

Signed-off-by: Hidehiro Kawai <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: Aaron Tomlin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Baoquan He <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Gobinda Charan Maji <[email protected]>
Cc: HATAYAMA Daisuke <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Javi Merino <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: lkml <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Michal Nazarewicz <[email protected]>
Cc: Nicolas Iooss <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Prarit Bhargava <[email protected]>
Cc: Rasmus Villemoes <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Seth Jennings <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ulrich Obergfell <[email protected]>
Cc: Vitaly Kuznetsov <[email protected]>
Cc: Vivek Goyal <[email protected]>
Link: http://lkml.kernel.org/r/20151210014626.25437.13302.stgit@softrs
[ Cleanup comments, fixup formatting. ]
Signed-off-by: Borislav Petkov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
  • Loading branch information
Hidehiro Kawai authored and KAGA-KOKO committed Dec 19, 2015
1 parent d267b8d commit 1717f20
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 8 deletions.
16 changes: 12 additions & 4 deletions arch/x86/kernel/nmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
#endif

if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
nmi_panic("NMI: Not continuing");

pr_emerg("Dazed and confused, but trying to continue\n");

Expand All @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason, smp_processor_id());
show_regs(regs);

if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
if (panic_on_io_nmi) {
nmi_panic("NMI IOCK error: Not continuing");

/*
* If we end up here, it means we have received an NMI while
* processing panic(). Simply return without delaying and
* re-enabling NMIs.
*/
return;
}

/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
Expand Down Expand Up @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)

pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
nmi_panic("NMI: Not continuing");

pr_emerg("Dazed and confused, but trying to continue\n");
}
Expand Down
20 changes: 20 additions & 0 deletions include/linux/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,26 @@ extern int sysctl_panic_on_stackoverflow;

extern bool crash_kexec_post_notifiers;

/*
* panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
* holds a CPU number which is executing panic() currently. A value of
* PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
*/
extern atomic_t panic_cpu;
#define PANIC_CPU_INVALID -1

/*
* A variant of panic() called from NMI context. We return if we've already
* panicked on this CPU.
*/
#define nmi_panic(fmt, ...) \
do { \
int cpu = raw_smp_processor_id(); \
\
if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \
panic(fmt, ##__VA_ARGS__); \
} while (0)

/*
* Only to be used by arch init code. If the user over-wrote the default
* CONFIG_PANIC_TIMEOUT, honor it.
Expand Down
16 changes: 13 additions & 3 deletions kernel/panic.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ void __weak panic_smp_self_stop(void)
cpu_relax();
}

atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);

/**
* panic - halt the system
* @fmt: The text string to print
Expand All @@ -71,17 +73,17 @@ void __weak panic_smp_self_stop(void)
*/
void panic(const char *fmt, ...)
{
static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;

/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
* after the panic_lock is acquired) from invoking panic again.
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();

Expand All @@ -94,8 +96,16 @@ void panic(const char *fmt, ...)
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
* `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
* comes here, so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
if (!spin_trylock(&panic_lock))
this_cpu = raw_smp_processor_id();
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);

if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();

console_verbose();
Expand Down
2 changes: 1 addition & 1 deletion kernel/watchdog.c
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
trigger_allbutself_cpu_backtrace();

if (hardlockup_panic)
panic("Hard LOCKUP");
nmi_panic("Hard LOCKUP");

__this_cpu_write(hard_watchdog_warn, true);
return;
Expand Down

0 comments on commit 1717f20

Please sign in to comment.