Skip to content

Commit

Permalink
RAS: Add a Corrected Errors Collector
Browse files Browse the repository at this point in the history
Introduce a simple data structure for collecting correctable errors
along with accessors. More detailed description in the code itself.

Error decoding now goes through the decoding chain: mce_first_notifier()
gets to see the error first, and the CEC decides whether to log it
itself - in which case the rest of the chain doesn't hear about it,
which is basically the main reason for the CE collector - or to let the
remaining notifiers run.

When the CEC hits the action threshold, it will try to soft-offline the
page containing the ECC and then the whole decoding chain gets to see
the error.

Signed-off-by: Borislav Petkov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: linux-edac <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
  • Loading branch information
suryasaimadhu authored and Ingo Molnar committed Mar 28, 2017
1 parent e64edfc commit 011d826
Show file tree
Hide file tree
Showing 10 changed files with 706 additions and 83 deletions.
6 changes: 6 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3172,6 +3172,12 @@
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/blockdev/ramdisk.txt.

ras=option[,option,...] [KNL] RAS-specific options

cec_disable [X86]
Disable the Correctable Errors Collector,
see CONFIG_RAS_CEC help text.

rcu_nocbs= [KNL]
The argument is a cpu list, as described above.

Expand Down
9 changes: 5 additions & 4 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;

/*
 * Priorities used by the notifiers registered on the MCE decode chain.
 *
 * MCE_PRIO_FIRST carries the largest value (INT_MAX) and MCE_PRIO_LOWEST the
 * smallest (0); the remaining entries are spaced one apart below the top.
 * NOTE(review): presumably a larger value means the notifier is called
 * earlier - confirm against the notifier chain API.
 */
enum mce_notifier_prios {
MCE_PRIO_SRAO = INT_MAX,
MCE_PRIO_EXTLOG = INT_MAX - 1,
MCE_PRIO_NFIT = INT_MAX - 2,
MCE_PRIO_EDAC = INT_MAX - 3,
MCE_PRIO_FIRST = INT_MAX,
MCE_PRIO_SRAO = INT_MAX - 1,
MCE_PRIO_EXTLOG = INT_MAX - 2,
MCE_PRIO_NFIT = INT_MAX - 3,
MCE_PRIO_EDAC = INT_MAX - 4,
MCE_PRIO_LOWEST = 0,
};

Expand Down
191 changes: 115 additions & 76 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
Expand Down Expand Up @@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {

/*
 * Log the MCE record @m.
 *
 * Emits the trace record, hands @m to mce_gen_pool_add() (queueing
 * mce_irq_work when that call returns 0) and then copies @m into a free
 * slot of mcelog_buf. If no free slot is left, MCE_OVERFLOW is set in
 * mcelog_buf.flags and the function returns without storing the record
 * and without setting mce_need_notify.
 */
void mce_log(struct mce *m)
{
unsigned next, entry;

/* Emit the trace record: */
trace_mce_record(m);

/* Kick the irq_work only when mce_gen_pool_add() returned 0. */
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);

wmb();
/* Outer loop: retry until cmpxchg() below reserves a slot index for us. */
for (;;) {
entry = mce_log_get_idx_check(mcelog_buf.next);
/* Inner loop: walk forward past slots already marked finished. */
for (;;) {

/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog_buf.flags);
return;
}
/* Old left over entry. Skip: */
if (mcelog_buf.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
/*
 * Advance mcelog_buf.next from entry to entry + 1; if it no longer
 * holds entry, the exchange fails and the search starts over.
 */
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
break;
}
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
/*
 * The record is copied before the slot is flagged finished, with a
 * write barrier in between. NOTE(review): presumably this ordering is
 * for lockless readers of mcelog_buf - confirm.
 */
wmb();
mcelog_buf.entry[entry].finished = 1;
wmb();

/* Flag that a new record is pending notification. */
set_bit(0, &mce_need_notify);
}

void mce_inject_log(struct mce *m)
Expand All @@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

/*
* We run the default notifier if we have only the SRAO, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers;

void mce_register_decode_chain(struct notifier_block *nb)
Expand Down Expand Up @@ -522,7 +490,6 @@ static void mce_schedule_work(void)

/*
 * Callback for the MCE irq_work item: runs mce_notify_irq() and then
 * mce_schedule_work(). The @entry argument is not used.
 */
static void mce_irq_work_cb(struct irq_work *entry)
{
mce_notify_irq();
mce_schedule_work();
}

Expand Down Expand Up @@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
return 1;
}

/*
 * Decide from the status value in @m whether the logged error is a memory
 * error. The decoding depends on the boot CPU's vendor; for vendors other
 * than AMD and Intel the answer is always false.
 */
static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *cpu = &boot_cpu_data;
	u8 ext_code;

	switch (cpu->x86_vendor) {
	case X86_VENDOR_AMD:
		/* ErrCodeExt[20:16] */
		ext_code = (m->status >> 16) & 0x1f;

		return ext_code == 0x0 || ext_code == 0x8;

	case X86_VENDOR_INTEL:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS flags a memory
		 * error and bit 8 a cache hierarchy error. Bits 2 and 3 set
		 * together flag a `generic' cache hierarchy error.
		 *
		 * Those bits cannot be tested in isolation: with bit 11 set
		 * the error is a bus/interconnect error and the bits above
		 * merely give more detail about it. Bit 12 is the "filter"
		 * bit and is masked out of every comparison.
		 */
		if ((m->status & 0xef80) == BIT(7))
			return true;

		if ((m->status & 0xef00) == BIT(8))
			return true;

		return (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}

/*
 * Offer the MCE record @m to the Correctable Errors Collector.
 *
 * Only memory errors that are not flagged uncorrected (MCI_STATUS_UC clear)
 * and that carry a usable address qualify; for those, the page frame number
 * of the address is passed to cec_add_elem().
 *
 * Returns true only when the record qualified and cec_add_elem() returned 0.
 * Returns false for a NULL @m, a non-qualifying record, or a non-zero result
 * from cec_add_elem().
 */
static bool cec_add_mce(struct mce *m)
{
	if (!m)
		return false;

	/* We eat only correctable DRAM errors with usable addresses. */
	if (!memory_error(m))
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	if (!mce_usable_address(m))
		return false;

	return !cec_add_elem(m->addr >> PAGE_SHIFT);
}

/*
 * Notifier callback registered with MCE_PRIO_FIRST.
 *
 * @nb and @val are unused; @data is the struct mce being reported.
 *
 * A record accepted by cec_add_mce() ends processing here with NOTIFY_STOP.
 * Any other record is traced and copied into a free slot of mcelog_buf,
 * after which mce_need_notify is set and mce_notify_irq() is called.
 *
 * Returns NOTIFY_DONE for a NULL record, when mcelog_buf has no free slot
 * (MCE_OVERFLOW is set and the notification step is skipped) and after a
 * record has been stored.
 */
static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
unsigned int next, entry;

if (!m)
return NOTIFY_DONE;

/* The CEC took the error: keep the remaining notifiers from seeing it. */
if (cec_add_mce(m))
return NOTIFY_STOP;

/* Emit the trace record: */
trace_mce_record(m);

wmb();
/* Outer loop: retry until cmpxchg() below reserves a slot index for us. */
for (;;) {
entry = mce_log_get_idx_check(mcelog_buf.next);
/* Inner loop: walk forward past slots already marked finished. */
for (;;) {

/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog_buf.flags);
return NOTIFY_DONE;
}
/* Old left over entry. Skip: */
if (mcelog_buf.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
/*
 * Advance mcelog_buf.next from entry to entry + 1; if it no longer
 * holds entry, the exchange fails and the search starts over.
 */
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
break;
}
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
/*
 * The record is copied before the slot is flagged finished, with a
 * write barrier in between. NOTE(review): presumably this ordering is
 * for lockless readers of mcelog_buf - confirm.
 */
wmb();
mcelog_buf.entry[entry].finished = 1;
wmb();

/* Flag that a new record is pending, then run the notification. */
set_bit(0, &mce_need_notify);

mce_notify_irq();

return NOTIFY_DONE;
}

/*
 * Notifier block for mce_first_notifier(), registered with the top
 * priority of enum mce_notifier_prios.
 */
static struct notifier_block first_nb = {
	.priority	= MCE_PRIO_FIRST,
	.notifier_call	= mce_first_notifier,
};

static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
Expand Down Expand Up @@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;

/*
* Run the default notifier if we have only the SRAO
* notifier and us registered.
*/
if (atomic_read(&num_notifiers) > 2)
if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
return NOTIFY_DONE;

/* Don't print when mcelog is running */
Expand Down Expand Up @@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
}
}

/*
 * Decide from the status value in @m whether the logged error is a memory
 * error. The decoding depends on the boot CPU's vendor; for vendors other
 * than AMD and Intel the answer is always false.
 */
static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *cpu = &boot_cpu_data;
	u8 ext_code;

	switch (cpu->x86_vendor) {
	case X86_VENDOR_AMD:
		/* ErrCodeExt[20:16] */
		ext_code = (m->status >> 16) & 0x1f;

		return ext_code == 0x0 || ext_code == 0x8;

	case X86_VENDOR_INTEL:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS flags a memory
		 * error and bit 8 a cache hierarchy error. Bits 2 and 3 set
		 * together flag a `generic' cache hierarchy error.
		 *
		 * Those bits cannot be tested in isolation: with bit 11 set
		 * the error is a bus/interconnect error and the bits above
		 * merely give more detail about it. Bit 12 is the "filter"
		 * bit and is masked out of every comparison.
		 */
		if ((m->status & 0xef80) == BIT(7))
			return true;

		if ((m->status & 0xef00) == BIT(8))
			return true;

		return (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
Expand Down Expand Up @@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
Expand Down Expand Up @@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);

mcheck_debugfs_init();
cec_init();

/*
* Flush out everything that has been logged during early boot, now that
Expand Down
14 changes: 14 additions & 0 deletions arch/x86/ras/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,17 @@ config MCE_AMD_INJ
aspects of the MCE handling code.

WARNING: Do not even assume this interface is staying stable!

config RAS_CEC
bool "Correctable Errors Collector"
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
---help---
This is a small cache which collects correctable memory errors per 4K
page PFN and counts their repeated occurrence. Once the counter for a
PFN overflows, we try to soft-offline that page as we take it to mean
that it has reached a relatively high error count and would probably
be best if we don't use it anymore.

Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.

3 changes: 2 additions & 1 deletion drivers/ras/Makefile
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
obj-$(CONFIG_RAS) += ras.o debugfs.o
obj-$(CONFIG_RAS) += ras.o debugfs.o
obj-$(CONFIG_RAS_CEC) += cec.o
Loading

0 comments on commit 011d826

Please sign in to comment.