Merge tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and DAX updates from Dan Williams:
 "New support for clearing memory errors when a file is in DAX mode,
  alongside with some other fixes and cleanups.

  Previously it was only possible to clear these errors by using a
  truncate or hole-punch operation to trigger the filesystem to
  reallocate the block; now any page-aligned write can opportunistically
  clear errors as well (a userspace sketch follows the summary).

  This change spans x86/mm, nvdimm, and fs/dax, and has received the
  appropriate sign-offs. Thanks to Jane for her work on this.

  Summary:

   - Add support for clearing memory errors via pwrite(2) on DAX

   - Fix 'security overwrite' support in the presence of media errors

   - Miscellaneous cleanups and fixes for nfit_test (nvdimm unit tests)"
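As a concrete illustration of the page-aligned pwrite(2) behavior described above, here is a minimal userspace sketch. It is not part of the series; the mount point, file name, and 4K page size are assumptions. The only real requirement from the text above is that the offset and length of the write be page-aligned so the kernel can take the poison-clearing path instead of returning EIO:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SZ 4096UL	/* assumed page size */

int main(void)
{
	void *buf;
	/* Hypothetical file on a filesystem mounted with -o dax */
	int fd = open("/mnt/pmem/data", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Page-aligned buffer, length, and offset: eligible for the
	 * opportunistic poison clear described above. */
	if (posix_memalign(&buf, PAGE_SZ, PAGE_SZ))
		return 1;
	memset(buf, 0, PAGE_SZ);

	if (pwrite(fd, buf, PAGE_SZ, 0) < 0)
		perror("pwrite");	/* pre-5.19 this would fail on poison */

	free(buf);
	close(fd);
	return 0;
}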

* tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  pmem: implement pmem_recovery_write()
  pmem: refactor pmem_clear_poison()
  dax: add .recovery_write dax_operation
  dax: introduce DAX_RECOVERY_WRITE dax access mode
  mce: fix set_mce_nospec to always unmap the whole page
  x86/mce: relocate set{clear}_mce_nospec() functions
  acpi/nfit: rely on mce->misc to determine poison granularity
  testing: nvdimm: asm/mce.h is not needed in nfit.c
  testing: nvdimm: iomap: make __nfit_test_ioremap a macro
  nvdimm: Allow overwrite in the presence of disabled dimms
  tools/testing/nvdimm: remove unneeded flush_workqueue
torvalds committed May 27, 2022
2 parents ea6c3bc + f42e8e5 commit 35cdd86
Showing 24 changed files with 359 additions and 171 deletions.
52 changes: 0 additions & 52 deletions arch/x86/include/asm/set_memory.h
@@ -86,56 +86,4 @@ bool kernel_page_present(struct page *page);

extern int kernel_set_to_readonly;

-#ifdef CONFIG_X86_64
-/*
- * Prevent speculative access to the page by either unmapping
- * it (if we do not require access to any part of the page) or
- * marking it uncacheable (if we want to try to retrieve data
- * from non-poisoned lines in the page).
- */
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
-{
-	unsigned long decoy_addr;
-	int rc;
-
-	/* SGX pages are not in the 1:1 map */
-	if (arch_is_platform_page(pfn << PAGE_SHIFT))
-		return 0;
-	/*
-	 * We would like to just call:
-	 * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
-	 * but doing that would radically increase the odds of a
-	 * speculative access to the poison page because we'd have
-	 * the virtual address of the kernel 1:1 mapping sitting
-	 * around in registers.
-	 * Instead we get tricky. We create a non-canonical address
-	 * that looks just like the one we want, but has bit 63 flipped.
-	 * This relies on set_memory_XX() properly sanitizing any __pa()
-	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
-	 */
-	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-
-	if (unmap)
-		rc = set_memory_np(decoy_addr, 1);
-	else
-		rc = set_memory_uc(decoy_addr, 1);
-	if (rc)
-		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-	return rc;
-}
-#define set_mce_nospec set_mce_nospec
-
-/* Restore full speculative operation to the pfn. */
-static inline int clear_mce_nospec(unsigned long pfn)
-{
-	return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
-}
-#define clear_mce_nospec clear_mce_nospec
-#else
-/*
- * Few people would run a 32-bit kernel on a machine that supports
- * recoverable errors because they have too much memory to boot 32-bit.
- */
-#endif

#endif /* _ASM_X86_SET_MEMORY_H */
6 changes: 3 additions & 3 deletions arch/x86/kernel/cpu/mce/core.c
@@ -581,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,

pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
-		set_mce_nospec(pfn, whole_page(mce));
+		set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
}

@@ -1318,7 +1318,7 @@ static void kill_me_maybe(struct callback_head *cb)

ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
if (!ret) {
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
sync_core();
return;
}
@@ -1344,7 +1344,7 @@ static void kill_me_never(struct callback_head *cb)
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
}

static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
49 changes: 47 additions & 2 deletions arch/x86/mm/pat/set_memory.c
@@ -19,6 +19,7 @@
#include <linux/vmstat.h>
#include <linux/kernel.h>
#include <linux/cc_platform.h>
+#include <linux/set_memory.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -29,7 +30,6 @@
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
-#include <asm/set_memory.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>

@@ -1805,7 +1805,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
}

/*
- * _set_memory_prot is an internal helper for callers that have been passed
+ * __set_memory_prot is an internal helper for callers that have been passed
* a pgprot_t value from upper layers and a reservation has already been taken.
* If you want to set the pgprot to a specific page protocol, use the
* set_memory_xx() functions.
@@ -1914,6 +1914,51 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);

+/* Prevent speculative access to a page by marking it not-present */
+#ifdef CONFIG_X86_64
+int set_mce_nospec(unsigned long pfn)
+{
+	unsigned long decoy_addr;
+	int rc;
+
+	/* SGX pages are not in the 1:1 map */
+	if (arch_is_platform_page(pfn << PAGE_SHIFT))
+		return 0;
+	/*
+	 * We would like to just call:
+	 * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+	 * but doing that would radically increase the odds of a
+	 * speculative access to the poison page because we'd have
+	 * the virtual address of the kernel 1:1 mapping sitting
+	 * around in registers.
+	 * Instead we get tricky. We create a non-canonical address
+	 * that looks just like the one we want, but has bit 63 flipped.
+	 * This relies on set_memory_XX() properly sanitizing any __pa()
+	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+	 */
+	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+	rc = set_memory_np(decoy_addr, 1);
+	if (rc)
+		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+	return rc;
+}
+
+static int set_memory_present(unsigned long *addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+/* Restore full speculative operation to the pfn. */
+int clear_mce_nospec(unsigned long pfn)
+{
+	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+	return set_memory_present(&addr, 1);
+}
+EXPORT_SYMBOL_GPL(clear_mce_nospec);
+#endif /* CONFIG_X86_64 */

int set_memory_x(unsigned long addr, int numpages)
{
if (!(__supported_pte_mask & _PAGE_NX))
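To see what the decoy-address trick in set_mce_nospec() above actually computes, here is a standalone userspace sketch with the common 4-level x86-64 constants hard-coded. These values are assumptions for illustration; the real kernel derives PAGE_OFFSET at runtime:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_OFFSET	0xffff888000000000UL	/* assumed 4-level direct-map base */
#define BIT(n)		(1UL << (n))

int main(void)
{
	unsigned long pfn = 0x12345;	/* example poisoned pfn */
	unsigned long kaddr = PAGE_OFFSET + (pfn << PAGE_SHIFT);
	unsigned long decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));

	/* The decoy is kaddr with bit 63 flipped: non-canonical, so it can
	 * never be speculatively dereferenced, yet masking it with
	 * __PHYSICAL_MASK recovers the same physical address. */
	printf("kaddr %#lx\n", kaddr);		/* 0xffff888012345000 */
	printf("decoy %#lx\n", decoy);		/* 0x7fff888012345000 */
	printf("diff  %#lx\n", kaddr ^ decoy);	/* 0x8000000000000000: bit 63 */
	return 0;
}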
4 changes: 2 additions & 2 deletions drivers/acpi/nfit/mce.c
@@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
*/
mutex_lock(&acpi_desc_lock);
list_for_each_entry(acpi_desc, &acpi_descs, list) {
+		unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
struct device *dev = acpi_desc->dev;
int found_match = 0;

@@ -63,8 +64,7 @@

/* If this fails due to an -ENOMEM, there is little we can do */
nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
-				ALIGN(mce->addr, L1_CACHE_BYTES),
-				L1_CACHE_BYTES);
+				ALIGN_DOWN(mce->addr, align), align);
nvdimm_region_notify(nfit_spa->nd_region,
NVDIMM_REVALIDATE_POISON);

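The nfit change above stops hard-coding a 64-byte badrange and instead records the extent the hardware actually reported via the MCi_MISC "address LSB" field. A standalone sketch of the arithmetic, with an assumed lsb of 12 (4K granularity); MCI_MISC_ADDR_LSB() is the kernel's accessor for that register field:

#include <stdio.h>

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long addr = 0x1234567890a8UL;	/* example mce->addr */
	unsigned int lsb = 12;		/* assumed MCI_MISC_ADDR_LSB(mce->misc) */
	unsigned long align = 1UL << lsb;

	/* Old: one L1 cache line, rounded *up* past the error address. */
	printf("old badrange: start=%#lx len=64\n", ALIGN(addr, 64));
	/* New: the full hardware-reported extent, rounded down. */
	printf("new badrange: start=%#lx len=%lu\n",
	       ALIGN_DOWN(addr, align), align);
	return 0;
}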
14 changes: 12 additions & 2 deletions drivers/dax/super.c
@@ -117,14 +117,15 @@ enum dax_device_flags {
* @dax_dev: a dax_device instance representing the logical memory range
* @pgoff: offset in pages from the start of the device to translate
* @nr_pages: number of consecutive pages caller can handle relative to @pfn
+ * @mode: indicator on normal access or recovery write
* @kaddr: output parameter that returns a virtual address mapping of pfn
* @pfn: output parameter that returns an absolute pfn translation of @pgoff
*
* Return: negative errno if an error occurs, otherwise the number of
* pages accessible at the device relative @pgoff.
*/
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
-		void **kaddr, pfn_t *pfn)
+		enum dax_access_mode mode, void **kaddr, pfn_t *pfn)
{
long avail;

@@ -138,7 +139,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
return -EINVAL;

avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
-			kaddr, pfn);
+			mode, kaddr, pfn);
if (!avail)
return -ERANGE;
return min(avail, nr_pages);
@@ -194,6 +195,15 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

+size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *iter)
+{
+	if (!dax_dev->ops->recovery_write)
+		return 0;
+	return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter);
+}
+EXPORT_SYMBOL_GPL(dax_recovery_write);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
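Putting dax_direct_access() and dax_recovery_write() together, here is a hedged sketch of the caller-side flow this series enables. It is loosely modeled on the fs/dax write path (dax_iomap_iter() in the real series is the authoritative caller) and is simplified, not actual kernel code:

/* Sketch only: on a media error, retry the mapping in DAX_RECOVERY_WRITE
 * mode and funnel the data through dax_recovery_write(), which may clear
 * the poison before copying. */
#include <linux/dax.h>
#include <linux/uio.h>

static ssize_t sketch_dax_write(struct dax_device *dax_dev, pgoff_t pgoff,
		size_t bytes, struct iov_iter *iter)
{
	void *kaddr;
	long map_len;

	map_len = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS,
			&kaddr, NULL);
	if (map_len == -EIO) {
		/* Poison in range: ask for a recovery-capable mapping */
		map_len = dax_direct_access(dax_dev, pgoff, 1,
				DAX_RECOVERY_WRITE, &kaddr, NULL);
		if (map_len > 0)
			return dax_recovery_write(dax_dev, pgoff, kaddr,
						  bytes, iter);
	}
	if (map_len < 0)
		return map_len;

	return copy_from_iter(kaddr, bytes, iter);	/* normal write path */
}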
15 changes: 13 additions & 2 deletions drivers/md/dm-linear.c
@@ -165,11 +165,12 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
}

static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
+		long nr_pages, enum dax_access_mode mode, void **kaddr,
+		pfn_t *pfn)
{
struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);

-	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+	return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
}

static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
@@ -180,9 +181,18 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

+static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+
+	return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}

#else
#define linear_dax_direct_access NULL
#define linear_dax_zero_page_range NULL
+#define linear_dax_recovery_write NULL
#endif

static struct target_type linear_target = {
@@ -200,6 +210,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_zero_page_range = linear_dax_zero_page_range,
+	.dax_recovery_write = linear_dax_recovery_write,
};

int __init dm_linear_init(void)
15 changes: 13 additions & 2 deletions drivers/md/dm-log-writes.c
@@ -888,11 +888,12 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
}

static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
+		long nr_pages, enum dax_access_mode mode, void **kaddr,
+		pfn_t *pfn)
{
struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);

-	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+	return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
}

static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
@@ -903,9 +904,18 @@ static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
}

+static size_t log_writes_dax_recovery_write(struct dm_target *ti,
+		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
+
+	return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}

#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_zero_page_range NULL
+#define log_writes_dax_recovery_write NULL
#endif

static struct target_type log_writes_target = {
@@ -923,6 +933,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_zero_page_range = log_writes_dax_zero_page_range,
+	.dax_recovery_write = log_writes_dax_recovery_write,
};

static int __init dm_log_writes_init(void)
15 changes: 13 additions & 2 deletions drivers/md/dm-stripe.c
@@ -315,11 +315,12 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
}

static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
+		long nr_pages, enum dax_access_mode mode, void **kaddr,
+		pfn_t *pfn)
{
struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);

-	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+	return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
}

static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
@@ -330,9 +331,18 @@ static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

+static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
+
+	return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}

#else
#define stripe_dax_direct_access NULL
#define stripe_dax_zero_page_range NULL
+#define stripe_dax_recovery_write NULL
#endif

/*
@@ -469,6 +479,7 @@ static struct target_type stripe_target = {
.io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access,
.dax_zero_page_range = stripe_dax_zero_page_range,
+	.dax_recovery_write = stripe_dax_recovery_write,
};

int __init dm_stripe_init(void)
4 changes: 3 additions & 1 deletion drivers/md/dm-target.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/bio.h>
+#include <linux/dax.h>

#define DM_MSG_PREFIX "target"

@@ -142,7 +143,8 @@ static void io_err_release_clone_rq(struct request *clone,
}

static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
+		long nr_pages, enum dax_access_mode mode, void **kaddr,
+		pfn_t *pfn)
{
return -EIO;
}
7 changes: 4 additions & 3 deletions drivers/md/dm-writecache.c
@@ -286,7 +286,8 @@ static int persistent_memory_claim(struct dm_writecache *wc)

id = dax_read_lock();

-	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
+	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
+			       &wc->memory_map, &pfn);
if (da < 0) {
wc->memory_map = NULL;
r = da;
@@ -308,8 +309,8 @@ static int persistent_memory_claim(struct dm_writecache *wc)
i = 0;
do {
long daa;
-		daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
-				NULL, &pfn);
+		daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
+					p - i, DAX_ACCESS, NULL, &pfn);
if (daa <= 0) {
r = daa ? daa : -EINVAL;
goto err3;
