
Commit

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching, along with documentation of how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans, and extra debugfs files to analyze both the kernel and
     the user space visible page tables.

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"
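For reference, once the boot code has made that decision, PTI is exposed as an ordinary CPU feature bit, so other code can gate on it cheaply. A minimal sketch of such a check (the message text is illustrative; X86_FEATURE_PTI and boot_cpu_has() are the identifiers this series uses):

    /* Sketch: report whether PTI ended up enabled on this boot. */
    #include <asm/cpufeature.h>
    #include <linux/printk.h>

    static void __init report_pti(void)
    {
            if (boot_cpu_has(X86_FEATURE_PTI))
                    pr_info("x86/pti: page table isolation enabled\n");
            else
                    pr_info("x86/pti: page table isolation disabled\n");
    }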

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
torvalds committed Dec 30, 2017
2 parents 6123358 + 9f5cb6b commit 5aa90a8
Showing 45 changed files with 1,636 additions and 202 deletions.
8 changes: 8 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
@@ -2708,6 +2708,8 @@
steal time is computed, but won't influence scheduler
behaviour

nopti [X86-64] Disable kernel page table isolation

nolapic [X86-32,APIC] Do not enable or use the local APIC.

nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
pt. [PARIDE]
See Documentation/blockdev/paride.txt.

pti= [X86_64]
Control user/kernel address space isolation:
on - enable
off - disable
auto - default setting

pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
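Both pti= and nopti are consumed very early, before the normal __setup() machinery runs, via the low-level command line helpers. A simplified sketch of that parsing, modeled on pti_check_boottime_disable() from this series (arch/x86/mm/pti.c); the exact ordering and special cases are abbreviated:

    #include <asm/cmdline.h>
    #include <asm/cpufeature.h>
    #include <linux/init.h>
    #include <linux/string.h>

    void __init pti_check_boottime_disable(void)
    {
            char arg[5];
            int ret;

            ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
            if (ret == 3 && !strncmp(arg, "off", 3))
                    return;                         /* pti=off */
            if (cmdline_find_option_bool(boot_command_line, "nopti"))
                    return;                         /* nopti */

            /* pti=on, pti=auto, or no option at all: enable the feature. */
            setup_force_cpu_cap(X86_FEATURE_PTI);
    }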
5 changes: 3 additions & 2 deletions Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
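Each of the new 4-level regions is sized to exactly one PGD entry (2^39 bytes = 512 GB), which is what allows PTI to expose or hide such a region by toggling a single PGD slot. A standalone check of that arithmetic for the LDT remap range above (plain user-space C, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t start = 0xfffffe0000000000ULL; /* LDT remap for PTI */
            uint64_t end   = 0xfffffe7fffffffffULL;
            uint64_t span  = end - start + 1;

            /* Expect 2^39: one 4-level PGD entry covers 512 GB. */
            printf("span = 2^%d bytes\n", 63 - __builtin_clzll(span));
            return 0;
    }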
3 changes: 3 additions & 0 deletions arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
*/
#undef CONFIG_AMD_MEM_ENCRYPT

/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION

#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
145 changes: 145 additions & 0 deletions arch/x86/entry/calling.h
@@ -1,6 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>

/*
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm

#ifdef CONFIG_PAGE_TABLE_ISOLATION

/*
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))

.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm

.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_SWITCH_MASK), \reg
.endm

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
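
In C terms, the kernel-direction switch above is plain bit arithmetic on the CR3 value. A user-space model (a sketch using the constants of this series: bit 12 selects the PGD half, bit 11 is X86_CR3_PTI_SWITCH_BIT, bit 63 is X86_CR3_PCID_NOFLUSH_BIT; it assumes a CPU with PCID, where the real macro patches the NOFLUSH step in or out via ALTERNATIVE):

    #include <stdint.h>

    #define PTI_PGTABLE_BIT (1ULL << 12)    /* low vs. high half of 8k PGD */
    #define PTI_ASID_BIT    (1ULL << 11)    /* kernel vs. user PCID */
    #define PTI_SWITCH_MASK (PTI_PGTABLE_BIT | PTI_ASID_BIT)
    #define CR3_NOFLUSH     (1ULL << 63)    /* "keep TLB entries" hint */

    /* Model of ADJUST_KERNEL_CR3: kernel PGD half, kernel ASID, no flush. */
    static uint64_t to_kernel_cr3(uint64_t cr3)
    {
            return (cr3 | CR3_NOFLUSH) & ~PTI_SWITCH_MASK;
    }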

#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg

ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

/*
* Test if the ASID needs a flush.
*/
movq \scratch_reg, \scratch_reg2
andq $(0x7FF), \scratch_reg /* mask ASID */
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@

/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@

.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg

.Lwrcr3_\@:
/* Flip the PGD and ASID to the user version */
orq $(PTI_SWITCH_MASK), \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
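
The flush-mask test above implements lazy TLB invalidation for user ASIDs: kernel-side invalidations merely set a bit per user PCID in cpu_tlbstate.user_pcid_flush_mask, and the deferred flush happens here, by writing CR3 without the NOFLUSH hint. A C model of that decision, reusing the constants from the previous sketch (a plain variable stands in for the per-CPU mask):

    static uint16_t user_pcid_flush_mask;   /* per-CPU in the real kernel */

    static uint64_t to_user_cr3(uint64_t kernel_cr3)
    {
            unsigned int asid = kernel_cr3 & 0x7ff; /* in practice < 12 */

            if (user_pcid_flush_mask & (1u << asid)) {
                    /* Pending flush: clear it, do a flushing CR3 write. */
                    user_pcid_flush_mask &= ~(1u << asid);
                    return kernel_cr3 | PTI_SWITCH_MASK;
            }
            /* Nothing pending: keep this PCID's TLB entries alive. */
            return kernel_cr3 | PTI_SWITCH_MASK | CR3_NOFLUSH;
    }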

.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
pushq %rax
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
.endm

.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
/*
* Is the "switch mask" all zero? That means that both of
* these are zero:
*
* 1. The user/kernel PCID bit, and
* 2. The user/kernel "bit" that points CR3 to the
* bottom half of the 8k PGD
*
* That indicates a kernel CR3 value, not a user CR3.
*/
testq $(PTI_SWITCH_MASK), \scratch_reg
jz .Ldone_\@

ADJUST_KERNEL_CR3 \scratch_reg
movq \scratch_reg, %cr3

.Ldone_\@:
.endm
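
SAVE_AND_SWITCH_TO_KERNEL_CR3 and its counterpart RESTORE_CR3 (just below) serve the paranoid entry paths (NMI, machine check), which can interrupt the kernel while either CR3 is live; the saved value tells the exit path exactly what to put back. Continuing the model above (the saved value corresponds to %r14 in the entry code; RESTORE_CR3's flush handling is omitted):

    /* Entry: remember the interrupted CR3, force kernel CR3 if needed. */
    static uint64_t paranoid_enter(uint64_t *cr3)
    {
            uint64_t saved = *cr3;

            if (saved & PTI_SWITCH_MASK)    /* non-zero: user CR3 was live */
                    *cr3 = to_kernel_cr3(saved);
            return saved;
    }

    /* Exit: put back whatever was interrupted, user or kernel. */
    static void paranoid_exit_cr3(uint64_t *cr3, uint64_t saved)
    {
            *cr3 = saved;
    }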

.macro RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
*/
bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
jnc .Lnoflush_\@

/*
* Check if there's a pending flush for the user ASID we're
* about to set.
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@

btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@

.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg

.Lwrcr3_\@:
/*
* The CR3 write could be avoided when not changing its value,
* but would require a CR3 read *and* a scratch register.
*/
movq \save_reg, %cr3
.Lend_\@:
.endm

#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm

#endif

#endif /* CONFIG_X86_64 */

/*
48 changes: 41 additions & 7 deletions arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH

/* Note: using %rsp as a scratch reg. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp

/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp

@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
*/

swapgs
/*
* This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

@@ -403,6 +411,7 @@ syscall_return_via_sysret:
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

popq %rdi
popq %rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
* We can do future final exit work right here.
*/

SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

/* Restore RDI. */
popq %rdi
SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
*/

pushq %rdi /* Stash user RDI */
SWAPGS
SWAPGS /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */

movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
/* Now RAX == RSP. */

andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
popq %rdi /* Restore user RDI */

/*
* espfix_stack[31:16] == 0. The page tables are set up such that
@@ -849,7 +861,11 @@
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS

SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
SWAPGS /* to user GS */
popq %rdi /* Restore user RDI */

movq %rax, %rsp
UNWIND_HINT_IRET_REGS offset=8

@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC

pushq %rdi
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1: ret

1:
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

ret
END(paranoid_entry)

/*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done

.Lbstep_iret:
@@ -1354,10 +1380,11 @@

.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
* We came from an IRET to user mode, so we have user
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)

swapgs
cld
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi

RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
[Diffs for the remaining 40 changed files are not shown here.]
