Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "Bugfixes for ARM, x86 and tools"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  tools/kvm_stat: Exempt time-based counters
  KVM: mmu: Fix SPTE encoding of MMIO generation upper half
  kvm: x86/mmu: Use cpuid to determine max gfn
  kvm: svm: de-allocate svm_cpu_data for all cpus in svm_cpu_uninit()
  selftests: kvm/set_memory_region_test: Fix race in move region test
  KVM: arm64: Add usage of stage 2 fault lookup level in user_mem_abort()
  KVM: arm64: Fix handling of merging tables into a block entry
  KVM: arm64: Fix memory leak on stage2 update of a valid PTE
torvalds committed Dec 12, 2020
2 parents b53966f + 111d0bd commit 7b1b868
Showing 11 changed files with 74 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Documentation/virt/kvm/mmu.rst
@@ -455,7 +455,7 @@ If the generation number of the spte does not equal the global generation
 number, it will ignore the cached MMIO information and handle the page
 fault through the slow path.
 
-Since only 19 bits are used to store generation-number on mmio spte, all
+Since only 18 bits are used to store generation-number on mmio spte, all
 pages are zapped when there is an overflow.
 
 Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
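To make the check above concrete: on an MMIO cache lookup, KVM compares the 18 generation bits packed into the spte against the current memslots generation and falls back to the slow path on any mismatch. Below is a minimal sketch, loosely modeled on KVM's check_mmio_spte(); the helper name mmio_spte_gen_is_current() is invented for this example, while MMIO_SPTE_GEN_MASK, get_mmio_spte_generation() and the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag are the symbols this commit touches or documents.

static bool mmio_spte_gen_is_current(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation;

	/* An in-progress memslot update always forces the slow path. */
	if (gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)
		return false;

	/* Compare only the 18 bits that fit in the MMIO spte. */
	return (gen & MMIO_SPTE_GEN_MASK) == get_mmio_spte_generation(spte);
}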
1 change: 1 addition & 0 deletions arch/arm64/include/asm/esr.h
@@ -104,6 +104,7 @@
 /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */
 #define ESR_ELx_FSC		(0x3F)
 #define ESR_ELx_FSC_TYPE	(0x3C)
+#define ESR_ELx_FSC_LEVEL	(0x03)
 #define ESR_ELx_FSC_EXTABT	(0x10)
 #define ESR_ELx_FSC_SERROR	(0x11)
 #define ESR_ELx_FSC_ACCESS	(0x08)
5 changes: 5 additions & 0 deletions arch/arm64/include/asm/kvm_emulate.h
@@ -350,6 +350,11 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vc
 	return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE;
 }
 
+static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu)
+{
+	return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL;
+}
+
 static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
 {
 	switch (kvm_vcpu_trap_get_fault(vcpu)) {
17 changes: 16 additions & 1 deletion arch/arm64/kvm/hyp/pgtable.c
@@ -470,6 +470,15 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 	if (!kvm_block_mapping_supported(addr, end, phys, level))
 		return false;
 
+	/*
+	 * If the PTE was already valid, drop the refcount on the table
+	 * early, as it will be bumped-up again in stage2_map_walk_leaf().
+	 * This ensures that the refcount stays constant across a valid to
+	 * valid PTE update.
+	 */
+	if (kvm_pte_valid(*ptep))
+		put_page(virt_to_page(ptep));
+
 	if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
 		goto out;
 
@@ -493,7 +502,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 		return 0;
 
 	kvm_set_invalid_pte(ptep);
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+
+	/*
+	 * Invalidate the whole stage-2, as we may have numerous leaf
+	 * entries below us which would otherwise need invalidating
+	 * individually.
+	 */
+	kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
 	data->anchor = ptep;
 	return 0;
 }
11 changes: 9 additions & 2 deletions arch/arm64/kvm/mmu.c
@@ -754,10 +754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	gfn_t gfn;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
-	unsigned long vma_pagesize;
+	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
+	unsigned long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 
+	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
 	write_fault = kvm_is_write_fault(vcpu);
 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
 	VM_BUG_ON(write_fault && exec_fault);
@@ -896,7 +898,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
 		prot |= KVM_PGTABLE_PROT_X;
 
-	if (fault_status == FSC_PERM && !(logging_active && writable)) {
+	/*
+	 * Under the premise of getting a FSC_PERM fault, we just need to relax
+	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
+	 * kvm_pgtable_stage2_map() should be called to change block size.
+	 */
+	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
 	} else {
 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
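For reference on the new fault_granule computation: assuming 4KiB pages (PAGE_SHIFT == 12) and the usual arm64 definition of ARM64_HW_PGTABLE_LEVEL_SHIFT(), a stage 2 fault reported at lookup level 3 corresponds to a 4KiB granule, level 2 to 2MiB and level 1 to 1GiB. Permissions are only relaxed in place when the existing mapping (vma_pagesize) is exactly that size; otherwise kvm_pgtable_stage2_map() rebuilds the mapping. A small standalone illustration follows; the macro body is copied here purely for the example and the 4KiB-page configuration is an assumption.

#include <stdio.h>

#define PAGE_SHIFT			12	/* 4KiB pages assumed */
#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n)	((PAGE_SHIFT - 3) * (4 - (n)) + 3)

int main(void)
{
	for (int level = 1; level <= 3; level++)
		printf("fault at level %d -> fault_granule = %lu KiB\n",
		       level, (1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) >> 10);
	/* Prints 1048576 KiB (1GiB), 2048 KiB (2MiB) and 4 KiB respectively. */
	return 0;
}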
4 changes: 2 additions & 2 deletions arch/x86/kvm/mmu/spte.c
@@ -40,8 +40,8 @@ static u64 generation_mmio_spte_mask(u64 gen)
 	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 	BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
-	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
-	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
+	mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+	mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
 	return mask;
 }

25 changes: 18 additions & 7 deletions arch/x86/kvm/mmu/spte.h
@@ -56,11 +56,11 @@
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
 /*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
  * the memslots generation and is derived as follows:
  *
  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+ * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
  *
  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
  * the MMIO generation number, as doing so would require stealing a bit from
@@ -69,18 +69,29 @@
  * requires a full MMU zap). The flag is instead explicitly queried when
  * checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(17, 0)
-
 #define MMIO_SPTE_GEN_LOW_START		3
 #define MMIO_SPTE_GEN_LOW_END		11
-#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
-						    MMIO_SPTE_GEN_LOW_START)
 
 #define MMIO_SPTE_GEN_HIGH_START	PT64_SECOND_AVAIL_BITS_SHIFT
 #define MMIO_SPTE_GEN_HIGH_END		62
+
+#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+						    MMIO_SPTE_GEN_LOW_START)
 #define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
 						    MMIO_SPTE_GEN_HIGH_START)
+
+#define MMIO_SPTE_GEN_LOW_BITS		(MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS		(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
 
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
@@ -228,8 +239,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
 {
 	u64 gen;
 
-	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
-	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
 	return gen;
 }

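As a sanity check on the layout above: the 18-bit generation is split into a low half in spte bits 3-11 and a high half in bits 54-62, and with the new *_SHIFT macros it round-trips through encode/decode (with the old *_START-based shifts the upper half was lost, which is what this commit fixes). Below is a standalone sketch mirroring the masks and shifts; GENMASK_ULL() is re-implemented for userspace and PT64_SECOND_AVAIL_BITS_SHIFT is taken to be 54 per the comment in the hunk, both assumptions made only for this example.

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_HIGH_START	54	/* PT64_SECOND_AVAIL_BITS_SHIFT */
#define MMIO_SPTE_GEN_HIGH_END		62

#define MMIO_SPTE_GEN_LOW_MASK	GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK	GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, MMIO_SPTE_GEN_HIGH_START)
#define MMIO_SPTE_GEN_LOW_BITS	(MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_LOW_SHIFT	(MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)

static uint64_t encode_gen(uint64_t gen)
{
	uint64_t spte = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;

	spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
	return spte;
}

static uint64_t decode_gen(uint64_t spte)
{
	uint64_t gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;

	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
	return gen;
}

int main(void)
{
	uint64_t gen = 0x3ffff;	/* largest 18-bit generation */

	printf("gen %#llx -> spte bits %#llx -> gen %#llx\n",
	       (unsigned long long)gen,
	       (unsigned long long)encode_gen(gen),
	       (unsigned long long)decode_gen(encode_gen(gen)));
	return 0;
}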
4 changes: 2 additions & 2 deletions arch/x86/kvm/mmu/tdp_mmu.c
@@ -66,7 +66,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
-	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
 	lockdep_assert_held(&kvm->mmu_lock);
 
@@ -456,7 +456,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
 
 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 {
-	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 	bool flush;
 
 	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
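The only functional change in the two hunks above is which physical-address width feeds the computation: shadow_phys_bits, derived from CPUID, instead of boot_cpu_data.x86_phys_bits. The arithmetic itself is unchanged; a quick illustration for a few hypothetical address widths, assuming 4KiB pages:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12	/* 4KiB pages assumed */

int main(void)
{
	unsigned int phys_bits[] = { 39, 46, 52 };	/* hypothetical widths */

	for (int i = 0; i < 3; i++) {
		uint64_t max_gfn = 1ULL << (phys_bits[i] - PAGE_SHIFT);

		/* max_gfn is a frame count; >> 18 converts 4KiB frames to GiB. */
		printf("%u phys bits -> max_gfn %#llx (%llu GiB of guest PA)\n",
		       phys_bits[i], (unsigned long long)max_gfn,
		       (unsigned long long)(max_gfn >> 18));
	}
	return 0;
}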
4 changes: 2 additions & 2 deletions arch/x86/kvm/svm/svm.c
@@ -530,12 +530,12 @@ static int svm_hardware_enable(void)
 
 static void svm_cpu_uninit(int cpu)
 {
-	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
 	if (!sd)
 		return;
 
-	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+	per_cpu(svm_data, cpu) = NULL;
 	kfree(sd->sev_vmcbs);
 	__free_page(sd->save_area);
 	kfree(sd);
6 changes: 5 additions & 1 deletion tools/kvm/kvm_stat/kvm_stat
@@ -742,7 +742,11 @@ class DebugfsProvider(Provider):
         The fields are all available KVM debugfs files
         """
-        return self.walkdir(PATH_DEBUGFS_KVM)[2]
+        exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns']
+        fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2]
+                  if field not in exempt_list]
+
+        return fields
 
     def update_fields(self, fields_filter):
         """Refresh fields, applying fields_filter"""
17 changes: 13 additions & 4 deletions tools/testing/selftests/kvm/set_memory_region_test.c
@@ -156,14 +156,23 @@ static void guest_code_move_memory_region(void)
 	GUEST_SYNC(0);
 
 	/*
-	 * Spin until the memory region is moved to a misaligned address. This
-	 * may or may not trigger MMIO, as the window where the memslot is
-	 * invalid is quite small.
+	 * Spin until the memory region starts getting moved to a
+	 * misaligned address.
+	 * Every region move may or may not trigger MMIO, as the
+	 * window where the memslot is invalid is usually quite small.
 	 */
 	val = guest_spin_on_val(0);
 	GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
 
-	/* Spin until the memory region is realigned. */
+	/* Spin until the misaligning memory region move completes. */
 	val = guest_spin_on_val(MMIO_VAL);
+	GUEST_ASSERT_1(val == 1 || val == 0, val);
+
+	/* Spin until the memory region starts to get re-aligned. */
+	val = guest_spin_on_val(0);
+	GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+	/* Spin until the re-aligning memory region move completes. */
+	val = guest_spin_on_val(MMIO_VAL);
 	GUEST_ASSERT_1(val == 1, val);

