From 4d5f2c04c8a462a1c7ad3fdc138515c84fa65379 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 9 Feb 2017 17:15:41 +0100 Subject: [PATCH 001/227] KVM: s390: log runtime instrumentation enablement We handle runtime instrumentation enablement either lazy or via sync_regs on migration. Make sure to add a debug log entry for that per CPU on the first occurrence. Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 14 ++++++++------ arch/s390/kvm/priv.c | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fd6cd05bb6a7c7..5fda4686e817e4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2719,6 +2719,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct runtime_instr_cb *riccb; + + riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) @@ -2747,12 +2750,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * we should enable RI here instead of doing the lazy enablement. */ if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && - test_kvm_facility(vcpu->kvm, 64)) { - struct runtime_instr_cb *riccb = - (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; - - if (riccb->valid) - vcpu->arch.sie_block->ecb3 |= 0x01; + test_kvm_facility(vcpu->kvm, 64) && + riccb->valid && + !(vcpu->arch.sie_block->ecb3 & 0x01)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); + vcpu->arch.sie_block->ecb3 |= 0x01; } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 64b6a309f2c47c..93d6cde8c3eae0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -37,6 +37,7 @@ static int handle_ri(struct kvm_vcpu *vcpu) { if (test_kvm_facility(vcpu->kvm, 64)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)"); vcpu->arch.sie_block->ecb3 |= 0x01; kvm_s390_retry_instr(vcpu); return 0; From c0a6bfdc18b83290b65372a7687134052f382bdf Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 27 Feb 2017 21:14:47 +0100 Subject: [PATCH 002/227] KVM: s390: Handle sthyi also for instruction intercept Right now we handle the STHYI only via the operation exception intercept (illegal instruction). If hardware ever decides to provide an instruction intercept for STHYI, we should handle that as well. 
Signed-off-by: Christian Borntraeger --- arch/s390/kvm/intercept.c | 3 +-- arch/s390/kvm/priv.c | 1 + arch/s390/kvm/sthyi.c | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 59920f96ebc064..0a176465515315 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -368,8 +368,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu) trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); - if (vcpu->arch.sie_block->ipa == 0xb256 && - test_kvm_facility(vcpu->kvm, 74)) + if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 93d6cde8c3eae0..26f30bab9c2f4c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -760,6 +760,7 @@ static const intercept_handler_t b2_handlers[256] = { [0x3b] = handle_io_inst, [0x3c] = handle_io_inst, [0x50] = handle_ipte_interlock, + [0x56] = handle_sthyi, [0x5f] = handle_io_inst, [0x74] = handle_io_inst, [0x76] = handle_io_inst, diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 05c98bb853cf97..926b5244263efd 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -404,6 +404,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu) u64 code, addr, cc = 0; struct sthyi_sctns *sctns = NULL; + if (!test_kvm_facility(vcpu->kvm, 74)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + /* * STHYI requires extensive locking in the higher hypervisors * and is very computational/memory expensive. Therefore we From 0c9d86833dfdafaf580cc49735661d002ef07ee3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Mar 2017 11:48:28 +0100 Subject: [PATCH 003/227] KVM: s390: use defines for execution controls Let's replace the bitmasks by defines. Reconstructed from code, comments and commit messages. Tried to keep the defines short and map them to feature names. In case they don't completely map to features, keep them in the style of ICTL defines. This effectively drops all "U" from the existing numbers. I think this should be fine (as similarly done for e.g. ICTL defines). I am not 100% sure about the ECA_MVPGI and ECA_PROTEXCI bits as they are always used in pairs.
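As a quick cross-check of the new names (an illustrative, stand-alone sketch, not part of the patch; the values are copied from the kvm_host.h hunk below), the ECA pair mentioned above recombines into exactly the old magic number that the vcpu setup code used to write:

	/* Sketch only: verify ECA_MVPGI | ECA_PROTEXCI matches the old literal. */
	#define ECA_MVPGI	0x01000000
	#define ECA_PROTEXCI	0x00002000
	/* old code: vcpu->arch.sie_block->eca = 0x1002000U; */
	_Static_assert((ECA_MVPGI | ECA_PROTEXCI) == 0x1002000, "pair matches the old eca literal");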
Signed-off-by: David Hildenbrand Message-Id: <20170313104828.13362-1-david@redhat.com> Signed-off-by: Christian Borntraeger [some renames, add one missing place] --- arch/s390/include/asm/kvm_host.h | 18 +++++++++++++++- arch/s390/kvm/gaccess.c | 6 +++--- arch/s390/kvm/kvm-s390.c | 36 ++++++++++++++++---------------- arch/s390/kvm/kvm-s390.h | 2 +- arch/s390/kvm/priv.c | 2 +- arch/s390/kvm/vsie.c | 33 ++++++++++++++--------------- 6 files changed, 56 insertions(+), 41 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index a41faf34b034c8..18e9c9d83b23e2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -164,6 +164,13 @@ struct kvm_s390_sie_block { #define ICTL_RRBE 0x00001000 #define ICTL_TPROT 0x00000200 __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_SII 0x00000001 __u32 eca; /* 0x004c */ #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 @@ -182,10 +189,18 @@ struct kvm_s390_sie_block { __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ __u8 reserved60; /* 0x0060 */ +#define ECB_TE 0x10 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 __u8 ecb2; /* 0x0062 */ -#define ECB3_AES 0x04 #define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ @@ -224,6 +239,7 @@ struct kvm_s390_sie_block { __u8 reserved1a4[20]; /* 0x01a4 */ __u64 cbrlo; /* 0x01b8 */ __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index d55c829a5944c2..709aca9ceb0571 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -262,7 +262,7 @@ struct aste { int ipte_lock_held(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) { + if (vcpu->arch.sie_block->eca & ECA_SII) { int rc; read_lock(&vcpu->kvm->arch.sca_lock); @@ -361,7 +361,7 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) void ipte_lock(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) + if (vcpu->arch.sie_block->eca & ECA_SII) ipte_lock_siif(vcpu); else ipte_lock_simple(vcpu); @@ -369,7 +369,7 @@ void ipte_lock(struct kvm_vcpu *vcpu) void ipte_unlock(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) + if (vcpu->arch.sie_block->eca & ECA_SII) ipte_unlock_siif(vcpu); else ipte_unlock_simple(vcpu); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5fda4686e817e4..976373c0313813 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1646,7 +1646,7 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu) sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; - vcpu->arch.sie_block->ecb2 |= 0x04U; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; @@ -1700,7 +1700,7 @@ static int sca_switch_to_extended(struct kvm *kvm) kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { vcpu->arch.sie_block->scaoh = scaoh; vcpu->arch.sie_block->scaol = scaol; - 
vcpu->arch.sie_block->ecb2 |= 0x04U; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; } kvm->arch.sca = new_sca; kvm->arch.use_esca = 1; @@ -1939,8 +1939,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= 0x80; - vcpu->arch.sie_block->ecb2 &= ~0x08; + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -1970,28 +1970,28 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ if (MACHINE_HAS_ESOP) - vcpu->arch.sie_block->ecb |= 0x02; + vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) - vcpu->arch.sie_block->ecb |= 0x04; + vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) - vcpu->arch.sie_block->ecb |= 0x10; + vcpu->arch.sie_block->ecb |= ECB_TE; if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) - vcpu->arch.sie_block->ecb2 |= 0x08; + vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; if (test_kvm_facility(vcpu->kvm, 130)) - vcpu->arch.sie_block->ecb2 |= 0x20; - vcpu->arch.sie_block->eca = 0x1002000U; + vcpu->arch.sie_block->ecb2 |= ECB2_IEP; + vcpu->arch.sie_block->eca = ECA_MVPGI | ECA_PROTEXCI; if (sclp.has_cei) - vcpu->arch.sie_block->eca |= 0x80000000U; + vcpu->arch.sie_block->eca |= ECA_CEI; if (sclp.has_ib) - vcpu->arch.sie_block->eca |= 0x40000000U; + vcpu->arch.sie_block->eca |= ECA_IB; if (sclp.has_siif) - vcpu->arch.sie_block->eca |= 1; + vcpu->arch.sie_block->eca |= ECA_SII; if (sclp.has_sigpif) - vcpu->arch.sie_block->eca |= 0x10000000U; + vcpu->arch.sie_block->eca |= ECA_SIGPI; if (test_kvm_facility(vcpu->kvm, 129)) { - vcpu->arch.sie_block->eca |= 0x00020000; - vcpu->arch.sie_block->ecd |= 0x20000000; + vcpu->arch.sie_block->eca |= ECA_VX; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; } vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; @@ -2752,9 +2752,9 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && test_kvm_facility(vcpu->kvm, 64) && riccb->valid && - !(vcpu->arch.sie_block->ecb3 & 0x01)) { + !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); - vcpu->arch.sie_block->ecb3 |= 0x01; + vcpu->arch.sie_block->ecb3 |= ECB3_RI; } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index af9fa91a0c917c..dfdcde125af6f7 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -25,7 +25,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* Transactional Memory Execution related macros */ -#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10)) +#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 26f30bab9c2f4c..d42bb2d0304105 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -38,7 +38,7 @@ static int handle_ri(struct kvm_vcpu *vcpu) { if (test_kvm_facility(vcpu->kvm, 64)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)"); - vcpu->arch.sie_block->ecb3 |= 0x01; + vcpu->arch.sie_block->ecb3 |= ECB3_RI; kvm_s390_retry_instr(vcpu); return 0; } else diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 
5491be39776b66..d91f1df5a8544c 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -249,7 +249,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - bool had_tx = scb_s->ecb & 0x10U; + bool had_tx = scb_s->ecb & ECB_TE; unsigned long new_mso = 0; int rc; @@ -307,34 +307,34 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ihcpu = scb_o->ihcpu; /* MVPG and Protection Exception Interpretation are always available */ - scb_s->eca |= scb_o->eca & 0x01002000U; + scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI); /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) - scb_s->ecb |= scb_o->ecb & 0x02U; + scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73)) { /* remap the prefix is tx is toggled on */ - if ((scb_o->ecb & 0x10U) && !had_tx) + if ((scb_o->ecb & ECB_TE) && !had_tx) prefix_unmapped(vsie_page); - scb_s->ecb |= scb_o->ecb & 0x10U; + scb_s->ecb |= scb_o->ecb & ECB_TE; } /* SIMD */ if (test_kvm_facility(vcpu->kvm, 129)) { - scb_s->eca |= scb_o->eca & 0x00020000U; - scb_s->ecd |= scb_o->ecd & 0x20000000U; + scb_s->eca |= scb_o->eca & ECA_VX; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; } /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) - scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI; /* Instruction Execution Prevention */ if (test_kvm_facility(vcpu->kvm, 130)) - scb_s->ecb2 |= scb_o->ecb2 & 0x20U; + scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) - scb_s->eca |= scb_o->eca & 0x00000001U; + scb_s->eca |= scb_o->eca & ECA_SII; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) - scb_s->eca |= scb_o->eca & 0x40000000U; + scb_s->eca |= scb_o->eca & ECA_IB; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) - scb_s->eca |= scb_o->eca & 0x80000000U; + scb_s->eca |= scb_o->eca & ECA_CEI; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -406,7 +406,7 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix += scb_s->mso; rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); - if (!rc && (scb_s->ecb & 0x10U)) + if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix + PAGE_SIZE); /* @@ -543,7 +543,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->itdba & ~0xffUL; - if (gpa && (scb_s->ecb & 0x10U)) { + if (gpa && (scb_s->ecb & ECB_TE)) { if (!(gpa & ~0x1fffU)) { rc = set_validity_icpt(scb_s, 0x0080U); goto unpin; @@ -558,8 +558,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->gvrd & ~0x1ffUL; - if (gpa && (scb_s->eca & 0x00020000U) && - !(scb_s->ecd & 0x20000000U)) { + if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x1310U); goto unpin; @@ -577,7 +576,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->riccbd & ~0x3fUL; - if (gpa && (scb_s->ecb3 & 0x01U)) { + if (gpa && (scb_s->ecb3 & ECB3_RI)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x0043U); goto unpin; From 947b897204c417d22ad90e5f354858d78de15a07 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Fri, 17 Mar 2017 
09:59:32 -0400 Subject: [PATCH 004/227] KVM: s390: Use defines for intercept code Let's use #define values for better readability. Signed-off-by: Farhan Ali Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 3 +++ arch/s390/kvm/intercept.c | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 18e9c9d83b23e2..54e36e7cd201d6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -175,7 +175,10 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 #define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c #define ICPT_VALIDITY 0x20 #define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 0a176465515315..e831f4b3e1c180 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -403,26 +403,26 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; switch (vcpu->arch.sie_block->icptcode) { - case 0x10: - case 0x18: + case ICPT_EXTREQ: + case ICPT_IOREQ: return handle_noop(vcpu); - case 0x04: + case ICPT_INST: rc = handle_instruction(vcpu); break; - case 0x08: + case ICPT_PROGI: return handle_prog(vcpu); - case 0x14: + case ICPT_EXTINT: return handle_external_interrupt(vcpu); - case 0x1c: + case ICPT_WAIT: return kvm_s390_handle_wait(vcpu); - case 0x20: + case ICPT_VALIDITY: return handle_validity(vcpu); - case 0x28: + case ICPT_STOP: return handle_stop(vcpu); - case 0x2c: + case ICPT_OPEREXC: rc = handle_operexc(vcpu); break; - case 0x38: + case ICPT_PARTEXEC: rc = handle_partial_execution(vcpu); break; default: From 4e0b1ab72b8af961bcaca9ec1475279c1cd9579c Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Tue, 29 Nov 2016 07:17:55 +0100 Subject: [PATCH 005/227] KVM: s390: gs support for kvm guests This patch adds guarded storage support for KVM guest. We need to setup the necessary control blocks, the kvm_run structure for the new registers, the necessary wrappers for VSIE, as well as the machine check save areas. GS is enabled lazily and the register saving and reloading is done in KVM code. As this feature adds new content for migration, we provide a new capability for enablement (KVM_CAP_S390_GS). Signed-off-by: Fan Zhang Reviewed-by: Christian Borntraeger Reviewed-by: Janosch Frank Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 9 +++++ arch/s390/include/asm/kvm_host.h | 8 +++- arch/s390/include/uapi/asm/kvm.h | 14 ++++++- arch/s390/kvm/intercept.c | 1 + arch/s390/kvm/interrupt.c | 33 +++++++++++++++- arch/s390/kvm/kvm-s390.c | 62 +++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/priv.c | 27 ++++++++++++++ arch/s390/kvm/vsie.c | 39 +++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 10 files changed, 191 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3c248f772ae616..72525085847908 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4101,6 +4101,15 @@ to take care of that. This capability can be enabled dynamically even if VCPUs were already created and are running. +7.9 KVM_CAP_S390_GS + +Architectures: s390 +Parameters: none +Returns: 0 on success; -EINVAL if the machine does not support + guarded storage; -EBUSY if a VCPU has already been created. 
+ +Allows use of guarded storage for the KVM guest. + 8. Other capabilities. ---------------------- diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 54e36e7cd201d6..1af090d93bf581 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 @@ -192,6 +193,7 @@ struct kvm_s390_sie_block { __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ __u8 reserved60; /* 0x0060 */ +#define ECB_GS 0x40 #define ECB_TE 0x10 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 @@ -237,7 +239,9 @@ struct kvm_s390_sie_block { __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ __u64 gbea; /* 0x0180 */ - __u8 reserved188[24]; /* 0x0188 */ + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ __u32 fac; /* 0x01a0 */ __u8 reserved1a4[20]; /* 0x01a4 */ __u64 cbrlo; /* 0x01b8 */ @@ -573,6 +577,7 @@ struct kvm_vcpu_arch { /* if vsie is active, currently executed shadow sie control block */ struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; + struct gs_cb *host_gscb; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; @@ -593,6 +598,7 @@ struct kvm_vcpu_arch { */ seqcount_t cputm_seqcount; __u64 cputm_start; + bool gs_enabled; }; struct kvm_vm_stat { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index a2ffec4139ad1c..5bd23cfd9ae5d4 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -197,6 +197,10 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_VRS (1UL << 6) #define KVM_SYNC_RICCB (1UL << 7) #define KVM_SYNC_FPRS (1UL << 8) +#define KVM_SYNC_GSCB (1UL << 9) +/* length and alignment of the sdnx as a power of two */ +#define SDNXC 8 +#define SDNXL (1UL << SDNXC) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ @@ -217,8 +221,16 @@ struct kvm_sync_regs { }; __u8 reserved[512]; /* for future vector expansion */ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ - __u8 padding[52]; /* riccb needs to be 64byte aligned */ + __u8 padding1[52]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ + __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { + __u64 reserved1[2]; + __u64 gscb[4]; + }; + }; }; #define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index e831f4b3e1c180..f5378f3361273b 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -35,6 +35,7 @@ static const intercept_handler_t instruction_handlers[256] = { [0xb6] = kvm_s390_handle_stctl, [0xb7] = kvm_s390_handle_lctl, [0xb9] = kvm_s390_handle_b9, + [0xe3] = kvm_s390_handle_e3, [0xe5] = kvm_s390_handle_e5, [0xeb] = kvm_s390_handle_eb, }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 169558dc7daf0d..311eef0df85565 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -410,6 +410,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, struct kvm_s390_mchk_info *mchk) { unsigned long ext_sa_addr; + unsigned long lc; freg_t fprs[NUM_FPRS]; union mci mci; int rc; @@ -422,8 +423,28 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, /* Extended save area */ rc = 
read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr, sizeof(unsigned long)); - /* Only bits 0-53 are used for address formation */ - ext_sa_addr &= ~0x3ffUL; + /* Only bits 0 through 63-LC are used for address formation */ + lc = ext_sa_addr & MCESA_LC_MASK; + if (test_kvm_facility(vcpu->kvm, 133)) { + switch (lc) { + case 0: + case 10: + ext_sa_addr &= ~0x3ffUL; + break; + case 11: + ext_sa_addr &= ~0x7ffUL; + break; + case 12: + ext_sa_addr &= ~0xfffUL; + break; + default: + ext_sa_addr = 0; + break; + } + } else { + ext_sa_addr &= ~0x3ffUL; + } + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, 512)) @@ -431,6 +452,14 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, } else { mci.vr = 0; } + if (!rc && mci.gs && ext_sa_addr && test_kvm_facility(vcpu->kvm, 133) + && (lc == 11 || lc == 12)) { + if (write_guest_abs(vcpu, ext_sa_addr + 1024, + &vcpu->run->s.regs.gscb, 32)) + mci.gs = 0; + } else { + mci.gs = 0; + } /* General interruption information */ rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 976373c0313813..f83f18b77f3d32 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -405,6 +405,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_RI: r = test_facility(64); break; + case KVM_CAP_S390_GS: + r = test_facility(133); + break; default: r = 0; } @@ -541,6 +544,20 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_GS: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus)) { + r = -EBUSY; + } else if (test_facility(133)) { + set_kvm_facility(kvm->arch.model.fac_mask, 133); + set_kvm_facility(kvm->arch.model.fac_list, 133); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s", + r ? "(not available)" : "(success)"); + break; case KVM_CAP_S390_USER_STSI: VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); kvm->arch.user_stsi = 1; @@ -1749,6 +1766,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. 
*/ @@ -1993,6 +2012,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= ECA_VX; vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; } + vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) + | SDNXC; vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; @@ -2720,8 +2741,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct runtime_instr_cb *riccb; + struct gs_cb *gscb; riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) @@ -2756,6 +2779,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); vcpu->arch.sie_block->ecb3 |= ECB3_RI; } + /* + * If userspace sets the gscb (e.g. after migration) to non-zero, + * we should enable GS here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_GSCB) && + test_kvm_facility(vcpu->kvm, 133) && + gscb->gssm && + !vcpu->arch.gs_enabled) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (sync_regs)"); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; + } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); /* save host (userspace) fprs/vrs */ @@ -2770,6 +2806,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_fp_ctl(current->thread.fpu.fpc)) /* User space provided an invalid FPC, let's clear it */ current->thread.fpu.fpc = 0; + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (current->thread.gs_cb) { + vcpu->arch.host_gscb = current->thread.gs_cb; + save_gs_cb(vcpu->arch.host_gscb); + } + if (vcpu->arch.gs_enabled) { + current->thread.gs_cb = (struct gs_cb *) + &vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + } + preempt_enable(); + } kvm_run->kvm_dirty_regs = 0; } @@ -2796,6 +2846,18 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + if (MACHINE_HAS_GS) { + __ctl_set_bit(2, 4); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + preempt_disable(); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + preempt_enable(); + if (!vcpu->arch.host_gscb) + __ctl_clear_bit(2, 4); + vcpu->arch.host_gscb = NULL; + } } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index dfdcde125af6f7..455124fe06476c 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -246,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) int is_valid_psw(psw_t *psw); int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d42bb2d0304105..0ffe973535fab8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -53,6 +53,33 @@ int 
kvm_s390_handle_aa(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_gs(struct kvm_vcpu *vcpu) +{ + if (test_kvm_facility(vcpu->kvm, 133)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); + preempt_disable(); + __ctl_set_bit(2, 4); + current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + preempt_enable(); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu) +{ + int code = vcpu->arch.sie_block->ipb & 0xff; + + if (code == 0x49 || code == 0x4d) + return handle_gs(vcpu); + else + return -EOPNOTSUPP; +} /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d91f1df5a8544c..2fafc2be777fe4 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -329,6 +329,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Instruction Execution Prevention */ if (test_kvm_facility(vcpu->kvm, 130)) scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP; + /* Guarded Storage */ + if (test_kvm_facility(vcpu->kvm, 133)) { + scb_s->ecb |= scb_o->ecb & ECB_GS; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; + } if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) scb_s->eca |= scb_o->eca & ECA_SII; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) @@ -496,6 +501,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) unpin_guest_page(vcpu->kvm, gpa, hpa); scb_s->riccbd = 0; } + + hpa = scb_s->sdnxo; + if (hpa) { + gpa = scb_o->sdnxo; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->sdnxo = 0; + } } /* @@ -590,6 +602,33 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) goto unpin; scb_s->riccbd = hpa; } + if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { + unsigned long sdnxc; + + gpa = scb_o->sdnxo & ~0xfUL; + sdnxc = scb_o->sdnxo & 0xfUL; + if (!gpa || !(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x10b0U); + goto unpin; + } + if (sdnxc < 6 || sdnxc > 12) { + rc = set_validity_icpt(scb_s, 0x10b1U); + goto unpin; + } + if (gpa & ((1 << sdnxc) - 1)) { + rc = set_validity_icpt(scb_s, 0x10b2U); + goto unpin; + } + /* Due to alignment rules (checked above) this cannot + * cross page boundaries + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x10b0U); + if (rc) + goto unpin; + scb_s->sdnxo = hpa; + } return 0; unpin: unpin_blocks(vcpu, vsie_page); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377ec..c9d522765f8f77 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_S390_GS 137 #ifdef KVM_CAP_IRQ_ROUTING From 8db5d8f141db8c05d7e4ff44e15fedf142511fc9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Jan 2017 17:25:11 +0000 Subject: [PATCH 006/227] KVM: arm64: add SYS_DESC() This patch adds a macro enabling us to initialise sys_reg_desc structures based on common sysreg encoding definitions in . Subsequent patches will use this to simplify the KVM code. 
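To make the expansion concrete, here is a minimal sketch of how an entry written with the new macro compares with the old open-coded form, assuming for illustration (the sysreg header itself is not part of this excerpt) that SYS_MDSCR_EL1 is defined as sys_reg(2, 0, 0, 2, 2):

	/* New style, using the SYS_DESC() macro added below: */
	{ SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 },
	/* ...which expands to the same initialisers as the old open-coded entry: */
	{ Op0(2), Op1(0), CRn(0), CRm(2), Op2(2), trap_debug_regs, reset_val, MDSCR_EL1, 0 },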
Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9c6ffd0f0196ed..66859a5b85e5f2 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -147,4 +147,9 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, #define CRm(_x) .CRm = _x #define Op2(_x) .Op2 = _x +#define SYS_DESC(reg) \ + Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ From ee1b64e6ccddbd31c0549851e8a6c00eef138acd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Jan 2017 17:19:12 +0000 Subject: [PATCH 007/227] KVM: arm64: Use common debug sysreg definitions Now that we have common definitions for the debug register encodings, make the KVM code use these, simplifying the sys_reg_descs table. The table previously erroneously referred to MDCCSR_EL0 as MDCCSR_EL1. This is corrected (as is necessary in order to use the common sysreg definition). Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 73 +++++++++++---------------------------- 1 file changed, 21 insertions(+), 52 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0e26f8c2b56f82..5fa23fd500bae9 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -793,17 +793,13 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ - /* DBGBVRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100), \ + { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr }, \ - /* DBGBCRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101), \ + { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr }, \ - /* DBGWVRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110), \ + { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ trap_wvr, reset_wvr, n, 0, get_wvr, set_wvr }, \ - /* DBGWCRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111), \ + { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ trap_wcr, reset_wcr, n, 0, get_wcr, set_wcr } /* Macro to expand the PMEVCNTRn_EL0 register */ @@ -899,12 +895,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), - /* MDCCINT_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), - trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, - /* MDSCR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), - trap_debug_regs, reset_val, MDSCR_EL1, 0 }, + { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, + { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 }, DBG_BCR_BVR_WCR_WVR_EL1(2), DBG_BCR_BVR_WCR_WVR_EL1(3), DBG_BCR_BVR_WCR_WVR_EL1(4), @@ -920,44 +912,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(14), DBG_BCR_BVR_WCR_WVR_EL1(15), - /* MDRAR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - trap_raz_wi }, - /* OSLAR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b100), - trap_raz_wi }, - /* OSLSR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0001), Op2(0b100), - 
trap_oslsr_el1 }, - /* OSDLR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0011), Op2(0b100), - trap_raz_wi }, - /* DBGPRCR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0100), Op2(0b100), - trap_raz_wi }, - /* DBGCLAIMSET_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1000), Op2(0b110), - trap_raz_wi }, - /* DBGCLAIMCLR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1001), Op2(0b110), - trap_raz_wi }, - /* DBGAUTHSTATUS_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b110), - trap_dbgauthstatus_el1 }, - - /* MDCCSR_EL1 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0001), Op2(0b000), - trap_raz_wi }, - /* DBGDTR_EL0 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0100), Op2(0b000), - trap_raz_wi }, - /* DBGDTR[TR]X_EL0 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0101), Op2(0b000), - trap_raz_wi }, - - /* DBGVCR32_EL2 */ - { Op0(0b10), Op1(0b100), CRn(0b0000), CRm(0b0111), Op2(0b000), - NULL, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_OSLAR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1 }, + { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 }, + + { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi }, + { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi }, + // DBGDTR[TR]X_EL0 share the same encoding + { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, + + { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 }, /* MPIDR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101), From 174ed3e47591d58b4ad04250e11891618ea0048b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 20 Jan 2017 17:43:20 +0000 Subject: [PATCH 008/227] KVM: arm64: Use common performance monitor sysreg definitions Now that we have common definitions for the performance monitor register encodings, make the KVM code use these, simplifying the sys_reg_descs table. The comments for PMUSERENR_EL0 and PMCCFILTR_EL0 are kept, as these describe non-obvious details regarding the registers. However, a slight fixup is applied to bring these into line with the usual comment style. 
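The per-counter macros fold the counter index into the CRm/Op2 fields; as a worked example (assuming the common SYS_PMEVCNTRn_EL0(n) definitions used below produce the same encoding as the old macro), n = 13 yields:

	/* n = 13: CRm = 0b1000 | (13 >> 3) = 0b1001, Op2 = 13 & 0x7 = 0b101 */
	{ Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b1001), Op2(0b101),
	  access_pmu_evcntr, reset_unknown, (PMEVCNTR0_EL0 + 13), },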
Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 78 +++++++++++---------------------------- 1 file changed, 22 insertions(+), 56 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5fa23fd500bae9..63b0785ec32598 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -804,16 +804,12 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ - /* PMEVCNTRn_EL0 */ \ - { Op0(0b11), Op1(0b011), CRn(0b1110), \ - CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ + { SYS_DESC(SYS_PMEVCNTRn_EL0(n)), \ access_pmu_evcntr, reset_unknown, (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ - /* PMEVTYPERn_EL0 */ \ - { Op0(0b11), Op1(0b011), CRn(0b1110), \ - CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ + { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } static bool access_cntp_tval(struct kvm_vcpu *vcpu, @@ -963,12 +959,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000), NULL, reset_unknown, PAR_EL1 }, - /* PMINTENSET_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001), - access_pminten, reset_unknown, PMINTENSET_EL1 }, - /* PMINTENCLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010), - access_pminten, NULL, PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 }, /* MAIR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), @@ -1003,48 +995,23 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), NULL, reset_unknown, CSSELR_EL1 }, - /* PMCR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000), - access_pmcr, reset_pmcr, }, - /* PMCNTENSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001), - access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, - /* PMCNTENCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010), - access_pmcnten, NULL, PMCNTENSET_EL0 }, - /* PMOVSCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011), - access_pmovs, NULL, PMOVSSET_EL0 }, - /* PMSWINC_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100), - access_pmswinc, reset_unknown, PMSWINC_EL0 }, - /* PMSELR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101), - access_pmselr, reset_unknown, PMSELR_EL0 }, - /* PMCEID0_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110), - access_pmceid }, - /* PMCEID1_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111), - access_pmceid }, - /* PMCCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000), - access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 }, - /* PMXEVTYPER_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001), - access_pmu_evtyper }, - /* PMXEVCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010), - access_pmu_evcntr }, - /* PMUSERENR_EL0 - * This register resets as unknown in 64bit mode while it resets as zero + { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, + { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, 
PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 }, + { SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 }, + { SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 }, + { SYS_DESC(SYS_PMCEID0_EL0), access_pmceid }, + { SYS_DESC(SYS_PMCEID1_EL0), access_pmceid }, + { SYS_DESC(SYS_PMCCNTR_EL0), access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 }, + { SYS_DESC(SYS_PMXEVTYPER_EL0), access_pmu_evtyper }, + { SYS_DESC(SYS_PMXEVCNTR_EL0), access_pmu_evcntr }, + /* + * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000), - access_pmuserenr, reset_val, PMUSERENR_EL0, 0 }, - /* PMOVSSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011), - access_pmovs, reset_unknown, PMOVSSET_EL0 }, + { SYS_DESC(SYS_PMUSERENR_EL0), access_pmuserenr, reset_val, PMUSERENR_EL0, 0 }, + { SYS_DESC(SYS_PMOVSSET_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 }, /* TPIDR_EL0 */ { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010), @@ -1127,12 +1094,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { PMU_PMEVTYPER_EL0(28), PMU_PMEVTYPER_EL0(29), PMU_PMEVTYPER_EL0(30), - /* PMCCFILTR_EL0 - * This register resets as unknown in 64bit mode while it resets as zero + /* + * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b1111), Op2(0b111), - access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 }, + { SYS_DESC(SYS_PMCCFILTR_EL0), access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 }, /* DACR32_EL2 */ { Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b0000), Op2(0b000), From e804d208979db63be87305fac6e659162d719d97 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 20 Jan 2017 18:02:39 +0000 Subject: [PATCH 009/227] KVM: arm64: Use common GICv3 sysreg definitions Now that we have common definitions for the GICv3 register encodings, make the KVM code use these, simplifying the sys_reg_descs table. Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 63b0785ec32598..1f3062bfa0718d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -973,12 +973,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), NULL, reset_val, VBAR_EL1, 0 }, - /* ICC_SGI1R_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101), - access_gic_sgi }, - /* ICC_SRE_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101), - access_gic_sre }, + { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, /* CONTEXTIDR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), From b2d693ced214f808059d539b6cc0fe551d1bc0e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 9 Mar 2017 16:50:51 +0000 Subject: [PATCH 010/227] KVM: arm64: Use common physical timer sysreg definitions Now that we have common definitions for the physical timer control registers, make the KVM code use these, simplifying the sys_reg_descs table. 
Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1f3062bfa0718d..860707fdb912fc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1016,15 +1016,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), NULL, reset_unknown, TPIDRRO_EL0 }, - /* CNTP_TVAL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b000), - access_cntp_tval }, - /* CNTP_CTL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b001), - access_cntp_ctl }, - /* CNTP_CVAL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b010), - access_cntp_cval }, + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, + { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, + { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), From 0d449541c185f23347feed6f9ea1994fd1707a06 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Jan 2017 18:36:26 +0000 Subject: [PATCH 011/227] KVM: arm64: use common invariant sysreg definitions Now that we have common definitions for the register encodings used by KVM, make the KVM code use these for invariant sysreg definitions. This makes said definitions a reasonable amount shorter, especially as many comments are rendered redundant and can be removed. Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 57 +++++++++++++-------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 860707fdb912fc..e637e1d28ae886 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1857,44 +1857,25 @@ FUNCTION_INVARIANT(aidr_el1) /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] = { - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, get_midr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b110), - NULL, get_revidr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b000), - NULL, get_id_pfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b001), - NULL, get_id_pfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b010), - NULL, get_id_dfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b011), - NULL, get_id_afr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b100), - NULL, get_id_mmfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b101), - NULL, get_id_mmfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b110), - NULL, get_id_mmfr2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b111), - NULL, get_id_mmfr3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), - NULL, get_id_isar0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b001), - NULL, get_id_isar1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), - NULL, get_id_isar2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b011), - NULL, get_id_isar3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b100), - NULL, get_id_isar4_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010),
Op2(0b101), - NULL, get_id_isar5_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_clidr_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b111), - NULL, get_aidr_el1 }, - { Op0(0b11), Op1(0b011), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_ctr_el0 }, + { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, + { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, + { SYS_DESC(SYS_ID_PFR0_EL1), NULL, get_id_pfr0_el1 }, + { SYS_DESC(SYS_ID_PFR1_EL1), NULL, get_id_pfr1_el1 }, + { SYS_DESC(SYS_ID_DFR0_EL1), NULL, get_id_dfr0_el1 }, + { SYS_DESC(SYS_ID_AFR0_EL1), NULL, get_id_afr0_el1 }, + { SYS_DESC(SYS_ID_MMFR0_EL1), NULL, get_id_mmfr0_el1 }, + { SYS_DESC(SYS_ID_MMFR1_EL1), NULL, get_id_mmfr1_el1 }, + { SYS_DESC(SYS_ID_MMFR2_EL1), NULL, get_id_mmfr2_el1 }, + { SYS_DESC(SYS_ID_MMFR3_EL1), NULL, get_id_mmfr3_el1 }, + { SYS_DESC(SYS_ID_ISAR0_EL1), NULL, get_id_isar0_el1 }, + { SYS_DESC(SYS_ID_ISAR1_EL1), NULL, get_id_isar1_el1 }, + { SYS_DESC(SYS_ID_ISAR2_EL1), NULL, get_id_isar2_el1 }, + { SYS_DESC(SYS_ID_ISAR3_EL1), NULL, get_id_isar3_el1 }, + { SYS_DESC(SYS_ID_ISAR4_EL1), NULL, get_id_isar4_el1 }, + { SYS_DESC(SYS_ID_ISAR5_EL1), NULL, get_id_isar5_el1 }, + { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, + { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, + { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; static int reg_from_user(u64 *val, const void __user *uaddr, u64 id) From 851050a573e1d2c192e6f59921e36fcbfcbcf207 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 19 Jan 2017 18:39:39 +0000 Subject: [PATCH 012/227] KVM: arm64: Use common sysreg definitions Now that we have common definitions for the remaining register encodings required by KVM, make the KVM code use these, simplifying the sys_reg_descs table and the genericv8_sys_regs table. 
Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 94 +++++++--------------------- arch/arm64/kvm/sys_regs_generic_v8.c | 4 +- 2 files changed, 25 insertions(+), 73 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e637e1d28ae886..effa5ce02b7d8c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -924,72 +924,36 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 }, - /* MPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101), - NULL, reset_mpidr, MPIDR_EL1 }, - /* SCTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, - /* CPACR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010), - NULL, reset_val, CPACR_EL1, 0 }, - /* TTBR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_unknown, TTBR0_EL1 }, - /* TTBR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001), - access_vm_reg, reset_unknown, TTBR1_EL1 }, - /* TCR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010), - access_vm_reg, reset_val, TCR_EL1, 0 }, - - /* AFSR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000), - access_vm_reg, reset_unknown, AFSR0_EL1 }, - /* AFSR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001), - access_vm_reg, reset_unknown, AFSR1_EL1 }, - /* ESR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000), - access_vm_reg, reset_unknown, ESR_EL1 }, - /* FAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_unknown, FAR_EL1 }, - /* PAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000), - NULL, reset_unknown, PAR_EL1 }, + { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, + { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, + { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, + { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, + + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, + { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, + { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, + { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, + { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, { SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 }, - /* MAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), - access_vm_reg, reset_unknown, MAIR_EL1 }, - /* AMAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000), - access_vm_reg, reset_amair_el1, AMAIR_EL1 }, + { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, + { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, - /* VBAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), - NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, - /* CONTEXTIDR_EL1 */ 
- { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), - access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, - /* TPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100), - NULL, reset_unknown, TPIDR_EL1 }, + { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, + { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, - /* CNTKCTL_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1110), CRm(0b0001), Op2(0b000), - NULL, reset_val, CNTKCTL_EL1, 0}, + { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, - /* CSSELR_EL1 */ - { Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, @@ -1009,12 +973,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMUSERENR_EL0), access_pmuserenr, reset_val, PMUSERENR_EL0, 0 }, { SYS_DESC(SYS_PMOVSSET_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 }, - /* TPIDR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010), - NULL, reset_unknown, TPIDR_EL0 }, - /* TPIDRRO_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), - NULL, reset_unknown, TPIDRRO_EL0 }, + { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, + { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, @@ -1090,15 +1050,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { */ { SYS_DESC(SYS_PMCCFILTR_EL0), access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 }, - /* DACR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, DACR32_EL2 }, - /* IFSR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0000), Op2(0b001), - NULL, reset_unknown, IFSR32_EL2 }, - /* FPEXC32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0011), Op2(0b000), - NULL, reset_val, FPEXC32_EL2, 0x70 }, + { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, }; static bool trap_dbgidr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c index 46af7186bca617..969ade1d333dac 100644 --- a/arch/arm64/kvm/sys_regs_generic_v8.c +++ b/arch/arm64/kvm/sys_regs_generic_v8.c @@ -52,9 +52,7 @@ static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 */ static const struct sys_reg_desc genericv8_sys_regs[] = { - /* ACTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b001), - access_actlr, reset_actlr, ACTLR_EL1 }, + { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, }; static const struct sys_reg_desc genericv8_cp15_regs[] = { From 7606e07856e0148e540fb67d68418400fd6b33b4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 13 Jan 2017 17:51:27 +0000 Subject: [PATCH 013/227] KVM: arm64: Use common Set/Way sys definitions Now that we have common definitions for the encoding of Set/Way cache maintenance operations, make the KVM code use these, simplifying the sys_reg_descs table. 
Signed-off-by: Mark Rutland Acked-by: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu --- arch/arm64/kvm/sys_regs.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index effa5ce02b7d8c..0e6c477fa2a077 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -879,15 +879,9 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, * more demanding guest... */ static const struct sys_reg_desc sys_reg_descs[] = { - /* DC ISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b0110), Op2(0b010), - access_dcsw }, - /* DC CSW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1010), Op2(0b010), - access_dcsw }, - /* DC CISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b010), - access_dcsw }, + { SYS_DESC(SYS_DC_ISW), access_dcsw }, + { SYS_DESC(SYS_DC_CSW), access_dcsw }, + { SYS_DESC(SYS_DC_CISW), access_dcsw }, DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), From 4e87580e6d144e44820a0d23e756136b6218f4f0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:08 +0000 Subject: [PATCH 014/227] MIPS: Add defs & probing of UFR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add definitions and probing of the UFR bit in Config5. This bit allows user mode control of the FR bit (floating point register mode). It is present if the UFRP bit is set in the floating point implementation register. This is a capability KVM may want to expose to guest kernels, even though Linux is unlikely to ever use it due to the implications for multi-threaded programs. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paul Burton Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/cpu-features.h | 4 ++++ arch/mips/include/asm/cpu.h | 1 + arch/mips/kernel/cpu-probe.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index e961c8a7ea6626..e12d4ec6854d60 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -444,6 +444,10 @@ # define cpu_has_msa 0 #endif +#ifndef cpu_has_ufr +# define cpu_has_ufr (cpu_data[0].options & MIPS_CPU_UFR) +#endif + #ifndef cpu_has_fre # define cpu_has_fre (cpu_data[0].options & MIPS_CPU_FRE) #endif diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 9a8372484edc0f..98f59307e6a354 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -415,6 +415,7 @@ enum cpu_type_enum { #define MIPS_CPU_GUESTCTL2 MBIT_ULL(50) /* CPU has VZ GuestCtl2 register */ #define MIPS_CPU_GUESTID MBIT_ULL(51) /* CPU uses VZ ASE GuestID feature */ #define MIPS_CPU_DRG MBIT_ULL(52) /* CPU has VZ Direct Root to Guest (DRG) */ +#define MIPS_CPU_UFR MBIT_ULL(53) /* CPU supports User mode FR switching */ /* * CPU ASE encodings diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 07718bb5fc9d86..708f5913a8fe6a 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -289,6 +289,8 @@ static void cpu_set_fpu_opts(struct cpuinfo_mips *c) MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { if (c->fpu_id & MIPS_FPIR_3D) c->ases |= MIPS_ASE_MIPS3D; + if (c->fpu_id & MIPS_FPIR_UFRP) + c->options |= MIPS_CPU_UFR; if (c->fpu_id & MIPS_FPIR_FREP) c->options |= MIPS_CPU_FRE; } From f359a1115519122c7c3bc7fc4a01797dcf576709 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 
14 Mar 2017 10:15:09 +0000 Subject: [PATCH 015/227] MIPS: Separate MAAR V bit into VL and VH for XPA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MAAR V bit has been renamed VL since another bit called VH is added at the top of the register when it is extended to 64-bits on a 32-bit processor with XPA. Rename the V definition, fix the various users, and add definitions for the VH bit. Also add a definition for the MAARI Index field. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paul Burton Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/maar.h | 10 +++++----- arch/mips/include/asm/mipsregs.h | 8 +++++++- arch/mips/mm/init.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/mips/include/asm/maar.h b/arch/mips/include/asm/maar.h index 21d9607c80d7de..e10f78befbd9b2 100644 --- a/arch/mips/include/asm/maar.h +++ b/arch/mips/include/asm/maar.h @@ -36,7 +36,7 @@ unsigned platform_maar_init(unsigned num_pairs); * @upper: The highest address that the MAAR pair will affect. Must be * aligned to one byte before a 2^16 byte boundary. * @attrs: The accessibility attributes to program, eg. MIPS_MAAR_S. The - * MIPS_MAAR_V attribute will automatically be set. + * MIPS_MAAR_VL attribute will automatically be set. * * Program the pair of MAAR registers specified by idx to apply the attributes * specified by attrs to the range of addresses from lower to higher. @@ -49,10 +49,10 @@ static inline void write_maar_pair(unsigned idx, phys_addr_t lower, BUG_ON(((upper & 0xffff) != 0xffff) || ((upper & ~0xffffull) & ~(MIPS_MAAR_ADDR << 4))); - /* Automatically set MIPS_MAAR_V */ - attrs |= MIPS_MAAR_V; + /* Automatically set MIPS_MAAR_VL */ + attrs |= MIPS_MAAR_VL; - /* Write the upper address & attributes (only MIPS_MAAR_V matters) */ + /* Write the upper address & attributes (only MIPS_MAAR_VL matters) */ write_c0_maari(idx << 1); back_to_back_c0_hazard(); write_c0_maar(((upper >> 4) & MIPS_MAAR_ADDR) | attrs); @@ -81,7 +81,7 @@ extern void maar_init(void); * @upper: The highest address that the MAAR pair will affect. Must be * aligned to one byte before a 2^16 byte boundary. * @attrs: The accessibility attributes to program, eg. MIPS_MAAR_S. The - * MIPS_MAAR_V attribute will automatically be set. + * MIPS_MAAR_VL attribute will automatically be set. 
* * Describes the configuration of a pair of Memory Accessibility Attribute * Registers - applying attributes from attrs to the range of physical diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index f8d1d2f1d80d55..c20df608147977 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -34,8 +34,10 @@ */ #ifdef __ASSEMBLY__ #define _ULCAST_ +#define _U64CAST_ #else #define _ULCAST_ (unsigned long) +#define _U64CAST_ (u64) #endif /* @@ -719,10 +721,14 @@ #define XLR_PERFCTRL_ALLTHREADS (_ULCAST_(1) << 13) /* MAAR bit definitions */ +#define MIPS_MAAR_VH (_U64CAST_(1) << 63) #define MIPS_MAAR_ADDR ((BIT_ULL(BITS_PER_LONG - 12) - 1) << 12) #define MIPS_MAAR_ADDR_SHIFT 12 #define MIPS_MAAR_S (_ULCAST_(1) << 1) -#define MIPS_MAAR_V (_ULCAST_(1) << 0) +#define MIPS_MAAR_VL (_ULCAST_(1) << 0) + +/* MAARI bit definitions */ +#define MIPS_MAARI_INDEX (_ULCAST_(0x3f) << 0) /* EBase bit definitions */ #define MIPS_EBASE_CPUNUM_SHIFT 0 diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index aa75849c36bcdd..3ca20283b31eaf 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -348,7 +348,7 @@ void maar_init(void) upper = ((upper & MIPS_MAAR_ADDR) << 4) | 0xffff; pr_info(" [%d]: ", i / 2); - if (!(attr & MIPS_MAAR_V)) { + if (!(attr & MIPS_MAAR_VL)) { pr_cont("disabled\n"); continue; } From a7c7ad6c3e0a2ae88625df4e41c1c7305aa26f04 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:10 +0000 Subject: [PATCH 016/227] MIPS: Probe guest CP0_UserLocal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probe for presence of guest CP0_UserLocal register and expose via cpu_guest_has_userlocal. This register is optional pre-r6, so this will allow KVM to only save/restore/expose the guest CP0_UserLocal register if it exists. 
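As an illustrative aside (not part of this patch): a VZ setup path could key off the new flag when deciding what to advertise in the guest's Config3. Only cpu_guest_has_userlocal and MIPS_CONF3_ULRI below come from this series; the helper itself is invented for the sketch.

static unsigned int kvm_vz_sketch_config3(unsigned int config3)
{
	/* Advertise UserLocal only if the guest context implements it. */
	if (!cpu_guest_has_userlocal)
		config3 &= ~MIPS_CONF3_ULRI;
	return config3;
}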
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/cpu-features.h | 3 +++ arch/mips/kernel/cpu-probe.c | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index e12d4ec6854d60..e898f441cc2227 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -547,6 +547,9 @@ #ifndef cpu_guest_has_maar #define cpu_guest_has_maar (cpu_data[0].guest.options & MIPS_CPU_MAAR) #endif +#ifndef cpu_guest_has_userlocal +#define cpu_guest_has_userlocal (cpu_data[0].guest.options & MIPS_CPU_ULRI) +#endif /* * Guest dynamic capabilities diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 708f5913a8fe6a..29dfdb64ad0b63 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1005,7 +1005,8 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c) unsigned int config3, config3_dyn; probe_gc0_config_dyn(config3, config3, config3_dyn, - MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_CTXTC); + MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_ULRI | + MIPS_CONF3_CTXTC); if (config3 & MIPS_CONF3_CTXTC) c->guest.options |= MIPS_CPU_CTXTC; @@ -1015,6 +1016,9 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c) if (config3 & MIPS_CONF3_PW) c->guest.options |= MIPS_CPU_HTW; + if (config3 & MIPS_CONF3_ULRI) + c->guest.options |= MIPS_CPU_ULRI; + if (config3 & MIPS_CONF3_SC) c->guest.options |= MIPS_CPU_SEGMENTS; From a929bdc52be658fd622c2d0f9fdf1d333aea34fd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:11 +0000 Subject: [PATCH 017/227] MIPS: Probe guest MVH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probe for availability of M{T,F}HC0 instructions used with e.g. XPA in the VZ guest context, and make it available via cpu_guest_has_mvh. This will be helpful in properly emulating the MAAR registers in KVM for MIPS VZ.
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/cpu-features.h | 3 +++ arch/mips/kernel/cpu-probe.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index e898f441cc2227..494d3827414269 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -532,6 +532,9 @@ #ifndef cpu_guest_has_htw #define cpu_guest_has_htw (cpu_data[0].guest.options & MIPS_CPU_HTW) #endif +#ifndef cpu_guest_has_mvh +#define cpu_guest_has_mvh (cpu_data[0].guest.options & MIPS_CPU_MVH) +#endif #ifndef cpu_guest_has_msa #define cpu_guest_has_msa (cpu_data[0].guest.ases & MIPS_ASE_MSA) #endif diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 29dfdb64ad0b63..c72a4cda389ce8 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1057,7 +1057,7 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c) unsigned int config5, config5_dyn; probe_gc0_config_dyn(config5, config5, config5_dyn, - MIPS_CONF_M | MIPS_CONF5_MRP); + MIPS_CONF_M | MIPS_CONF5_MVH | MIPS_CONF5_MRP); if (config5 & MIPS_CONF5_MRP) c->guest.options |= MIPS_CPU_MAAR; @@ -1067,6 +1067,9 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c) if (config5 & MIPS_CONF5_LLB) c->guest.options |= MIPS_CPU_RW_LLB; + if (config5 & MIPS_CONF5_MVH) + c->guest.options |= MIPS_CPU_MVH; + if (config5 & MIPS_CONF_M) c->guest.conf |= BIT(6); return config5 & MIPS_CONF_M; From eb0bab386622ecd5626462dc9eeeab082ac8b717 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:12 +0000 Subject: [PATCH 018/227] MIPS: Add some missing guest CP0 accessors & defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add some missing guest accessors and register field definitions for KVM for MIPS VZ to make use of. Guest CP0_LLAddr register accessors and definitions for the LLB field allow KVM to clear the guest LLB to cancel in-progress LL/SC atomics on restore, and to emulate accesses by the guest to the CP0_LLAddr register. Bitwise modifiers and definitions for the guest CP0_Wired and CP0_Config1 registers allow KVM to modify fields within the CP0_Wired and CP0_Config1 registers. Finally a definition for the CP0_Config5.SBRI bit allows KVM to initialise and allow modification of the guest version of the SBRI bit. 
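As a hedged illustration of the LLAddr use case described above (the function name is invented; the accessors and MIPS_LLADDR_LLB are the ones added by this patch), a VZ restore path might cancel an in-flight guest LL/SC like this:

static void kvm_vz_sketch_cancel_llb(void)
{
	/* Clear guest LLB so a stale LL cannot complete its SC. */
	write_gc0_lladdr(read_gc0_lladdr() & ~MIPS_LLADDR_LLB);
}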
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/mipsregs.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index c20df608147977..c6b8f96b80f963 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -219,8 +219,10 @@ /* * Wired register bits */ -#define MIPSR6_WIRED_LIMIT (_ULCAST_(0xffff) << 16) -#define MIPSR6_WIRED_WIRED (_ULCAST_(0xffff) << 0) +#define MIPSR6_WIRED_LIMIT_SHIFT 16 +#define MIPSR6_WIRED_LIMIT (_ULCAST_(0xffff) << MIPSR6_WIRED_LIMIT_SHIFT) +#define MIPSR6_WIRED_WIRED_SHIFT 0 +#define MIPSR6_WIRED_WIRED (_ULCAST_(0xffff) << MIPSR6_WIRED_WIRED_SHIFT) /* * Values used for computation of new tlb entries @@ -647,6 +649,7 @@ #define MIPS_CONF5_LLB (_ULCAST_(1) << 4) #define MIPS_CONF5_MVH (_ULCAST_(1) << 5) #define MIPS_CONF5_VP (_ULCAST_(1) << 7) +#define MIPS_CONF5_SBRI (_ULCAST_(1) << 6) #define MIPS_CONF5_FRE (_ULCAST_(1) << 8) #define MIPS_CONF5_UFE (_ULCAST_(1) << 9) #define MIPS_CONF5_MSAEN (_ULCAST_(1) << 27) @@ -742,6 +745,10 @@ #define MIPS_CMGCRB_BASE 11 #define MIPS_CMGCRF_BASE (~_ULCAST_((1 << MIPS_CMGCRB_BASE) - 1)) +/* LLAddr bit definitions */ +#define MIPS_LLADDR_LLB_SHIFT 0 +#define MIPS_LLADDR_LLB (_ULCAST_(1) << MIPS_LLADDR_LLB_SHIFT) + /* * Bits in the MIPS32 Memory Segmentation registers. */ @@ -2018,6 +2025,9 @@ do { \ #define write_gc0_config6(val) __write_32bit_gc0_register(16, 6, val) #define write_gc0_config7(val) __write_32bit_gc0_register(16, 7, val) +#define read_gc0_lladdr() __read_ulong_gc0_register(17, 0) +#define write_gc0_lladdr(val) __write_ulong_gc0_register(17, 0, val) + #define read_gc0_watchlo0() __read_ulong_gc0_register(18, 0) #define read_gc0_watchlo1() __read_ulong_gc0_register(18, 1) #define read_gc0_watchlo2() __read_ulong_gc0_register(18, 2) @@ -2702,9 +2712,11 @@ __BUILD_SET_C0(brcm_mode) */ #define __BUILD_SET_GC0(name) __BUILD_SET_COMMON(gc0_##name) +__BUILD_SET_GC0(wired) __BUILD_SET_GC0(status) __BUILD_SET_GC0(cause) __BUILD_SET_GC0(ebase) +__BUILD_SET_GC0(config1) /* * Return low 10 bits of ebase. From a6c09b9f9d1c3b1daf8e24e5556d73b9303843ef Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:13 +0000 Subject: [PATCH 019/227] MIPS: asm/tlb.h: Add UNIQUE_GUEST_ENTRYHI() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a distinct UNIQUE_GUEST_ENTRYHI() macro for invalidation of guest TLB entries by KVM, using addresses in KSeg1 rather than KSeg0. This avoids conflicts with guest invalidation routines when there is no EHINV bit to mark the whole entry as invalid, avoiding guest machine check exceptions on Cavium Octeon III. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/tlb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h index dd179fd8acdac4..939734de435908 100644 --- a/arch/mips/include/asm/tlb.h +++ b/arch/mips/include/asm/tlb.h @@ -21,9 +21,11 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#define UNIQUE_ENTRYHI(idx) \ - ((CKSEG0 + ((idx) << (PAGE_SHIFT + 1))) | \ +#define _UNIQUE_ENTRYHI(base, idx) \ + (((base) + ((idx) << (PAGE_SHIFT + 1))) | \ (cpu_has_tlbinv ? 
MIPS_ENTRYHI_EHINV : 0)) +#define UNIQUE_ENTRYHI(idx) _UNIQUE_ENTRYHI(CKSEG0, idx) +#define UNIQUE_GUEST_ENTRYHI(idx) _UNIQUE_ENTRYHI(CKSEG1, idx) static inline unsigned int num_wired_entries(void) { From 955d8dc3ee555e9320fabbeab0969f9cf7660f9d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:14 +0000 Subject: [PATCH 020/227] KVM: MIPS: Implement HYPCALL emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emulate the HYPCALL instruction added in the VZ ASE and used by the MIPS paravirtualised guest support that is already merged. The new hypcall.c handles arguments and the return value. No actual hypercalls are yet supported, but this still allows us to safely step over hypercalls and set an error code in the return value for forward compatibility. Non-zero HYPCALL codes are not handled. We also document the hypercall ABI which asm/kvm_para.h uses. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Andreas Herrmann Cc: David Daney Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/hypercalls.txt | 5 +++ arch/mips/include/asm/kvm_host.h | 7 ++++ arch/mips/include/uapi/asm/inst.h | 2 +- arch/mips/kvm/Makefile | 1 + arch/mips/kvm/emulate.c | 3 ++ arch/mips/kvm/hypcall.c | 53 ++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 4 ++ 7 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 arch/mips/kvm/hypcall.c diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index feaaa634f154bb..a890529c63ed6a 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -28,6 +28,11 @@ S390: property inside the device tree's /hypervisor node. For more information refer to Documentation/virtual/kvm/ppc-pv.txt +MIPS: + KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall + number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and + the return value is placed in $2 (v0). 
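As a sketch of a guest-side call following the ABI documented above (not part of the patch): the register assignments mirror the text, the function name is invented, and the .word encoding assumes HYPCALL with code 0 encodes as 0x42000028 for toolchains whose assembler lacks the mnemonic.

static inline unsigned long sketch_hypcall1(unsigned long num, unsigned long arg0)
{
	register unsigned long n asm("$2") = num;	/* v0: hypercall number / return */
	register unsigned long a asm("$4") = arg0;	/* a0: first argument */

	asm volatile(".word 0x42000028"			/* hypcall, code 0 */
		     : "+r" (n)
		     : "r" (a)
		     : "memory");
	return n;
}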
+ KVM Hypercalls Documentation =========================== The template for each hypercall is: diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 05e785fc061daa..0d308d4f2429f9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -229,6 +229,7 @@ enum emulation_result { EMULATE_WAIT, /* WAIT instruction */ EMULATE_PRIV_FAIL, EMULATE_EXCEPT, /* A guest exception has been generated */ + EMULATE_HYPERCALL, /* HYPCALL instruction */ }; #define mips3_paddr_to_tlbpfn(x) \ @@ -832,6 +833,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu); +/* Hypercalls (hypcall.c) */ + +enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu, + union mips_instruction inst); +int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu); + /* Dynamic binary translation */ extern int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 77429d1622b343..b5e46ae872d3b5 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -179,7 +179,7 @@ enum cop0_coi_func { tlbr_op = 0x01, tlbwi_op = 0x02, tlbwr_op = 0x06, tlbp_op = 0x08, rfe_op = 0x10, eret_op = 0x18, - wait_op = 0x20, + wait_op = 0x20, hypcall_op = 0x28 }; /* diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 847429de780d3b..e56403c8a3f5f9 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -10,6 +10,7 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o +kvm-objs += hypcall.o kvm-objs += mmu.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index d40cfaad45295c..637753ea0a00e7 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1143,6 +1143,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, case wait_op: er = kvm_mips_emul_wait(vcpu); break; + case hypcall_op: + er = kvm_mips_emul_hypcall(vcpu, inst); + break; } } else { rt = inst.c0r_format.rt; diff --git a/arch/mips/kvm/hypcall.c b/arch/mips/kvm/hypcall.c new file mode 100644 index 00000000000000..83063435195f57 --- /dev/null +++ b/arch/mips/kvm/hypcall.c @@ -0,0 +1,53 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: Hypercall handling. + * + * Copyright (C) 2015 Imagination Technologies Ltd. 
+ */ + +#include +#include +#include + +#define MAX_HYPCALL_ARGS 4 + +enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu, + union mips_instruction inst) +{ + unsigned int code = (inst.co_format.code >> 5) & 0x3ff; + + kvm_debug("[%#lx] HYPCALL %#03x\n", vcpu->arch.pc, code); + + switch (code) { + case 0: + return EMULATE_HYPERCALL; + default: + return EMULATE_FAIL; + }; +} + +static int kvm_mips_hypercall(struct kvm_vcpu *vcpu, unsigned long num, + const unsigned long *args, unsigned long *hret) +{ + /* Report unimplemented hypercall to guest */ + *hret = -KVM_ENOSYS; + return RESUME_GUEST; +} + +int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu) +{ + unsigned long num, args[MAX_HYPCALL_ARGS]; + + /* read hypcall number and arguments */ + num = vcpu->arch.gprs[2]; /* v0 */ + args[0] = vcpu->arch.gprs[4]; /* a0 */ + args[1] = vcpu->arch.gprs[5]; /* a1 */ + args[2] = vcpu->arch.gprs[6]; /* a2 */ + args[3] = vcpu->arch.gprs[7]; /* a3 */ + + return kvm_mips_hypercall(vcpu, num, + args, &vcpu->arch.gprs[2] /* v0 */); +} diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index b1fa53b252eab2..3a854bb9e606be 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -82,6 +82,10 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) ret = RESUME_HOST; break; + case EMULATE_HYPERCALL: + ret = kvm_mips_handle_hypcall(vcpu); + break; + default: BUG(); } From 8b48d5b75dc4cbae8cfe83648f9e4e10cb407dce Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:15 +0000 Subject: [PATCH 021/227] KVM: MIPS/Emulate: De-duplicate MMIO emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor MIPS KVM MMIO load/store emulation to reduce code duplication. Each duplicate differed slightly anyway, and it will simplify adding 64-bit MMIO support for VZ. kvm_mips_emulate_store() and kvm_mips_emulate_load() can now return EMULATE_DO_MMIO (as possibly originally intended). We therefore stop calling either of these from kvm_mips_emulate_inst(), which is now only used by kvm_trap_emul_handle_cop_unusable() which is picky about return values. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 206 ++++++++++------------------------------ 1 file changed, 50 insertions(+), 156 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 637753ea0a00e7..e0f74ee2aad862 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1477,9 +1477,8 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, struct kvm_run *run, struct kvm_vcpu *vcpu) { - enum emulation_result er = EMULATE_DO_MMIO; + enum emulation_result er; u32 rt; - u32 bytes; void *data = run->mmio.data; unsigned long curr_pc; @@ -1494,103 +1493,63 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, rt = inst.i_format.rt; - switch (inst.i_format.opcode) { - case sb_op: - bytes = 1; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. 
- host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u8 *) data = vcpu->arch.gprs[rt]; - kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", - vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt], - *(u8 *) data); - - break; + run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa( + vcpu->arch.host_cp0_badvaddr); + if (run->mmio.phys_addr == KVM_INVALID_ADDR) + goto out_fail; + switch (inst.i_format.opcode) { case sw_op: - bytes = 4; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u32 *) data = vcpu->arch.gprs[rt]; + run->mmio.len = 4; + *(u32 *)data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(u32 *) data); + vcpu->arch.gprs[rt], *(u32 *)data); break; case sh_op: - bytes = 2; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u16 *) data = vcpu->arch.gprs[rt]; + run->mmio.len = 2; + *(u16 *)data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(u32 *) data); + vcpu->arch.gprs[rt], *(u16 *)data); + break; + + case sb_op: + run->mmio.len = 1; + *(u8 *)data = vcpu->arch.gprs[rt]; + + kvm_debug("[%#lx] OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", + vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, + vcpu->arch.gprs[rt], *(u8 *)data); break; default: kvm_err("Store not yet supported (inst=0x%08x)\n", inst.word); - er = EMULATE_FAIL; - break; + goto out_fail; } - /* Rollback PC if emulation was unsuccessful */ - if (er == EMULATE_FAIL) - vcpu->arch.pc = curr_pc; + run->mmio.is_write = 1; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 1; + return EMULATE_DO_MMIO; - return er; +out_fail: + /* Rollback PC if emulation was unsuccessful */ + vcpu->arch.pc = curr_pc; + return EMULATE_FAIL; } enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - enum emulation_result er = EMULATE_DO_MMIO; + enum emulation_result er; unsigned long curr_pc; u32 op, rt; - u32 bytes; rt = inst.i_format.rt; op = inst.i_format.opcode; @@ -1609,94 +1568,41 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, vcpu->arch.io_gpr = rt; + run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa( + vcpu->arch.host_cp0_badvaddr); + if (run->mmio.phys_addr == KVM_INVALID_ADDR) + return EMULATE_FAIL; + + vcpu->mmio_needed = 2; /* signed */ switch (op) { case lw_op: - bytes = 4; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. 
- host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 0; + run->mmio.len = 4; break; - case lh_op: case lhu_op: - bytes = 2; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 0; - - if (op == lh_op) - vcpu->mmio_needed = 2; - else - vcpu->mmio_needed = 1; - + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ + case lh_op: + run->mmio.len = 2; break; case lbu_op: + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ case lb_op: - bytes = 1; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_is_write = 0; - - if (op == lb_op) - vcpu->mmio_needed = 2; - else - vcpu->mmio_needed = 1; - + run->mmio.len = 1; break; default: kvm_err("Load not yet supported (inst=0x%08x)\n", inst.word); - er = EMULATE_FAIL; - break; + vcpu->mmio_needed = 0; + return EMULATE_FAIL; } - return er; + run->mmio.is_write = 0; + vcpu->mmio_is_write = 0; + return EMULATE_DO_MMIO; } static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), @@ -1873,18 +1779,6 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, case cop0_op: er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu); break; - case sb_op: - case sh_op: - case sw_op: - er = kvm_mips_emulate_store(inst, cause, run, vcpu); - break; - case lb_op: - case lbu_op: - case lhu_op: - case lh_op: - case lw_op: - er = kvm_mips_emulate_load(inst, cause, run, vcpu); - break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: From 59d7814ab26f925f209d532ae40f797f20359cec Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:16 +0000 Subject: [PATCH 022/227] KVM: MIPS/Emulate: Implement 64-bit MMIO emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement additional MMIO emulation for MIPS64, including 64-bit loads/stores, and 32-bit unsigned loads. These are only exposed on 64-bit VZ hosts. 
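As context (not part of this patch), a userspace VMM sees the new accesses as ordinary KVM_EXIT_MMIO exits with an 8-byte length. A rough sketch, with device_read64/device_write64 as invented stand-ins for a real device model:

#include <stdint.h>
#include <linux/kvm.h>

static uint64_t device_read64(uint64_t pa) { return 0; }	/* stub */
static void device_write64(uint64_t pa, uint64_t val) { }	/* stub */

static void sketch_handle_mmio8(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_MMIO || run->mmio.len != 8)
		return;
	if (run->mmio.is_write)
		device_write64(run->mmio.phys_addr, *(uint64_t *)run->mmio.data);
	else
		*(uint64_t *)run->mmio.data = device_read64(run->mmio.phys_addr);
}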
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index e0f74ee2aad862..d75ab8940e1fc5 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1499,6 +1499,17 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, goto out_fail; switch (inst.i_format.opcode) { +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ) + case sd_op: + run->mmio.len = 8; + *(u64 *)data = vcpu->arch.gprs[rt]; + + kvm_debug("[%#lx] OP_SD: eaddr: %#lx, gpr: %#lx, data: %#llx\n", + vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, + vcpu->arch.gprs[rt], *(u64 *)data); + break; +#endif + case sw_op: run->mmio.len = 4; *(u32 *)data = vcpu->arch.gprs[rt]; @@ -1575,6 +1586,15 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, vcpu->mmio_needed = 2; /* signed */ switch (op) { +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ) + case ld_op: + run->mmio.len = 8; + break; + + case lwu_op: + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ +#endif case lw_op: run->mmio.len = 4; break; @@ -2421,8 +2441,15 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, vcpu->arch.pc = vcpu->arch.io_pc; switch (run->mmio.len) { + case 8: + *gpr = *(s64 *)run->mmio.data; + break; + case 4: - *gpr = *(s32 *) run->mmio.data; + if (vcpu->mmio_needed == 2) + *gpr = *(s32 *)run->mmio.data; + else + *gpr = *(u32 *)run->mmio.data; break; case 2: From c58cf741795ad5df08dcc13d1d0ccd05a853b2a6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:17 +0000 Subject: [PATCH 023/227] KVM: MIPS: Update kvm_lose_fpu() for VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the implementation of kvm_lose_fpu() for VZ, where there is no need to enable the FPU/MSA in the root context if the FPU/MSA state is loaded but disabled in the guest context. The trap & emulate implementation needs to disable FPU/MSA in the root context when the guest disables them in order to catch the COP1 unusable or MSA disabled exception when they're used and pass it on to the guest. For VZ however as long as the context is loaded and enabled in the root context, the guest can enable and disable it in the guest context without the hypervisor having to do much, and will take guest exceptions without hypervisor intervention if used without being enabled in the guest context. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 15a1b1716c2eee..a743f67378ba76 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1527,16 +1527,18 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu) void kvm_lose_fpu(struct kvm_vcpu *vcpu) { /* - * FPU & MSA get disabled in root context (hardware) when it is disabled - * in guest context (software), but the register state in the hardware - * may still be in use. This is why we explicitly re-enable the hardware - * before saving. 
+ * With T&E, FPU & MSA get disabled in root context (hardware) when it + * is disabled in guest context (software), but the register state in + * the hardware may still be in use. + * This is why we explicitly re-enable the hardware before saving. */ preempt_disable(); if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { - set_c0_config5(MIPS_CONF5_MSAEN); - enable_fpu_hazard(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + set_c0_config5(MIPS_CONF5_MSAEN); + enable_fpu_hazard(); + } __kvm_save_msa(&vcpu->arch); trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA); @@ -1549,8 +1551,10 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) } vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA); } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { - set_c0_status(ST0_CU1); - enable_fpu_hazard(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + set_c0_status(ST0_CU1); + enable_fpu_hazard(); + } __kvm_save_fpu(&vcpu->arch); vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; From a7244920d1096c267c991a7506a519cdb92d7a24 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:18 +0000 Subject: [PATCH 024/227] KVM: MIPS: Extend counters & events for VZ GExcCodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend MIPS KVM stats counters and kvm_transition trace event codes to cover hypervisor exceptions, which have their own GExcCode field in CP0_GuestCtl0 with up to 32 hypervisor exception cause codes. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 10 ++++++++++ arch/mips/kvm/mips.c | 10 ++++++++++ arch/mips/kvm/trace.h | 18 +++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 0d308d4f2429f9..e8843784b784a3 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -145,6 +145,16 @@ struct kvm_vcpu_stat { u64 fpe_exits; u64 msa_disabled_exits; u64 flush_dcache_exits; +#ifdef CONFIG_KVM_MIPS_VZ + u64 vz_gpsi_exits; + u64 vz_gsfc_exits; + u64 vz_hc_exits; + u64 vz_grr_exits; + u64 vz_gva_exits; + u64 vz_ghfc_exits; + u64 vz_gpa_exits; + u64 vz_resvd_exits; +#endif u64 halt_successful_poll; u64 halt_attempted_poll; u64 halt_poll_invalid; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a743f67378ba76..c507533ef6ea97 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -59,6 +59,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpe", VCPU_STAT(fpe_exits), KVM_STAT_VCPU }, { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU }, { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, +#ifdef CONFIG_KVM_MIPS_VZ + { "vz_gpsi", VCPU_STAT(vz_gpsi_exits), KVM_STAT_VCPU }, + { "vz_gsfc", VCPU_STAT(vz_gsfc_exits), KVM_STAT_VCPU }, + { "vz_hc", VCPU_STAT(vz_hc_exits), KVM_STAT_VCPU }, + { "vz_grr", VCPU_STAT(vz_grr_exits), KVM_STAT_VCPU }, + { "vz_gva", VCPU_STAT(vz_gva_exits), KVM_STAT_VCPU }, + { "vz_ghfc", VCPU_STAT(vz_ghfc_exits), KVM_STAT_VCPU }, + { "vz_gpa", VCPU_STAT(vz_gpa_exits), KVM_STAT_VCPU }, + { "vz_resvd", VCPU_STAT(vz_resvd_exits), KVM_STAT_VCPU }, +#endif { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU }, diff --git 
a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index c858cf16807840..6e43c89114b831 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -66,6 +66,15 @@ DEFINE_EVENT(kvm_transition, kvm_out, #define KVM_TRACE_EXIT_WAIT 32 #define KVM_TRACE_EXIT_CACHE 33 #define KVM_TRACE_EXIT_SIGNAL 34 +/* 32 exit reasons correspond to GuestCtl0.GExcCode (VZ) */ +#define KVM_TRACE_EXIT_GEXCCODE_BASE 64 +#define KVM_TRACE_EXIT_GPSI 64 /* 0 */ +#define KVM_TRACE_EXIT_GSFC 65 /* 1 */ +#define KVM_TRACE_EXIT_HC 66 /* 2 */ +#define KVM_TRACE_EXIT_GRR 67 /* 3 */ +#define KVM_TRACE_EXIT_GVA 72 /* 8 */ +#define KVM_TRACE_EXIT_GHFC 73 /* 9 */ +#define KVM_TRACE_EXIT_GPA 74 /* 10 */ /* Tracepoints for VM exits */ #define kvm_trace_symbol_exit_types \ @@ -85,7 +94,14 @@ DEFINE_EVENT(kvm_transition, kvm_out, { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \ { KVM_TRACE_EXIT_WAIT, "WAIT" }, \ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \ - { KVM_TRACE_EXIT_SIGNAL, "Signal" } + { KVM_TRACE_EXIT_SIGNAL, "Signal" }, \ + { KVM_TRACE_EXIT_GPSI, "GPSI" }, \ + { KVM_TRACE_EXIT_GSFC, "GSFC" }, \ + { KVM_TRACE_EXIT_HC, "HC" }, \ + { KVM_TRACE_EXIT_GRR, "GRR" }, \ + { KVM_TRACE_EXIT_GVA, "GVA" }, \ + { KVM_TRACE_EXIT_GHFC, "GHFC" }, \ + { KVM_TRACE_EXIT_GPA, "GPA" } TRACE_EVENT(kvm_exit, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), From a8a3c426772e55ae9c3209f061cb6317268f932c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:19 +0000 Subject: [PATCH 025/227] KVM: MIPS: Add VZ & TE capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new KVM_CAP_MIPS_VZ and KVM_CAP_MIPS_TE capabilities, and in order to allow MIPS KVM to support VZ without confusing old users (which expect the trap & emulate implementation), define and start checking KVM_CREATE_VM type codes. The codes available are: - KVM_VM_MIPS_TE = 0 This is the current value expected from the user, and will create a VM using trap & emulate in user mode, confined to the user mode address space. This may in future become unavailable if the kernel is only configured to support VZ, in which case the EINVAL error will be returned and KVM_CAP_MIPS_TE won't be available even though KVM_CAP_MIPS_VZ is. - KVM_VM_MIPS_VZ = 1 This can be provided when the KVM_CAP_MIPS_VZ capability is available to create a VM using VZ, with a fully virtualized guest virtual address space. If VZ support is unavailable in the kernel, the EINVAL error will be returned (although old kernels without the KVM_CAP_MIPS_VZ capability may well succeed and create a trap & emulate VM). This is designed to allow the desired implementation (T&E vs VZ) to be potentially chosen at runtime rather than being fixed in the kernel configuration. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 47 ++++++++++++++++++++++++++++++- arch/mips/kvm/mips.c | 9 ++++++ include/uapi/linux/kvm.h | 6 ++++ 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3c248f772ae616..4b5fa2571efa45 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -115,12 +115,17 @@ will access the virtual machine's physical address space; offset zero corresponds to guest physical address zero. 
Use of mmap() on a VM fd is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is available. -You most certainly want to use 0 as machine type. +You probably want to use 0 as machine type. In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + 4.3 KVM_GET_MSR_INDEX_LIST @@ -4147,3 +4152,43 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + + 0: The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1: The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. + +8.6 KVM_CAP_MIPS_TE + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. 
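An illustrative userspace sketch of the probing sequence described above (not part of the patch; the open()/ioctl() calls are the standard KVM device interface, and error handling is omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sketch_create_mips_vm(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int type = KVM_VM_MIPS_TE;		/* 0: trap & emulate */

	/* Prefer VZ when the kernel advertises KVM_CAP_MIPS_VZ. */
	if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MIPS_VZ) > 0)
		type = KVM_VM_MIPS_VZ;		/* 1: full VZ ASE */

	return ioctl(kvm, KVM_CREATE_VM, type);
}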
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c507533ef6ea97..476ece99bf3be4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -107,6 +107,14 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + switch (type) { + case KVM_VM_MIPS_TE: + break; + default: + /* Unsupported KVM type */ + return -EINVAL; + }; + /* Allocate page table to map GPA -> RPA */ kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); if (!kvm->arch.gpa_mm.pgd) @@ -1038,6 +1046,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_MIPS_TE: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377ec..58ddedce423521 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 +/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ +#define KVM_VM_MIPS_TE 0 +#define KVM_VM_MIPS_VZ 1 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -883,6 +887,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_MIPS_VZ 137 +#define KVM_CAP_MIPS_TE 138 #ifdef KVM_CAP_IRQ_ROUTING From 578fd61d2d210a3b58dc107f5382b965922ac253 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:20 +0000 Subject: [PATCH 026/227] KVM: MIPS: Add 64BIT capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new KVM_CAP_MIPS_64BIT capability to indicate that 64-bit MIPS guests are available and supported. In this case it should still be possible to run 32-bit guest code. If not available it won't be possible to run 64-bit guest code and the instructions may not be available, or the kernel may not support full context switching of 64-bit registers. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 25 +++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 26 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4b5fa2571efa45..1b8486c094b431 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4192,3 +4192,28 @@ to KVM_CREATE_VM to create a VM which utilises it. If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT + +Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + + 0: MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. 
+ It will also be possible to run 32-bit guest code. + + 2: MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 58ddedce423521..1e1a6c728a1835 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -889,6 +889,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_IMMEDIATE_EXIT 136 #define KVM_CAP_MIPS_VZ 137 #define KVM_CAP_MIPS_TE 138 +#define KVM_CAP_MIPS_64BIT 139 #ifdef KVM_CAP_IRQ_ROUTING From a517c1ad65921765d5ac8594567c7bb6f352655c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:21 +0000 Subject: [PATCH 027/227] KVM: MIPS: Init timer frequency from callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the software emulated timer is initialised to a frequency of 100MHz by kvm_mips_init_count(), but this isn't suitable for VZ where the frequency of the guest timer matches that of the host. Add a count_hz argument so the caller can specify the default frequency, and move the call from kvm_arch_vcpu_create() to the implementation specific vcpu_setup() callback, so that VZ can specify a different frequency. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/emulate.c | 13 ++++++------- arch/mips/kvm/mips.c | 3 --- arch/mips/kvm/trap_emul.c | 3 +++ 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index e8843784b784a3..f0227c0d27801c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -806,7 +806,7 @@ extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack); -void kvm_mips_init_count(struct kvm_vcpu *vcpu); +void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz); int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index d75ab8940e1fc5..f09a161926e709 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -543,16 +543,15 @@ void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count) /** * kvm_mips_init_count() - Initialise timer. * @vcpu: Virtual CPU. + * @count_hz: Frequency of timer. * - * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set - * it going if it's enabled. + * Initialise the timer to the specified frequency, zero it, and set it going if + * it's enabled. 
*/ -void kvm_mips_init_count(struct kvm_vcpu *vcpu) +void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz) { - /* 100 MHz */ - vcpu->arch.count_hz = 100*1000*1000; - vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, - vcpu->arch.count_hz); + vcpu->arch.count_hz = count_hz; + vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz); vcpu->arch.count_dyn_bias = 0; /* Starting at 0 */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 476ece99bf3be4..36561df8a7a792 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -372,9 +372,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Init */ vcpu->arch.last_sched_cpu = -1; - /* Start off the timer */ - kvm_mips_init_count(vcpu); - return vcpu; out_free_gebase: diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3a854bb9e606be..0e3d260aafcb5e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -565,6 +565,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) u32 config, config1; int vcpu_id = vcpu->vcpu_id; + /* Start off the timer at 100 MHz */ + kvm_mips_init_count(vcpu, 100*1000*1000); + /* * Arch specific stuff, set up config registers properly so that the * guest will come up as expected From 607ef2fdeb95ae63322321ca15b41f76870f48f1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:22 +0000 Subject: [PATCH 028/227] KVM: MIPS: Add callback to check extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation callback for checking presence of KVM extensions. This allows implementation specific extensions to be provided without ifdefs in mips.c. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mips.c | 3 +-- arch/mips/kvm/trap_emul.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f0227c0d27801c..3a52a6f0573b6f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -542,6 +542,7 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); + int (*check_extension)(struct kvm *kvm, long ext); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 36561df8a7a792..78d58c2528a9b2 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1043,7 +1043,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: - case KVM_CAP_MIPS_TE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1075,7 +1074,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF); break; default: - r = 0; + r = kvm_mips_callbacks->check_extension(kvm, ext); break; } return r; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 0e3d260aafcb5e..db3c48634eda1c 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -488,6 +488,22 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } +static int 
kvm_trap_emul_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_MIPS_TE: + r = 1; + break; + default: + r = 0; + break; + } + + return r; +} + static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -1238,6 +1254,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, + .check_extension = kvm_trap_emul_check_extension, .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, From edab4fe1f6419fc3a1de3b134929ae2123bea3ad Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:23 +0000 Subject: [PATCH 029/227] KVM: MIPS: Add hardware_{enable,disable} callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation callback for the kvm_arch_hardware_enable() and kvm_arch_hardware_disable() architecture functions, with simple stubs for trap & emulate. This is in preparation for VZ which will make use of them. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 ++- arch/mips/kvm/mips.c | 7 ++++++- arch/mips/kvm/trap_emul.c | 11 +++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 3a52a6f0573b6f..acfe7e4e8a3c36 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -542,6 +542,8 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); + int (*hardware_enable)(void); + void (*hardware_disable)(void); int (*check_extension)(struct kvm *kvm, long ext); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); @@ -864,7 +866,6 @@ extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu); extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 78d58c2528a9b2..5681117083af55 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -92,7 +92,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void) { - return 0; + return kvm_mips_callbacks->hardware_enable(); +} + +void kvm_arch_hardware_disable(void) +{ + kvm_mips_callbacks->hardware_disable(); } int kvm_arch_hardware_setup(void) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index db3c48634eda1c..50b812bfe08321 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -488,6 +488,15 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } +static int kvm_trap_emul_hardware_enable(void) +{ + return 0; +} + +static void kvm_trap_emul_hardware_disable(void) +{ +} + static int kvm_trap_emul_check_extension(struct kvm *kvm, long ext) { int r; @@ -1254,6 +1263,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_fpe = kvm_trap_emul_handle_fpe, 
.handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, + .hardware_enable = kvm_trap_emul_hardware_enable, + .hardware_disable = kvm_trap_emul_hardware_disable, .check_extension = kvm_trap_emul_check_extension, .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, From 28c1e762b01eee56ada8148c88c4f1e99beb5584 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:24 +0000 Subject: [PATCH 030/227] KVM: MIPS: Add guest exit exception callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a callback for MIPS KVM implementations to handle the VZ guest exit exception. Currently the trap & emulate implementation contains a stub which reports an internal error, but the callback will be used properly by the VZ implementation. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mips.c | 5 +++++ arch/mips/kvm/trace.h | 2 ++ arch/mips/kvm/trap_emul.c | 24 ++++++++++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index acfe7e4e8a3c36..32229a1a4a7953 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -542,6 +542,7 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); + int (*handle_guest_exit)(struct kvm_vcpu *vcpu); int (*hardware_enable)(void); void (*hardware_disable)(void); int (*check_extension)(struct kvm *kvm, long ext); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5681117083af55..6e91c24162788e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1348,6 +1348,11 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) ret = kvm_mips_callbacks->handle_msa_disabled(vcpu); break; + case EXCCODE_GE: + /* defer exit accounting to handler */ + ret = kvm_mips_callbacks->handle_guest_exit(vcpu); + break; + default: if (cause & CAUSEF_BD) opc += 1; diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index 6e43c89114b831..0c59282a2f7d4b 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -62,6 +62,7 @@ DEFINE_EVENT(kvm_transition, kvm_out, #define KVM_TRACE_EXIT_MSA_FPE 14 #define KVM_TRACE_EXIT_FPE 15 #define KVM_TRACE_EXIT_MSA_DISABLED 21 +#define KVM_TRACE_EXIT_GUEST_EXIT 27 /* Further exit reasons */ #define KVM_TRACE_EXIT_WAIT 32 #define KVM_TRACE_EXIT_CACHE 33 @@ -92,6 +93,7 @@ DEFINE_EVENT(kvm_transition, kvm_out, { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \ { KVM_TRACE_EXIT_FPE, "FPE" }, \ { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \ + { KVM_TRACE_EXIT_GUEST_EXIT, "Guest Exit" }, \ { KVM_TRACE_EXIT_WAIT, "WAIT" }, \ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \ { KVM_TRACE_EXIT_SIGNAL, "Signal" }, \ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 50b812bfe08321..fda45b4bdebcb6 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -40,6 +40,29 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) return gpa; } +static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu) +{ + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE; + unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; + u32 inst = 0; + + /* + 
* Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", + exccode, opc, inst, badvaddr, + kvm_read_c0_guest_status(vcpu->arch.cop0)); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; +} + static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1262,6 +1285,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_msa_fpe = kvm_trap_emul_handle_msa_fpe, .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, + .handle_guest_exit = kvm_trap_emul_no_handler, .hardware_enable = kvm_trap_emul_hardware_enable, .hardware_disable = kvm_trap_emul_hardware_disable, From a27660f3e4b3a0740df3f86a556eecab4433ba9f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:25 +0000 Subject: [PATCH 031/227] KVM: MIPS: Abstract guest CP0 register access for VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract the MIPS KVM guest CP0 register access macros into inline functions which are generated by macros. This allows them to be generated differently for VZ, where they will usually need to access the hardware guest CP0 context rather than the saved values in RAM. Accessors for each individual register are generated using these macros: - __BUILD_KVM_*_SW() for registers which are not present in the VZ hardware guest context, so kvm_{read,write}_c0_guest_##name() will access the saved value in RAM regardless of whether VZ is enabled. - __BUILD_KVM_*_HW() for registers which are present in the VZ hardware guest context, so kvm_{read,write}_c0_guest_##name() will access the hardware register when VZ is enabled. These build the underlying accessors using further macros: - __BUILD_KVM_*_SAVED() builds e.g. kvm_{read,write}_sw_gc0_##name() functions for accessing the saved versions of the registers in RAM. This is used for implementing the common kvm_{read,write}_c0_guest_##name() accessors with T&E where registers are always stored in RAM, but are also available with VZ HW registers to allow them to be accessed while saved. - __BUILD_KVM_*_VZ() builds e.g. kvm_{read,write}_vz_gc0_##name() functions for accessing the VZ hardware guest context registers directly. This is used for implementing the common kvm_{read,write}_c0_guest_##name() accessors with VZ. - __BUILD_KVM_*_WRAP() builds wrappers with different names, which allows the common kvm_{read,write}_c0_guest_##name() functions to be implemented using the VZ accessors while still having the SAVED accessors available too. - __BUILD_KVM_SAVE_VZ() builds functions for saving and restoring VZ hardware guest context register state to RAM, improving conciseness of VZ context saving and restoring. Similar macros exist for generating modifiers (set, clear, change), either with a normal unlocked read/modify/write, or using atomic LL/SC sequences. These changes change the types of 32-bit registers to u32 instead of unsigned long, which requires some changes to printk() functions in MIPS KVM. 
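For illustration (this snippet is not part of the patch): Count is a register that stays in RAM even under VZ, so its accessors are built with __BUILD_KVM_RW_SW(); the generated functions reduce to roughly the following, which is why common emulation code can keep calling kvm_{read,write}_c0_guest_count() unchanged:

	/*
	 * Illustration only: approximate expansion of
	 * __BUILD_KVM_RW_SW(count, 32, MIPS_CP0_COUNT, 0).
	 */
	static inline u32 kvm_read_sw_gc0_count(struct mips_coproc *cop0)
	{
		return cop0->reg[MIPS_CP0_COUNT][0];
	}

	static inline void kvm_write_sw_gc0_count(struct mips_coproc *cop0,
						  u32 val)
	{
		cop0->reg[MIPS_CP0_COUNT][0] = val;
	}

	/* Wrappers used by common code, whether T&E or VZ is configured */
	static inline u32 kvm_read_c0_guest_count(struct mips_coproc *cop0)
	{
		return kvm_read_sw_gc0_count(cop0);
	}

	static inline void kvm_write_c0_guest_count(struct mips_coproc *cop0,
						    u32 val)
	{
		kvm_write_sw_gc0_count(cop0, val);
	}

For hardware-backed registers such as Cause, the same kvm_{read,write}_c0_guest_*() call sites instead resolve to read_gc0_*()/write_gc0_*() when CONFIG_KVM_MIPS_VZ is enabled.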
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 343 +++++++++++++++++++++++-------- arch/mips/kvm/emulate.c | 2 +- arch/mips/kvm/mips.c | 6 +- arch/mips/kvm/trap_emul.c | 8 +- 4 files changed, 264 insertions(+), 95 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 32229a1a4a7953..cd7488641db8d3 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -360,78 +360,6 @@ struct kvm_vcpu_arch { u8 msa_enabled; }; - -#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) -#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) -#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) -#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val)) -#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) -#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val)) -#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) -#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) -#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) -#define kvm_write_c0_guest_userlocal(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val)) -#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0]) -#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) -#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) -#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val)) -#define kvm_read_c0_guest_hwrena(cop0) (cop0->reg[MIPS_CP0_HWRENA][0]) -#define kvm_write_c0_guest_hwrena(cop0, val) (cop0->reg[MIPS_CP0_HWRENA][0] = (val)) -#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0]) -#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val)) -#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0]) -#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val)) -#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0]) -#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val)) -#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0]) -#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val)) -#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0]) -#define kvm_write_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] = (val)) -#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1]) -#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val)) -#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0]) -#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val)) -#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0]) -#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val)) -#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0]) -#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val)) -#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1]) -#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val)) -#define kvm_read_c0_guest_config(cop0) 
(cop0->reg[MIPS_CP0_CONFIG][0]) -#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1]) -#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2]) -#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3]) -#define kvm_read_c0_guest_config4(cop0) (cop0->reg[MIPS_CP0_CONFIG][4]) -#define kvm_read_c0_guest_config5(cop0) (cop0->reg[MIPS_CP0_CONFIG][5]) -#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7]) -#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val)) -#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val)) -#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val)) -#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val)) -#define kvm_write_c0_guest_config4(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][4] = (val)) -#define kvm_write_c0_guest_config5(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][5] = (val)) -#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) -#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) -#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) -#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2]) -#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3]) -#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4]) -#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5]) -#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6]) -#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7]) -#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val)) -#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val)) -#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val)) -#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val)) -#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val)) -#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val)) - -/* - * Some of the guest registers may be modified asynchronously (e.g. from a - * hrtimer callback in hard irq context) and therefore need stronger atomicity - * guarantees than other registers. 
- */ - static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long val) { @@ -482,26 +410,265 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, } while (unlikely(!temp)); } -#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val)) -#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val)) +/* Guest register types, used in accessor build below */ +#define __KVMT32 u32 +#define __KVMTl unsigned long -/* Cause can be modified asynchronously from hardirq hrtimer callback */ -#define kvm_set_c0_guest_cause(cop0, val) \ - _kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) -#define kvm_clear_c0_guest_cause(cop0, val) \ - _kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) -#define kvm_change_c0_guest_cause(cop0, change, val) \ - _kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], \ - change, val) - -#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val)) -#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val)) -#define kvm_change_c0_guest_ebase(cop0, change, val) \ +/* + * __BUILD_KVM_$ops_SAVED(): kvm_$op_sw_gc0_$reg() + * These operate on the saved guest C0 state in RAM. + */ + +/* Generate saved context simple accessors */ +#define __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ +static inline __KVMT##type kvm_read_sw_gc0_##name(struct mips_coproc *cop0) \ +{ \ + return cop0->reg[(_reg)][(sel)]; \ +} \ +static inline void kvm_write_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] = val; \ +} + +/* Generate saved context bitwise modifiers */ +#define __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ +static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] |= val; \ +} \ +static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] &= ~val; \ +} \ +static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + unsigned long _mask = mask; \ + cop0->reg[(_reg)][(sel)] &= ~_mask; \ + cop0->reg[(_reg)][(sel)] |= val & _mask; \ +} + +/* Generate saved context atomic bitwise modifiers */ +#define __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel) \ +static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + _kvm_atomic_set_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val); \ +} \ +static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + _kvm_atomic_clear_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val); \ +} \ +static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ { \ - kvm_clear_c0_guest_ebase(cop0, change); \ - kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \ + _kvm_atomic_change_c0_guest_reg(&cop0->reg[(_reg)][(sel)], mask, \ + val); \ } +/* + * __BUILD_KVM_$ops_VZ(): kvm_$op_vz_gc0_$reg() + * These operate on the VZ guest C0 context in hardware. 
+ */ + +/* Generate VZ guest context simple accessors */ +#define __BUILD_KVM_RW_VZ(name, type, _reg, sel) \ +static inline __KVMT##type kvm_read_vz_gc0_##name(struct mips_coproc *cop0) \ +{ \ + return read_gc0_##name(); \ +} \ +static inline void kvm_write_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + write_gc0_##name(val); \ +} + +/* Generate VZ guest context bitwise modifiers */ +#define __BUILD_KVM_SET_VZ(name, type, _reg, sel) \ +static inline void kvm_set_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + set_gc0_##name(val); \ +} \ +static inline void kvm_clear_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + clear_gc0_##name(val); \ +} \ +static inline void kvm_change_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + change_gc0_##name(mask, val); \ +} + +/* Generate VZ guest context save/restore to/from saved context */ +#define __BUILD_KVM_SAVE_VZ(name, _reg, sel) \ +static inline void kvm_restore_gc0_##name(struct mips_coproc *cop0) \ +{ \ + write_gc0_##name(cop0->reg[(_reg)][(sel)]); \ +} \ +static inline void kvm_save_gc0_##name(struct mips_coproc *cop0) \ +{ \ + cop0->reg[(_reg)][(sel)] = read_gc0_##name(); \ +} + +/* + * __BUILD_KVM_$ops_WRAP(): kvm_$op_$name1() -> kvm_$op_$name2() + * These wrap a set of operations to provide them with a different name. + */ + +/* Generate simple accessor wrapper */ +#define __BUILD_KVM_RW_WRAP(name1, name2, type) \ +static inline __KVMT##type kvm_read_##name1(struct mips_coproc *cop0) \ +{ \ + return kvm_read_##name2(cop0); \ +} \ +static inline void kvm_write_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + kvm_write_##name2(cop0, val); \ +} + +/* Generate bitwise modifier wrapper */ +#define __BUILD_KVM_SET_WRAP(name1, name2, type) \ +static inline void kvm_set_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + kvm_set_##name2(cop0, val); \ +} \ +static inline void kvm_clear_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + kvm_clear_##name2(cop0, val); \ +} \ +static inline void kvm_change_##name1(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + kvm_change_##name2(cop0, mask, val); \ +} + +/* + * __BUILD_KVM_$ops_SW(): kvm_$op_c0_guest_$reg() -> kvm_$op_sw_gc0_$reg() + * These generate accessors operating on the saved context in RAM, and wrap them + * with the common guest C0 accessors (for use by common emulation code). + */ + +#define __BUILD_KVM_RW_SW(name, type, _reg, sel) \ + __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_RW_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#define __BUILD_KVM_SET_SW(name, type, _reg, sel) \ + __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#define __BUILD_KVM_ATOMIC_SW(name, type, _reg, sel) \ + __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#ifndef CONFIG_KVM_MIPS_VZ + +/* + * T&E (trap & emulate software based virtualisation) + * We generate the common accessors operating exclusively on the saved context + * in RAM. 
+ */ + +#define __BUILD_KVM_RW_HW __BUILD_KVM_RW_SW +#define __BUILD_KVM_SET_HW __BUILD_KVM_SET_SW +#define __BUILD_KVM_ATOMIC_HW __BUILD_KVM_ATOMIC_SW + +#else + +/* + * VZ (hardware assisted virtualisation) + * These macros use the active guest state in VZ mode (hardware registers), + */ + +/* + * __BUILD_KVM_$ops_HW(): kvm_$op_c0_guest_$reg() -> kvm_$op_vz_gc0_$reg() + * These generate accessors operating on the VZ guest context in hardware, and + * wrap them with the common guest C0 accessors (for use by common emulation + * code). + * + * Accessors operating on the saved context in RAM are also generated to allow + * convenient explicit saving and restoring of the state. + */ + +#define __BUILD_KVM_RW_HW(name, type, _reg, sel) \ + __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_RW_VZ(name, type, _reg, sel) \ + __BUILD_KVM_RW_WRAP(c0_guest_##name, vz_gc0_##name, type) \ + __BUILD_KVM_SAVE_VZ(name, _reg, sel) + +#define __BUILD_KVM_SET_HW(name, type, _reg, sel) \ + __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_VZ(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, vz_gc0_##name, type) + +/* + * We can't do atomic modifications of COP0 state if hardware can modify it. + * Races must be handled explicitly. + */ +#define __BUILD_KVM_ATOMIC_HW __BUILD_KVM_SET_HW + +#endif + +/* + * Define accessors for CP0 registers that are accessible to the guest. These + * are primarily used by common emulation code, which may need to access the + * registers differently depending on the implementation. + * + * fns_hw/sw name type reg num select + */ +__BUILD_KVM_RW_HW(index, 32, MIPS_CP0_TLB_INDEX, 0) +__BUILD_KVM_RW_HW(entrylo0, l, MIPS_CP0_TLB_LO0, 0) +__BUILD_KVM_RW_HW(entrylo1, l, MIPS_CP0_TLB_LO1, 0) +__BUILD_KVM_RW_HW(context, l, MIPS_CP0_TLB_CONTEXT, 0) +__BUILD_KVM_RW_HW(userlocal, l, MIPS_CP0_TLB_CONTEXT, 2) +__BUILD_KVM_RW_HW(pagemask, l, MIPS_CP0_TLB_PG_MASK, 0) +__BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) +__BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) +__BUILD_KVM_RW_HW(hwrena, 32, MIPS_CP0_HWRENA, 0) +__BUILD_KVM_RW_HW(badvaddr, l, MIPS_CP0_BAD_VADDR, 0) +__BUILD_KVM_RW_SW(count, 32, MIPS_CP0_COUNT, 0) +__BUILD_KVM_RW_HW(entryhi, l, MIPS_CP0_TLB_HI, 0) +__BUILD_KVM_RW_HW(compare, 32, MIPS_CP0_COMPARE, 0) +__BUILD_KVM_RW_HW(status, 32, MIPS_CP0_STATUS, 0) +__BUILD_KVM_RW_HW(intctl, 32, MIPS_CP0_STATUS, 1) +__BUILD_KVM_RW_HW(cause, 32, MIPS_CP0_CAUSE, 0) +__BUILD_KVM_RW_HW(epc, l, MIPS_CP0_EXC_PC, 0) +__BUILD_KVM_RW_SW(prid, 32, MIPS_CP0_PRID, 0) +__BUILD_KVM_RW_HW(ebase, l, MIPS_CP0_PRID, 1) +__BUILD_KVM_RW_HW(config, 32, MIPS_CP0_CONFIG, 0) +__BUILD_KVM_RW_HW(config1, 32, MIPS_CP0_CONFIG, 1) +__BUILD_KVM_RW_HW(config2, 32, MIPS_CP0_CONFIG, 2) +__BUILD_KVM_RW_HW(config3, 32, MIPS_CP0_CONFIG, 3) +__BUILD_KVM_RW_HW(config4, 32, MIPS_CP0_CONFIG, 4) +__BUILD_KVM_RW_HW(config5, 32, MIPS_CP0_CONFIG, 5) +__BUILD_KVM_RW_HW(config6, 32, MIPS_CP0_CONFIG, 6) +__BUILD_KVM_RW_HW(config7, 32, MIPS_CP0_CONFIG, 7) +__BUILD_KVM_RW_HW(errorepc, l, MIPS_CP0_ERROR_PC, 0) +__BUILD_KVM_RW_HW(kscratch1, l, MIPS_CP0_DESAVE, 2) +__BUILD_KVM_RW_HW(kscratch2, l, MIPS_CP0_DESAVE, 3) +__BUILD_KVM_RW_HW(kscratch3, l, MIPS_CP0_DESAVE, 4) +__BUILD_KVM_RW_HW(kscratch4, l, MIPS_CP0_DESAVE, 5) +__BUILD_KVM_RW_HW(kscratch5, l, MIPS_CP0_DESAVE, 6) +__BUILD_KVM_RW_HW(kscratch6, l, MIPS_CP0_DESAVE, 7) + +/* Bitwise operations (on HW state) */ +__BUILD_KVM_SET_HW(status, 32, MIPS_CP0_STATUS, 0) +/* Cause can be modified asynchronously from hardirq hrtimer callback */ 
+__BUILD_KVM_ATOMIC_HW(cause, 32, MIPS_CP0_CAUSE, 0) +__BUILD_KVM_SET_HW(ebase, l, MIPS_CP0_PRID, 1) + /* Helpers */ static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index f09a161926e709..e6fce30eb44035 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -881,7 +881,7 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long pc = vcpu->arch.pc; - kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); + kvm_err("[%#lx] COP0_TLBR [%d]\n", pc, kvm_read_c0_guest_index(cop0)); return EMULATE_FAIL; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6e91c24162788e..ab689df283b39a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1112,7 +1112,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) kvm_debug("\tlo: 0x%08lx\n", vcpu->arch.lo); cop0 = vcpu->arch.cop0; - kvm_debug("\tStatus: 0x%08lx, Cause: 0x%08lx\n", + kvm_debug("\tStatus: 0x%08x, Cause: 0x%08x\n", kvm_read_c0_guest_status(cop0), kvm_read_c0_guest_cause(cop0)); @@ -1287,7 +1287,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case EXCCODE_TLBS: - kvm_debug("TLB ST fault: cause %#x, status %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_debug("TLB ST fault: cause %#x, status %#x, PC: %p, BadVaddr: %#lx\n", cause, kvm_read_c0_guest_status(vcpu->arch.cop0), opc, badvaddr); @@ -1358,7 +1358,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) opc += 1; inst = 0; kvm_get_badinstr(opc, vcpu, &inst); - kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", + kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index fda45b4bdebcb6..75ba3c4b7cd57f 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -55,7 +55,7 @@ static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu) opc += 1; kvm_get_badinstr(opc, vcpu, &inst); - kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", + kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); @@ -947,10 +947,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, if (v & CAUSEF_DC) { /* disable timer first */ kvm_mips_count_disable_cause(vcpu); - kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); + kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC, + v); } else { /* enable timer last */ - kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); + kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC, + v); kvm_mips_count_enable_cause(vcpu); } } else { From 1934a3ad091fb6b07a46b4a9f02d11a9a9f829b9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:26 +0000 Subject: [PATCH 032/227] KVM: MIPS/Entry: Update entry code to support VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update MIPS KVM entry code to support VZ: - We need to set GuestCtl0.GM while in guest mode. 
- For cores supporting GuestID, we need to set the root GuestID to match the main GuestID while in guest mode so that the root TLB refill handler writes the correct GuestID into the TLB. - For cores without GuestID where the root ASID dealiases RVA/GPA mappings, we need to load that ASID from the gpa_mm rather than the per-VCPU guest_kernel_mm or guest_user_mm, since the root TLB maps guest physical addresses. We also need to restore the normal process ASID on exit. - The normal linux process pgd needs restoring on exit, as we can't leave the GPA mappings active for kernel code. - GuestCtl0 needs saving on exit for the GExcCode field, as it may be clobbered if a preemption occurs. We also need to move the TLB refill handler to the XTLB vector at offset 0x80 on 64-bit VZ kernels, as hardware will use Root.Status.KX to determine whether a TLB refill or XTLB Refill exception is to be taken on a root TLB miss from guest mode, and KX needs to be set for kernel code to be able to access the 64-bit segments. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 5 ++ arch/mips/kvm/entry.c | 132 ++++++++++++++++++++++++++++--- arch/mips/kvm/mips.c | 4 +- 3 files changed, 131 insertions(+), 10 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index cd7488641db8d3..c52279d89ae47a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -287,13 +287,18 @@ struct kvm_mmu_memory_cache { struct kvm_vcpu_arch { void *guest_ebase; int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + + /* Host registers preserved across guest mode execution */ unsigned long host_stack; unsigned long host_gp; + unsigned long host_pgd; + unsigned long host_entryhi; /* Host CP0 registers used when handling exits from guest */ unsigned long host_cp0_badvaddr; unsigned long host_cp0_epc; u32 host_cp0_cause; + u32 host_cp0_guestctl0; u32 host_cp0_badinstr; u32 host_cp0_badinstrp; diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index c5b254c4d0da89..16e1c93b484f43 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -51,12 +51,15 @@ #define RA 31 /* Some CP0 registers */ +#define C0_PWBASE 5, 5 #define C0_HWRENA 7, 0 #define C0_BADVADDR 8, 0 #define C0_BADINSTR 8, 1 #define C0_BADINSTRP 8, 2 #define C0_ENTRYHI 10, 0 +#define C0_GUESTCTL1 10, 4 #define C0_STATUS 12, 0 +#define C0_GUESTCTL0 12, 6 #define C0_CAUSE 13, 0 #define C0_EPC 14, 0 #define C0_EBASE 15, 1 @@ -292,8 +295,8 @@ static void *kvm_mips_build_enter_guest(void *addr) unsigned int i; struct uasm_label labels[2]; struct uasm_reloc relocs[2]; - struct uasm_label *l = labels; - struct uasm_reloc *r = relocs; + struct uasm_label __maybe_unused *l = labels; + struct uasm_reloc __maybe_unused *r = relocs; memset(labels, 0, sizeof(labels)); memset(relocs, 0, sizeof(relocs)); @@ -302,7 +305,67 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); UASM_i_MTC0(&p, T0, C0_EPC); - /* Set the ASID for the Guest Kernel */ +#ifdef CONFIG_KVM_MIPS_VZ + /* Save normal linux process pgd (VZ guarantees pgd_reg is set) */ + UASM_i_MFC0(&p, K0, c0_kscratch(), pgd_reg); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_pgd), K1); + + /* + * Set up KVM GPA pgd. 
+ * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - write mm->pgd into CP0_PWBase + * + * We keep S0 pointing at struct kvm so we can load the ASID below. + */ + UASM_i_LW(&p, S0, (int)offsetof(struct kvm_vcpu, kvm) - + (int)offsetof(struct kvm_vcpu, arch), K1); + UASM_i_LW(&p, A0, offsetof(struct kvm, arch.gpa_mm.pgd), S0); + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + /* delay slot */ + if (cpu_has_htw) + UASM_i_MTC0(&p, A0, C0_PWBASE); + else + uasm_i_nop(&p); + + /* Set GM bit to setup eret to VZ guest context */ + uasm_i_addiu(&p, V1, ZERO, 1); + uasm_i_mfc0(&p, K0, C0_GUESTCTL0); + uasm_i_ins(&p, K0, V1, MIPS_GCTL0_GM_SHIFT, 1); + uasm_i_mtc0(&p, K0, C0_GUESTCTL0); + + if (cpu_has_guestid) { + /* + * Set root mode GuestID, so that root TLB refill handler can + * use the correct GuestID in the root TLB. + */ + + /* Get current GuestID */ + uasm_i_mfc0(&p, T0, C0_GUESTCTL1); + /* Set GuestCtl1.RID = GuestCtl1.ID */ + uasm_i_ext(&p, T1, T0, MIPS_GCTL1_ID_SHIFT, + MIPS_GCTL1_ID_WIDTH); + uasm_i_ins(&p, T0, T1, MIPS_GCTL1_RID_SHIFT, + MIPS_GCTL1_RID_WIDTH); + uasm_i_mtc0(&p, T0, C0_GUESTCTL1); + + /* GuestID handles dealiasing so we don't need to touch ASID */ + goto skip_asid_restore; + } + + /* Root ASID Dealias (RAD) */ + + /* Save host ASID */ + UASM_i_MFC0(&p, K0, C0_ENTRYHI); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi), + K1); + + /* Set the root ASID for the Guest */ + UASM_i_ADDIU(&p, T1, S0, + offsetof(struct kvm, arch.gpa_mm.context.asid)); +#else + /* Set the ASID for the Guest Kernel or User */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), T0); @@ -315,6 +378,7 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, guest_user_mm.context.asid)); uasm_l_kernel_asid(&l, p); +#endif /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ @@ -339,6 +403,7 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); #endif +#ifndef CONFIG_KVM_MIPS_VZ /* * Set up KVM T&E GVA pgd. 
* This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): @@ -351,7 +416,11 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); uasm_i_jalr(&p, RA, T9); uasm_i_mtc0(&p, K0, C0_ENTRYHI); - +#else + /* Set up KVM VZ root ASID (!guestid) */ + uasm_i_mtc0(&p, K0, C0_ENTRYHI); +skip_asid_restore: +#endif uasm_i_ehb(&p); /* Disable RDHWR access */ @@ -559,13 +628,10 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); - uasm_i_move(&p, S1, A1); + UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); /* Restore run (vcpu->run) */ - UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); - /* Save pointer to run in s0, will be saved by the compiler */ - uasm_i_move(&p, S0, A0); + UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1); /* * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process @@ -641,6 +707,52 @@ void *kvm_mips_build_exit(void *addr) uasm_l_msa_1(&l, p); } +#ifdef CONFIG_KVM_MIPS_VZ + /* Restore host ASID */ + if (!cpu_has_guestid) { + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi), + K1); + UASM_i_MTC0(&p, K0, C0_ENTRYHI); + } + + /* + * Set up normal Linux process pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - write mm->pgd into CP0_PWBase + */ + UASM_i_LW(&p, A0, + offsetof(struct kvm_vcpu_arch, host_pgd), K1); + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + /* delay slot */ + if (cpu_has_htw) + UASM_i_MTC0(&p, A0, C0_PWBASE); + else + uasm_i_nop(&p); + + /* Clear GM bit so we don't enter guest mode when EXL is cleared */ + uasm_i_mfc0(&p, K0, C0_GUESTCTL0); + uasm_i_ins(&p, K0, ZERO, MIPS_GCTL0_GM_SHIFT, 1); + uasm_i_mtc0(&p, K0, C0_GUESTCTL0); + + /* Save GuestCtl0 so we can access GExcCode after CPU migration */ + uasm_i_sw(&p, K0, + offsetof(struct kvm_vcpu_arch, host_cp0_guestctl0), K1); + + if (cpu_has_guestid) { + /* + * Clear root mode GuestID, so that root TLB operations use the + * root GuestID in the root TLB. + */ + uasm_i_mfc0(&p, T0, C0_GUESTCTL1); + /* Set GuestCtl1.RID = MIPS_GCTL1_ROOT_GUESTID (i.e. 
0) */ + uasm_i_ins(&p, T0, ZERO, MIPS_GCTL1_RID_SHIFT, + MIPS_GCTL1_RID_WIDTH); + uasm_i_mtc0(&p, T0, C0_GUESTCTL1); + } +#endif + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); uasm_i_and(&p, V0, V0, AT); @@ -680,6 +792,8 @@ void *kvm_mips_build_exit(void *addr) * Now jump to the kvm_mips_handle_exit() to see if we can deal * with this in the kernel */ + uasm_i_move(&p, A0, S0); + uasm_i_move(&p, A1, S1); UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ab689df283b39a..1fa088854cb66e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -324,8 +324,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Build guest exception vectors dynamically in unmapped memory */ handler = gebase + 0x2000; - /* TLB refill */ + /* TLB refill (or XTLB refill on 64-bit VZ where KX=1) */ refill_start = gebase; + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && IS_ENABLED(CONFIG_64BIT)) + refill_start += 0x080; refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler); /* General Exception Entry point */ From 372582a6c6fcced38219d06545dd26ad7904bc6f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:27 +0000 Subject: [PATCH 033/227] KVM: MIPS/TLB: Add VZ TLB management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add functions for MIPS VZ TLB management to tlb.c. kvm_vz_host_tlb_inv() will be used for invalidating root TLB entries after GPA page tables have been modified due to a KVM page fault. It arranges for a root GPA mapping to be flushed from the TLB, using the gpa_mm ASID or the current GuestID to do the probe. kvm_vz_local_flush_roottlb_all_guests() and kvm_vz_local_flush_guesttlb_all() flush all TLB entries in the corresponding TLB for guest mappings (GPA->RPA for root TLB with GuestID, and all entries for guest TLB). They will be used when starting a new GuestID cycle, when VZ hardware is enabled/disabled, and also when switching to a guest when the guest TLB contents may be stale or belong to a different VM. kvm_vz_guest_tlb_lookup() converts a guest virtual address to a guest physical address using the guest TLB. This will be used to decode guest virtual addresses which are sometimes provided by VZ hardware in CP0_BadVAddr for certain exceptions when the guest physical address is unavailable. kvm_vz_save_guesttlb() and kvm_vz_load_guesttlb() will be used to preserve wired guest VTLB entries while a guest isn't running. 
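As a hypothetical illustration of the intended calling convention (not part of this patch; the real callers arrive with the vz.c implementation, and the wired_tlb/wired_tlb_used bookkeeping used here is an assumption borrowed from a later patch in the series), preserving wired guest VTLB entries while the VCPU is not running might look roughly like:

	/* Sketch only: save/restore the wired guest VTLB entries */
	static void example_save_wired(struct kvm_vcpu *vcpu)
	{
		unsigned long flags;

		/* kvm_vz_save_guesttlb() requires interrupts to be disabled */
		local_irq_save(flags);
		kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0,
				     vcpu->arch.wired_tlb_used);
		local_irq_restore(flags);
	}

	static void example_restore_wired(struct kvm_vcpu *vcpu)
	{
		unsigned long flags;

		local_irq_save(flags);
		kvm_vz_load_guesttlb(vcpu->arch.wired_tlb, 0,
				     vcpu->arch.wired_tlb_used);
		local_irq_restore(flags);
	}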
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/cpu-info.h | 1 + arch/mips/include/asm/kvm_host.h | 12 + arch/mips/kvm/tlb.c | 404 +++++++++++++++++++++++++++++++ 3 files changed, 417 insertions(+) diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h index edbe2734a1bf07..4113796e0ef476 100644 --- a/arch/mips/include/asm/cpu-info.h +++ b/arch/mips/include/asm/cpu-info.h @@ -33,6 +33,7 @@ struct guest_info { unsigned long ases_dyn; unsigned long long options; unsigned long long options_dyn; + int tlbsize; u8 conf; u8 kscratch_mask; }; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index c52279d89ae47a..b47571850aacfb 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -812,6 +812,18 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); +int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa); +void kvm_vz_local_flush_roottlb_all_guests(void); +void kvm_vz_local_flush_guesttlb_all(void); +void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count); +void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count); +#endif + void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 2819eb793345ab..a28fcb1e507261 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -33,6 +33,18 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 +#ifdef CONFIG_KVM_MIPS_VZ +static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu) +{ + struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm; + + if (cpu_has_guestid) + return 0; + else + return cpu_asid(smp_processor_id(), gpa_mm); +} +#endif + static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -179,6 +191,398 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); +#ifdef CONFIG_KVM_MIPS_VZ + +/* GuestID management */ + +/** + * clear_root_gid() - Set GuestCtl1.RID for normal root operation. + */ +static inline void clear_root_gid(void) +{ + if (cpu_has_guestid) { + clear_c0_guestctl1(MIPS_GCTL1_RID); + mtc0_tlbw_hazard(); + } +} + +/** + * set_root_gid_to_guest_gid() - Set GuestCtl1.RID to match GuestCtl1.ID. + * + * Sets the root GuestID to match the current guest GuestID, for TLB operation + * on the GPA->RPA mappings in the root TLB. + * + * The caller must be sure to disable HTW while the root GID is set, and + * possibly longer if TLB registers are modified. 
+ */ +static inline void set_root_gid_to_guest_gid(void) +{ + unsigned int guestctl1; + + if (cpu_has_guestid) { + back_to_back_c0_hazard(); + guestctl1 = read_c0_guestctl1(); + guestctl1 = (guestctl1 & ~MIPS_GCTL1_RID) | + ((guestctl1 & MIPS_GCTL1_ID) >> MIPS_GCTL1_ID_SHIFT) + << MIPS_GCTL1_RID_SHIFT; + write_c0_guestctl1(guestctl1); + mtc0_tlbw_hazard(); + } +} + +int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) +{ + int idx; + unsigned long flags, old_entryhi; + + local_irq_save(flags); + htw_stop(); + + /* Set root GuestID for root probe and write of guest TLB entry */ + set_root_gid_to_guest_gid(); + + old_entryhi = read_c0_entryhi(); + + idx = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_root_asid(vcpu)); + + write_c0_entryhi(old_entryhi); + clear_root_gid(); + mtc0_tlbw_hazard(); + + htw_start(); + local_irq_restore(flags); + + if (idx > 0) + kvm_debug("%s: Invalidated root entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_root_asid(vcpu), idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vz_host_tlb_inv); + +/** + * kvm_vz_guest_tlb_lookup() - Lookup a guest VZ TLB mapping. + * @vcpu: KVM VCPU pointer. + * @gpa: Guest virtual address in a TLB mapped guest segment. + * @gpa: Ponter to output guest physical address it maps to. + * + * Converts a guest virtual address in a guest TLB mapped segment to a guest + * physical address, by probing the guest TLB. + * + * Returns: 0 if guest TLB mapping exists for @gva. *@gpa will have been + * written. + * -EFAULT if no guest TLB mapping exists for @gva. *@gpa may not + * have been written. + */ +int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa) +{ + unsigned long o_entryhi, o_entrylo[2], o_pagemask; + unsigned int o_index; + unsigned long entrylo[2], pagemask, pagemaskbit, pa; + unsigned long flags; + int index; + + /* Probe the guest TLB for a mapping */ + local_irq_save(flags); + /* Set root GuestID for root probe of guest TLB entry */ + htw_stop(); + set_root_gid_to_guest_gid(); + + o_entryhi = read_gc0_entryhi(); + o_index = read_gc0_index(); + + write_gc0_entryhi((o_entryhi & 0x3ff) | (gva & ~0xfffl)); + mtc0_tlbw_hazard(); + guest_tlb_probe(); + tlb_probe_hazard(); + + index = read_gc0_index(); + if (index < 0) { + /* No match, fail */ + write_gc0_entryhi(o_entryhi); + write_gc0_index(o_index); + + clear_root_gid(); + htw_start(); + local_irq_restore(flags); + return -EFAULT; + } + + /* Match! read the TLB entry */ + o_entrylo[0] = read_gc0_entrylo0(); + o_entrylo[1] = read_gc0_entrylo1(); + o_pagemask = read_gc0_pagemask(); + + mtc0_tlbr_hazard(); + guest_tlb_read(); + tlb_read_hazard(); + + entrylo[0] = read_gc0_entrylo0(); + entrylo[1] = read_gc0_entrylo1(); + pagemask = ~read_gc0_pagemask() & ~0x1fffl; + + write_gc0_entryhi(o_entryhi); + write_gc0_index(o_index); + write_gc0_entrylo0(o_entrylo[0]); + write_gc0_entrylo1(o_entrylo[1]); + write_gc0_pagemask(o_pagemask); + + clear_root_gid(); + htw_start(); + local_irq_restore(flags); + + /* Select one of the EntryLo values and interpret the GPA */ + pagemaskbit = (pagemask ^ (pagemask & (pagemask - 1))) >> 1; + pa = entrylo[!!(gva & pagemaskbit)]; + + /* + * TLB entry may have become invalid since TLB probe if physical FTLB + * entries are shared between threads (e.g. I6400). + */ + if (!(pa & ENTRYLO_V)) + return -EFAULT; + + /* + * Note, this doesn't take guest MIPS32 XPA into account, where PFN is + * split with XI/RI in the middle. 
+ */ + pa = (pa << 6) & ~0xfffl; + pa |= gva & ~(pagemask | pagemaskbit); + + *gpa = pa; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vz_guest_tlb_lookup); + +/** + * kvm_vz_local_flush_roottlb_all_guests() - Flush all root TLB entries for + * guests. + * + * Invalidate all entries in root tlb which are GPA mappings. + */ +void kvm_vz_local_flush_roottlb_all_guests(void) +{ + unsigned long flags; + unsigned long old_entryhi, old_pagemask, old_guestctl1; + int entry; + + if (WARN_ON(!cpu_has_guestid)) + return; + + local_irq_save(flags); + htw_stop(); + + /* TLBR may clobber EntryHi.ASID, PageMask, and GuestCtl1.RID */ + old_entryhi = read_c0_entryhi(); + old_pagemask = read_c0_pagemask(); + old_guestctl1 = read_c0_guestctl1(); + + /* + * Invalidate guest entries in root TLB while leaving root entries + * intact when possible. + */ + for (entry = 0; entry < current_cpu_data.tlbsize; entry++) { + write_c0_index(entry); + mtc0_tlbw_hazard(); + tlb_read(); + tlb_read_hazard(); + + /* Don't invalidate non-guest (RVA) mappings in the root TLB */ + if (!(read_c0_guestctl1() & MIPS_GCTL1_RID)) + continue; + + /* Make sure all entries differ. */ + write_c0_entryhi(UNIQUE_ENTRYHI(entry)); + write_c0_entrylo0(0); + write_c0_entrylo1(0); + write_c0_guestctl1(0); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + } + + write_c0_entryhi(old_entryhi); + write_c0_pagemask(old_pagemask); + write_c0_guestctl1(old_guestctl1); + tlbw_use_hazard(); + + htw_start(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(kvm_vz_local_flush_roottlb_all_guests); + +/** + * kvm_vz_local_flush_guesttlb_all() - Flush all guest TLB entries. + * + * Invalidate all entries in guest tlb irrespective of guestid. + */ +void kvm_vz_local_flush_guesttlb_all(void) +{ + unsigned long flags; + unsigned long old_index; + unsigned long old_entryhi; + unsigned long old_entrylo[2]; + unsigned long old_pagemask; + int entry; + + local_irq_save(flags); + + /* Preserve all clobbered guest registers */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo[0] = read_gc0_entrylo0(); + old_entrylo[1] = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + /* Invalidate guest entries in guest TLB */ + write_gc0_entrylo0(0); + write_gc0_entrylo1(0); + write_gc0_pagemask(0); + for (entry = 0; entry < current_cpu_data.guest.tlbsize; entry++) { + /* Make sure all entries differ. */ + write_gc0_index(entry); + write_gc0_entryhi(UNIQUE_GUEST_ENTRYHI(entry)); + mtc0_tlbw_hazard(); + guest_tlb_write_indexed(); + } + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo[0]); + write_gc0_entrylo1(old_entrylo[1]); + write_gc0_pagemask(old_pagemask); + tlbw_use_hazard(); + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(kvm_vz_local_flush_guesttlb_all); + +/** + * kvm_vz_save_guesttlb() - Save a range of guest TLB entries. + * @buf: Buffer to write TLB entries into. + * @index: Start index. + * @count: Number of entries to save. + * + * Save a range of guest TLB entries. The caller must ensure interrupts are + * disabled. 
+ */ +void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count) +{ + unsigned int end = index + count; + unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask; + unsigned int guestctl1 = 0; + int old_index, i; + + /* Save registers we're about to clobber */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo0 = read_gc0_entrylo0(); + old_entrylo1 = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + /* Set root GuestID for root probe */ + htw_stop(); + set_root_gid_to_guest_gid(); + if (cpu_has_guestid) + guestctl1 = read_c0_guestctl1(); + + /* Read each entry from guest TLB */ + for (i = index; i < end; ++i, ++buf) { + write_gc0_index(i); + + mtc0_tlbr_hazard(); + guest_tlb_read(); + tlb_read_hazard(); + + if (cpu_has_guestid && + (read_c0_guestctl1() ^ guestctl1) & MIPS_GCTL1_RID) { + /* Entry invalid or belongs to another guest */ + buf->tlb_hi = UNIQUE_GUEST_ENTRYHI(i); + buf->tlb_lo[0] = 0; + buf->tlb_lo[1] = 0; + buf->tlb_mask = 0; + } else { + /* Entry belongs to the right guest */ + buf->tlb_hi = read_gc0_entryhi(); + buf->tlb_lo[0] = read_gc0_entrylo0(); + buf->tlb_lo[1] = read_gc0_entrylo1(); + buf->tlb_mask = read_gc0_pagemask(); + } + } + + /* Clear root GuestID again */ + clear_root_gid(); + htw_start(); + + /* Restore clobbered registers */ + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo0); + write_gc0_entrylo1(old_entrylo1); + write_gc0_pagemask(old_pagemask); + + tlbw_use_hazard(); +} +EXPORT_SYMBOL_GPL(kvm_vz_save_guesttlb); + +/** + * kvm_vz_load_guesttlb() - Save a range of guest TLB entries. + * @buf: Buffer to read TLB entries from. + * @index: Start index. + * @count: Number of entries to load. + * + * Load a range of guest TLB entries. The caller must ensure interrupts are + * disabled. + */ +void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count) +{ + unsigned int end = index + count; + unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask; + int old_index, i; + + /* Save registers we're about to clobber */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo0 = read_gc0_entrylo0(); + old_entrylo1 = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + /* Set root GuestID for root probe */ + htw_stop(); + set_root_gid_to_guest_gid(); + + /* Write each entry to guest TLB */ + for (i = index; i < end; ++i, ++buf) { + write_gc0_index(i); + write_gc0_entryhi(buf->tlb_hi); + write_gc0_entrylo0(buf->tlb_lo[0]); + write_gc0_entrylo1(buf->tlb_lo[1]); + write_gc0_pagemask(buf->tlb_mask); + + mtc0_tlbw_hazard(); + guest_tlb_write_indexed(); + } + + /* Clear root GuestID again */ + clear_root_gid(); + htw_start(); + + /* Restore clobbered registers */ + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo0); + write_gc0_entrylo1(old_entrylo1); + write_gc0_pagemask(old_pagemask); + + tlbw_use_hazard(); +} +EXPORT_SYMBOL_GPL(kvm_vz_load_guesttlb); + +#endif + /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. From 5dee99b24eeffbe673a71237faf551978cd311d3 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:28 +0000 Subject: [PATCH 034/227] KVM: MIPS/Emulate: Update CP0_Compare emulation for VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update emulation of guest writes to CP0_Compare for VZ. 
There are two main differences compared to trap & emulate: - Writing to CP0_Compare in the VZ hardware guest context acks any pending timer, clearing CP0_Cause.TI. If we don't want an ack to take place we must carefully restore the TI bit if it was previously set. - Even with guest timer access disabled in CP0_GuestCtl0.GT, if the guest CP0_Count reaches the guest CP0_Compare the timer interrupt will assert. To prevent this we must set CP0_GTOffset to move the guest CP0_Count out of the way of the new guest CP0_Compare, either before or after depending on whether it is a forwards or backwards change. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 43 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index e6fce30eb44035..42424822898c45 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -621,7 +621,9 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) struct mips_coproc *cop0 = vcpu->arch.cop0; int dc; u32 old_compare = kvm_read_c0_guest_compare(cop0); - ktime_t now; + s32 delta = compare - old_compare; + u32 cause; + ktime_t now = ktime_set(0, 0); /* silence bogus GCC warning */ u32 count; /* if unchanged, must just be an ack */ @@ -633,6 +635,21 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) return; } + /* + * If guest CP0_Compare moves forward, CP0_GTOffset should be adjusted + * too to prevent guest CP0_Count hitting guest CP0_Compare. + * + * The new GTOffset corresponds to the new value of CP0_Compare, and is + * set prior to it being written into the guest context. We disable + * preemption until the new value is written to prevent restore of a + * GTOffset corresponding to the old CP0_Compare value. + */ + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta > 0) { + preempt_disable(); + write_c0_gtoffset(compare - read_c0_count()); + back_to_back_c0_hazard(); + } + /* freeze_hrtimer() takes care of timer interrupts <= count */ dc = kvm_mips_count_disabled(vcpu); if (!dc) @@ -640,12 +657,36 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) if (ack) kvm_mips_callbacks->dequeue_timer_int(vcpu); + else if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + /* + * With VZ, writing CP0_Compare acks (clears) CP0_Cause.TI, so + * preserve guest CP0_Cause.TI if we don't want to ack it. + */ + cause = kvm_read_c0_guest_cause(cop0); kvm_write_c0_guest_compare(cop0, compare); + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + if (delta > 0) + preempt_enable(); + + back_to_back_c0_hazard(); + + if (!ack && cause & CAUSEF_TI) + kvm_write_c0_guest_cause(cop0, cause); + } + /* resume_hrtimer() takes care of timer interrupts > count */ if (!dc) kvm_mips_resume_hrtimer(vcpu, now, count); + + /* + * If guest CP0_Compare is moving backward, we delay CP0_GTOffset change + * until after the new CP0_Compare is written, otherwise new guest + * CP0_Count could hit new guest CP0_Compare. 
+ */ + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta <= 0) + write_c0_gtoffset(compare - read_c0_count()); } /** From 60c7aa3320dffba4929ef3f46a2daeb5feea6d07 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:29 +0000 Subject: [PATCH 035/227] KVM: MIPS/Emulate: Drop CACHE emulation for VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ifdef out the trap & emulate CACHE instruction emulation functions for VZ. We will provide separate CACHE instruction emulation in vz.c, and we need to avoid linker errors due to the use of T&E specific MMU helpers. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 42424822898c45..bb0449296cd621 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1665,6 +1665,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, return EMULATE_DO_MMIO; } +#ifndef CONFIG_KVM_MIPS_VZ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), unsigned long curr_pc, unsigned long addr, @@ -1872,6 +1873,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, return er; } +#endif /* CONFIG_KVM_MIPS_VZ */ /** * kvm_mips_guest_exception_base() - Find guest exception vector base address. From ea1bdbf683e6a12bd60204d245053bca0b52185c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:30 +0000 Subject: [PATCH 036/227] KVM: MIPS: Update exit handler for VZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The general guest exit handler needs a few tweaks for VZ compared to trap & emulate, which for now are made directly depending on CONFIG_KVM_MIPS_VZ: - There is no need to re-enable the hardware page table walker (HTW), as it can be left enabled during guest mode operation with VZ. - There is no need to perform a privilege check, as any guest privilege violations should have already been detected by the hardware and triggered the appropriate guest exception. 
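A note on the mechanics (illustration only, not part of the patch): the checks below use IS_ENABLED(CONFIG_KVM_MIPS_VZ) rather than #ifdef, so the trap & emulate paths remain visible to the compiler and keep being type-checked in VZ builds, while the optimizer discards them as dead code. The shape of each guard is simply:

	/* Illustration only: compile-time constant condition, folded away on VZ */
	if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ))
		htw_start();	/* only trap & emulate needs to re-enable the HTW */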
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1fa088854cb66e..8c10148a345c5e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1230,7 +1230,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; /* re-enable HTW before enabling interrupts */ - htw_start(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + htw_start(); /* Set a default exit reason */ run->exit_reason = KVM_EXIT_UNKNOWN; @@ -1248,17 +1249,20 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) cause, opc, run, vcpu); trace_kvm_exit(vcpu, exccode); - /* - * Do a privilege check, if in UM most of these exit conditions end up - * causing an exception to be delivered to the Guest Kernel - */ - er = kvm_mips_check_privilege(cause, opc, run, vcpu); - if (er == EMULATE_PRIV_FAIL) { - goto skip_emul; - } else if (er == EMULATE_FAIL) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - goto skip_emul; + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + /* + * Do a privilege check, if in UM most of these exit conditions + * end up causing an exception to be delivered to the Guest + * Kernel + */ + er = kvm_mips_check_privilege(cause, opc, run, vcpu); + if (er == EMULATE_PRIV_FAIL) { + goto skip_emul; + } else if (er == EMULATE_FAIL) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + goto skip_emul; + } } switch (exccode) { @@ -1418,7 +1422,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) } /* Disable HTW before returning to guest or host */ - htw_stop(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + htw_stop(); return ret; } From c992a4f6a9b0a37c8bd7dfc727ecc3fed125c16b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:31 +0000 Subject: [PATCH 037/227] KVM: MIPS: Implement VZ support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the main support for the MIPS Virtualization ASE (A.K.A. VZ) to MIPS KVM. The bulk of this work is in vz.c, with various new state and definitions elsewhere. Enough is implemented to be able to run on a minimal VZ core. Further patches will fill out support for guest features which are optional or can be disabled. 
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Steven Rostedt Cc: Ingo Molnar Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 2 + arch/mips/include/asm/cpu-info.h | 1 + arch/mips/include/asm/kvm_host.h | 42 + arch/mips/kernel/time.c | 1 + arch/mips/kvm/interrupt.h | 5 + arch/mips/kvm/mips.c | 5 + arch/mips/kvm/mmu.c | 20 + arch/mips/kvm/tlb.c | 7 + arch/mips/kvm/trace.h | 15 + arch/mips/kvm/vz.c | 2381 +++++++++++++++++++++++++++++ 10 files changed, 2479 insertions(+) create mode 100644 arch/mips/kvm/vz.c diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 1b8486c094b431..d2827864827fca 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2075,6 +2075,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 + MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 MIPS | KVM_REG_MIPS_CP0_WIRED | 32 MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 @@ -2094,6 +2095,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 + MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h index 4113796e0ef476..be3b4c25f3359a 100644 --- a/arch/mips/include/asm/cpu-info.h +++ b/arch/mips/include/asm/cpu-info.h @@ -110,6 +110,7 @@ struct cpuinfo_mips { struct guest_info guest; unsigned int gtoffset_mask; unsigned int guestid_mask; + unsigned int guestid_cache; } __attribute__((aligned(SMP_CACHE_BYTES))); extern struct cpuinfo_mips cpu_data[]; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b47571850aacfb..bc3f9dedaac802 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -10,6 +10,7 @@ #ifndef __MIPS_KVM_HOST_H__ #define __MIPS_KVM_HOST_H__ +#include #include #include #include @@ -73,6 +74,11 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 +#ifdef CONFIG_KVM_MIPS_VZ +extern unsigned long GUESTID_MASK; +extern unsigned long GUESTID_FIRST_VERSION; +extern unsigned long GUESTID_VERSION_MASK; +#endif /* @@ -167,6 +173,8 @@ struct kvm_arch_memory_slot { struct kvm_arch { /* Guest physical mm */ struct mm_struct gpa_mm; + /* Mask of CPUs needing GPA ASID flush */ + cpumask_t asid_flush_mask; }; #define N_MIPS_COPROC_REGS 32 @@ -224,6 +232,11 @@ struct mips_coproc { #define MIPS_CP0_CONFIG4_SEL 4 #define MIPS_CP0_CONFIG5_SEL 5 +#define MIPS_CP0_GUESTCTL2 10 +#define MIPS_CP0_GUESTCTL2_SEL 5 +#define MIPS_CP0_GTOFFSET 12 +#define MIPS_CP0_GTOFFSET_SEL 7 + /* Resume Flags */ #define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? 
*/ @@ -356,7 +369,20 @@ struct kvm_vcpu_arch { /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; +#ifdef CONFIG_KVM_MIPS_VZ + /* vcpu's vzguestid is different on each host cpu in an smp system */ + u32 vzguestid[NR_CPUS]; + + /* wired guest TLB entries */ + struct kvm_mips_tlb *wired_tlb; + unsigned int wired_tlb_limit; + unsigned int wired_tlb_used; +#endif + + /* Last CPU the VCPU state was loaded on */ int last_sched_cpu; + /* Last CPU the VCPU actually executed guest code on */ + int last_exec_cpu; /* WAIT executed */ int wait; @@ -660,6 +686,7 @@ __BUILD_KVM_RW_HW(config4, 32, MIPS_CP0_CONFIG, 4) __BUILD_KVM_RW_HW(config5, 32, MIPS_CP0_CONFIG, 5) __BUILD_KVM_RW_HW(config6, 32, MIPS_CP0_CONFIG, 6) __BUILD_KVM_RW_HW(config7, 32, MIPS_CP0_CONFIG, 7) +__BUILD_KVM_RW_HW(xcontext, l, MIPS_CP0_TLB_XCONTEXT, 0) __BUILD_KVM_RW_HW(errorepc, l, MIPS_CP0_ERROR_PC, 0) __BUILD_KVM_RW_HW(kscratch1, l, MIPS_CP0_DESAVE, 2) __BUILD_KVM_RW_HW(kscratch2, l, MIPS_CP0_DESAVE, 3) @@ -674,6 +701,14 @@ __BUILD_KVM_SET_HW(status, 32, MIPS_CP0_STATUS, 0) __BUILD_KVM_ATOMIC_HW(cause, 32, MIPS_CP0_CAUSE, 0) __BUILD_KVM_SET_HW(ebase, l, MIPS_CP0_PRID, 1) +/* Bitwise operations (on saved state) */ +__BUILD_KVM_SET_SAVED(config, 32, MIPS_CP0_CONFIG, 0) +__BUILD_KVM_SET_SAVED(config1, 32, MIPS_CP0_CONFIG, 1) +__BUILD_KVM_SET_SAVED(config2, 32, MIPS_CP0_CONFIG, 2) +__BUILD_KVM_SET_SAVED(config3, 32, MIPS_CP0_CONFIG, 3) +__BUILD_KVM_SET_SAVED(config4, 32, MIPS_CP0_CONFIG, 4) +__BUILD_KVM_SET_SAVED(config5, 32, MIPS_CP0_CONFIG, 5) + /* Helpers */ static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu) @@ -786,6 +821,10 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu, bool write_fault); +#endif extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, struct kvm_vcpu *vcpu, bool write_fault); @@ -1026,6 +1065,9 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, struct kvm_run *run, struct kvm_vcpu *vcpu); +/* COP0 */ +enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu); + unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index a7f81261c781f0..c036157fb891ff 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq); */ unsigned int mips_hpt_frequency; +EXPORT_SYMBOL_GPL(mips_hpt_frequency); /* * This function exists in order to cause an error due to a duplicate diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index fb118a2c8379f8..3bf0a49725e81a 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -30,8 +30,13 @@ #define C_TI (_ULCAST_(1) << 30) +#ifdef CONFIG_KVM_MIPS_VZ +#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (1) +#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (1) +#else #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0) +#endif void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority); void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8c10148a345c5e..1bb5d01d5dec05 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -113,7 
+113,11 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { switch (type) { +#ifdef CONFIG_KVM_MIPS_VZ + case KVM_VM_MIPS_VZ: +#else case KVM_VM_MIPS_TE: +#endif break; default: /* Unsupported KVM type */ @@ -378,6 +382,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Init */ vcpu->arch.last_sched_cpu = -1; + vcpu->arch.last_exec_cpu = -1; return vcpu; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index cb0faade311e12..ee64db03279336 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -992,6 +992,22 @@ static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo) return kvm_mips_gpa_pte_to_gva_unmapped(pte); } +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu, + bool write_fault) +{ + int ret; + + ret = kvm_mips_map_page(vcpu, badvaddr, write_fault, NULL, NULL); + if (ret) + return ret; + + /* Invalidate this entry in the TLB */ + return kvm_vz_host_tlb_inv(vcpu, badvaddr); +} +#endif + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, @@ -1225,6 +1241,10 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; + if (WARN(IS_ENABLED(CONFIG_KVM_MIPS_VZ), + "Expect BadInstr/BadInstrP registers to be used with VZ\n")) + return -EINVAL; + retry: kvm_trap_emul_gva_lockless_begin(vcpu); err = get_user(*out, opc); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index a28fcb1e507261..c215470fdcb0c9 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -34,6 +34,13 @@ #define KVM_GUEST_SP_TLB 1 #ifdef CONFIG_KVM_MIPS_VZ +unsigned long GUESTID_MASK; +EXPORT_SYMBOL_GPL(GUESTID_MASK); +unsigned long GUESTID_FIRST_VERSION; +EXPORT_SYMBOL_GPL(GUESTID_FIRST_VERSION); +unsigned long GUESTID_VERSION_MASK; +EXPORT_SYMBOL_GPL(GUESTID_VERSION_MASK); + static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu) { struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm; diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index 0c59282a2f7d4b..d80d37a1b82eb8 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -286,6 +286,21 @@ TRACE_EVENT(kvm_asid_change, __entry->new_asid) ); +TRACE_EVENT(kvm_guestid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int guestid), + TP_ARGS(vcpu, guestid), + TP_STRUCT__entry( + __field(unsigned int, guestid) + ), + + TP_fast_assign( + __entry->guestid = guestid; + ), + + TP_printk("GuestID: 0x%02x", + __entry->guestid) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c new file mode 100644 index 00000000000000..cfed234be1e384 --- /dev/null +++ b/arch/mips/kvm/vz.c @@ -0,0 +1,2381 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: Support for hardware virtualization extensions + * + * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. 
+ * Authors: Yann Le Du + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "interrupt.h" + +#include "trace.h" + +/* Pointers to last VCPU loaded on each physical CPU */ +static struct kvm_vcpu *last_vcpu[NR_CPUS]; +/* Pointers to last VCPU executed on each physical CPU */ +static struct kvm_vcpu *last_exec_vcpu[NR_CPUS]; + +/* + * Number of guest VTLB entries to use, so we can catch inconsistency between + * CPUs. + */ +static unsigned int kvm_vz_guest_vtlb_size; + +static inline long kvm_vz_read_gc0_ebase(void) +{ + if (sizeof(long) == 8 && cpu_has_ebase_wg) + return read_gc0_ebase_64(); + else + return read_gc0_ebase(); +} + +static inline void kvm_vz_write_gc0_ebase(long v) +{ + /* + * First write with WG=1 to write upper bits, then write again in case + * WG should be left at 0. + * write_gc0_ebase_64() is no longer UNDEFINED since R6. + */ + if (sizeof(long) == 8 && + (cpu_has_mips64r6 || cpu_has_ebase_wg)) { + write_gc0_ebase_64(v | MIPS_EBASE_WG); + write_gc0_ebase_64(v); + } else { + write_gc0_ebase(v | MIPS_EBASE_WG); + write_gc0_ebase(v); + } +} + +/* + * These Config bits may be writable by the guest: + * Config: [K23, KU] (!TLB), K0 + * Config1: (none) + * Config2: [TU, SU] (impl) + * Config3: ISAOnExc + * Config4: FTLBPageSize + * Config5: K, CV, MSAEn, UFE, FRE, SBRI, UFR + */ + +static inline unsigned int kvm_vz_config_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return CONF_CM_CMASK; +} + +static inline unsigned int kvm_vz_config1_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline unsigned int kvm_vz_config2_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline unsigned int kvm_vz_config3_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return MIPS_CONF3_ISA_OE; +} + +static inline unsigned int kvm_vz_config4_guest_wrmask(struct kvm_vcpu *vcpu) +{ + /* no need to be exact */ + return MIPS_CONF4_VFTLBPAGESIZE; +} + +static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = MIPS_CONF5_K | MIPS_CONF5_CV | MIPS_CONF5_SBRI; + + /* Permit MSAEn changes if MSA supported and enabled */ + if (kvm_mips_guest_has_msa(&vcpu->arch)) + mask |= MIPS_CONF5_MSAEN; + + /* + * Permit guest FPU mode changes if FPU is enabled and the relevant + * feature exists according to FIR register. 
+ */ + if (kvm_mips_guest_has_fpu(&vcpu->arch)) { + if (cpu_has_ufr) + mask |= MIPS_CONF5_UFR; + if (cpu_has_fre) + mask |= MIPS_CONF5_FRE | MIPS_CONF5_UFE; + } + + return mask; +} + +/* + * VZ optionally allows these additional Config bits to be written by root: + * Config: M, [MT] + * Config1: M, [MMUSize-1, C2, MD, PC, WR, CA], FP + * Config2: M + * Config3: M, MSAP, [BPG], ULRI, [DSP2P, DSPP, CTXTC, ITL, LPA, VEIC, + * VInt, SP, CDMM, MT, SM, TL] + * Config4: M, [VTLBSizeExt, MMUSizeExt] + * Config5: [MRP] + */ + +static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config1_user_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = kvm_vz_config1_guest_wrmask(vcpu) | MIPS_CONF_M; + + /* Permit FPU to be present if FPU is supported */ + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) + mask |= MIPS_CONF1_FP; + + return mask; +} + +static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config2_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M | + MIPS_CONF3_ULRI; + + /* Permit MSA to be present if MSA is supported */ + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) + mask |= MIPS_CONF3_MSA; + + return mask; +} + +static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config4_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config5_guest_wrmask(vcpu); +} + +static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva) +{ + /* VZ guest has already converted gva to gpa */ + return gva; +} + +static void kvm_vz_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority) +{ + set_bit(priority, &vcpu->arch.pending_exceptions); + clear_bit(priority, &vcpu->arch.pending_exceptions_clr); +} + +static void kvm_vz_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority) +{ + clear_bit(priority, &vcpu->arch.pending_exceptions); + set_bit(priority, &vcpu->arch.pending_exceptions_clr); +} + +static void kvm_vz_queue_timer_int_cb(struct kvm_vcpu *vcpu) +{ + /* + * timer expiry is asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +static void kvm_vz_dequeue_timer_int_cb(struct kvm_vcpu *vcpu) +{ + /* + * timer expiry is asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +static void kvm_vz_queue_io_int_cb(struct kvm_vcpu *vcpu, + struct kvm_mips_interrupt *irq) +{ + int intr = (int)irq->irq; + + /* + * interrupts are asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + switch (intr) { + case 2: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IO); + break; + + case 3: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_1); + break; + + case 4: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_2); + break; + + default: + break; + } + +} + +static void kvm_vz_dequeue_io_int_cb(struct kvm_vcpu *vcpu, + struct kvm_mips_interrupt *irq) +{ + int intr = (int)irq->irq; + + /* + * interrupts are asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + switch (intr) { + case -2: + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IO); + break; + + case -3: + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_1); + break; + + case -4: + kvm_vz_dequeue_irq(vcpu, 
MIPS_EXC_INT_IPI_2); + break; + + default: + break; + } + +} + +static u32 kvm_vz_priority_to_irq[MIPS_EXC_MAX] = { + [MIPS_EXC_INT_TIMER] = C_IRQ5, + [MIPS_EXC_INT_IO] = C_IRQ0, + [MIPS_EXC_INT_IPI_1] = C_IRQ1, + [MIPS_EXC_INT_IPI_2] = C_IRQ2, +}; + +static int kvm_vz_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, + u32 cause) +{ + u32 irq = (priority < MIPS_EXC_MAX) ? + kvm_vz_priority_to_irq[priority] : 0; + + switch (priority) { + case MIPS_EXC_INT_TIMER: + set_gc0_cause(C_TI); + break; + + case MIPS_EXC_INT_IO: + case MIPS_EXC_INT_IPI_1: + case MIPS_EXC_INT_IPI_2: + if (cpu_has_guestctl2) + set_c0_guestctl2(irq); + else + set_gc0_cause(irq); + break; + + default: + break; + } + + clear_bit(priority, &vcpu->arch.pending_exceptions); + return 1; +} + +static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, + u32 cause) +{ + u32 irq = (priority < MIPS_EXC_MAX) ? + kvm_vz_priority_to_irq[priority] : 0; + + switch (priority) { + case MIPS_EXC_INT_TIMER: + /* + * Call to kvm_write_c0_guest_compare() clears Cause.TI in + * kvm_mips_emulate_CP0(). Explicitly clear irq associated with + * Cause.IP[IPTI] if GuestCtl2 virtual interrupt register not + * supported or if not using GuestCtl2 Hardware Clear. + */ + if (cpu_has_guestctl2) { + if (!(read_c0_guestctl2() & (irq << 14))) + clear_c0_guestctl2(irq); + } else { + clear_gc0_cause(irq); + } + break; + + case MIPS_EXC_INT_IO: + case MIPS_EXC_INT_IPI_1: + case MIPS_EXC_INT_IPI_2: + /* Clear GuestCtl2.VIP irq if not using Hardware Clear */ + if (cpu_has_guestctl2) { + if (!(read_c0_guestctl2() & (irq << 14))) + clear_c0_guestctl2(irq); + } else { + clear_gc0_cause(irq); + } + break; + + default: + break; + } + + clear_bit(priority, &vcpu->arch.pending_exceptions_clr); + return 1; +} + +/* + * VZ guest timer handling. + */ + +/** + * _kvm_vz_restore_stimer() - Restore soft timer state. + * @vcpu: Virtual CPU. + * @compare: CP0_Compare register value, restored by caller. + * @cause: CP0_Cause register to restore. + * + * Restore VZ state relating to the soft timer. + */ +static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare, + u32 cause) +{ + /* + * Avoid spurious counter interrupts by setting Guest CP0_Count to just + * after Guest CP0_Compare. + */ + write_c0_gtoffset(compare - read_c0_count()); + + back_to_back_c0_hazard(); + write_gc0_cause(cause); +} + +/** + * kvm_vz_restore_timer() - Restore guest timer state. + * @vcpu: Virtual CPU. + * + * Restore soft timer state from saved context. + */ +static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + u32 cause, compare; + + compare = kvm_read_sw_gc0_compare(cop0); + cause = kvm_read_sw_gc0_cause(cop0); + + write_gc0_compare(compare); + _kvm_vz_restore_stimer(vcpu, compare, cause); +} + +/** + * kvm_vz_save_timer() - Save guest timer state. + * @vcpu: Virtual CPU. + * + * Save VZ guest timer state. + */ +static void kvm_vz_save_timer(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + u32 compare, cause; + + compare = read_gc0_compare(); + cause = read_gc0_cause(); + + /* save timer-related state to VCPU context */ + kvm_write_sw_gc0_cause(cop0, cause); + kvm_write_sw_gc0_compare(cop0, compare); +} + +/** + * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA. + * @vcpu: KVM VCPU state. + * @gva: Guest virtual address to convert. + * @gpa: Output guest physical address. 
+ * + * Convert a guest virtual address (GVA) which is valid according to the guest + * context, to a guest physical address (GPA). + * + * Returns: 0 on success. + * -errno on failure. + */ +static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa) +{ + u32 gva32 = gva; + + if ((long)gva == (s32)gva32) { + /* Handle canonical 32-bit virtual address */ + if ((s32)gva32 < (s32)0xc0000000) { + /* legacy unmapped KSeg0 or KSeg1 */ + *gpa = gva32 & 0x1fffffff; + return 0; + } +#ifdef CONFIG_64BIT + } else if ((gva & 0xc000000000000000) == 0x8000000000000000) { + /* XKPHYS */ + /* + * Traditionally fully unmapped. + * Bits 61:59 specify the CCA, which we can just mask off here. + * Bits 58:PABITS should be zero, but we shouldn't have got here + * if it wasn't. + */ + *gpa = gva & 0x07ffffffffffffff; + return 0; +#endif + } + + return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa); +} + +/** + * kvm_vz_badvaddr_to_gpa() - Convert GVA BadVAddr from root exception to GPA. + * @vcpu: KVM VCPU state. + * @badvaddr: Root BadVAddr. + * @gpa: Output guest physical address. + * + * VZ implementations are permitted to report guest virtual addresses (GVA) in + * BadVAddr on a root exception during guest execution, instead of the more + * convenient guest physical addresses (GPA). When we get a GVA, this function + * converts it to a GPA, taking into account guest segmentation and guest TLB + * state. + * + * Returns: 0 on success. + * -errno on failure. + */ +static int kvm_vz_badvaddr_to_gpa(struct kvm_vcpu *vcpu, unsigned long badvaddr, + unsigned long *gpa) +{ + unsigned int gexccode = (vcpu->arch.host_cp0_guestctl0 & + MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT; + + /* If BadVAddr is GPA, then all is well in the world */ + if (likely(gexccode == MIPS_GCTL0_GEXC_GPA)) { + *gpa = badvaddr; + return 0; + } + + /* Otherwise we'd expect it to be GVA ... */ + if (WARN(gexccode != MIPS_GCTL0_GEXC_GVA, + "Unexpected gexccode %#x\n", gexccode)) + return -EINVAL; + + /* ... and we need to perform the GVA->GPA translation in software */ + return kvm_vz_gva_to_gpa(vcpu, badvaddr, gpa); +} + +static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu) +{ + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE; + unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; + u32 inst = 0; + + /* + * Fetch the instruction. 
+ */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", + exccode, opc, inst, badvaddr, + read_gc0_status()); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; +} + +static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, + u32 *opc, u32 cause, + struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + enum emulation_result er = EMULATE_DONE; + u32 rt, rd, sel; + unsigned long curr_pc; + unsigned long val; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + if (inst.co_format.co) { + switch (inst.co_format.func) { + case wait_op: + er = kvm_mips_emul_wait(vcpu); + break; + default: + er = EMULATE_FAIL; + } + } else { + rt = inst.c0r_format.rt; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; + + switch (inst.c0r_format.rs) { + case dmfc_op: + case mfc_op: +#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS + cop0->stat[rd][sel]++; +#endif + if (rd == MIPS_CP0_COUNT && + sel == 0) { /* Count */ + val = kvm_mips_read_count(vcpu); + } else if (rd == MIPS_CP0_COMPARE && + sel == 0) { /* Compare */ + val = read_gc0_compare(); + } else if ((rd == MIPS_CP0_PRID && + (sel == 0 || /* PRid */ + sel == 2 || /* CDMMBase */ + sel == 3)) || /* CMGCRBase */ + (rd == MIPS_CP0_STATUS && + (sel == 2 || /* SRSCtl */ + sel == 3)) || /* SRSMap */ + (rd == MIPS_CP0_CONFIG && + (sel == 7)) || /* Config7 */ + (rd == MIPS_CP0_ERRCTL && + (sel == 0))) { /* ErrCtl */ + val = cop0->reg[rd][sel]; + } else { + val = 0; + er = EMULATE_FAIL; + } + + if (er != EMULATE_FAIL) { + /* Sign extend */ + if (inst.c0r_format.rs == mfc_op) + val = (int)val; + vcpu->arch.gprs[rt] = val; + } + + trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mfc_op) ? + KVM_TRACE_MFC0 : KVM_TRACE_DMFC0, + KVM_TRACE_COP0(rd, sel), val); + break; + + case dmtc_op: + case mtc_op: +#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS + cop0->stat[rd][sel]++; +#endif + val = vcpu->arch.gprs[rt]; + trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mtc_op) ? 
+ KVM_TRACE_MTC0 : KVM_TRACE_DMTC0, + KVM_TRACE_COP0(rd, sel), val); + + if (rd == MIPS_CP0_COUNT && + sel == 0) { /* Count */ + kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); + } else if (rd == MIPS_CP0_COMPARE && + sel == 0) { /* Compare */ + kvm_mips_write_compare(vcpu, + vcpu->arch.gprs[rt], + true); + } else if (rd == MIPS_CP0_ERRCTL && + (sel == 0)) { /* ErrCtl */ + /* ignore the written value */ + } else { + er = EMULATE_FAIL; + } + break; + + default: + er = EMULATE_FAIL; + break; + } + } + /* Rollback PC only if emulation was unsuccessful */ + if (er == EMULATE_FAIL) { + kvm_err("[%#lx]%s: unsupported cop0 instruction 0x%08x\n", + curr_pc, __func__, inst.word); + + vcpu->arch.pc = curr_pc; + } + + return er; +} + +static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst, + u32 *opc, u32 cause, + struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + u32 cache, op_inst, op, base; + s16 offset; + struct kvm_vcpu_arch *arch = &vcpu->arch; + unsigned long va, curr_pc; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + base = inst.i_format.rs; + op_inst = inst.i_format.rt; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; + cache = op_inst & CacheOp_Cache; + op = op_inst & CacheOp_Op; + + va = arch->gprs[base] + offset; + + kvm_debug("CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", + cache, op, base, arch->gprs[base], offset); + + /* Secondary or tirtiary cache ops ignored */ + if (cache != Cache_I && cache != Cache_D) + return EMULATE_DONE; + + switch (op_inst) { + case Index_Invalidate_I: + flush_icache_line_indexed(va); + return EMULATE_DONE; + case Index_Writeback_Inv_D: + flush_dcache_line_indexed(va); + return EMULATE_DONE; + default: + break; + }; + + kvm_err("@ %#lx/%#lx CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", + curr_pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base], + offset); + /* Rollback PC */ + vcpu->arch.pc = curr_pc; + + return EMULATE_FAIL; +} + +static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + struct kvm_vcpu_arch *arch = &vcpu->arch; + struct kvm_run *run = vcpu->run; + union mips_instruction inst; + int rd, rt, sel; + int err; + + /* + * Fetch the instruction. 
+ */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + switch (inst.r_format.opcode) { + case cop0_op: + er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu); + break; +#ifndef CONFIG_CPU_MIPSR6 + case cache_op: + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + break; +#endif + case spec3_op: + switch (inst.spec3_format.func) { +#ifdef CONFIG_CPU_MIPSR6 + case cache6_op: + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + break; +#endif + case rdhwr_op: + if (inst.r_format.rs || (inst.r_format.re >> 3)) + goto unknown; + + rd = inst.r_format.rd; + rt = inst.r_format.rt; + sel = inst.r_format.re & 0x7; + + switch (rd) { + case MIPS_HWR_CC: /* Read count register */ + arch->gprs[rt] = + (long)(int)kvm_mips_read_count(vcpu); + break; + default: + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, + KVM_TRACE_HWR(rd, sel), 0); + goto unknown; + }; + + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, + KVM_TRACE_HWR(rd, sel), arch->gprs[rt]); + + er = update_pc(vcpu, cause); + break; + default: + goto unknown; + }; + break; +unknown: + + default: + kvm_err("GPSI exception not supported (%p/%#x)\n", + opc, inst.word); + kvm_arch_vcpu_dump_regs(vcpu); + er = EMULATE_FAIL; + break; + } + + return er; +} + +static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + struct kvm_vcpu_arch *arch = &vcpu->arch; + union mips_instruction inst; + int err; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + /* complete MTC0 on behalf of guest and advance EPC */ + if (inst.c0r_format.opcode == cop0_op && + inst.c0r_format.rs == mtc_op && + inst.c0r_format.z == 0) { + int rt = inst.c0r_format.rt; + int rd = inst.c0r_format.rd; + int sel = inst.c0r_format.sel; + unsigned int val = arch->gprs[rt]; + unsigned int old_val, change; + + trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, KVM_TRACE_COP0(rd, sel), + val); + + if ((rd == MIPS_CP0_STATUS) && (sel == 0)) { + /* FR bit should read as zero if no FPU */ + if (!kvm_mips_guest_has_fpu(&vcpu->arch)) + val &= ~(ST0_CU1 | ST0_FR); + + /* + * Also don't allow FR to be set if host doesn't support + * it. + */ + if (!(boot_cpu_data.fpu_id & MIPS_FPIR_F64)) + val &= ~ST0_FR; + + old_val = read_gc0_status(); + change = val ^ old_val; + + if (change & ST0_FR) { + /* + * FPU and Vector register state is made + * UNPREDICTABLE by a change of FR, so don't + * even bother saving it. + */ + kvm_drop_fpu(vcpu); + } + + /* + * If MSA state is already live, it is undefined how it + * interacts with FR=0 FPU state, and we don't want to + * hit reserved instruction exceptions trying to save + * the MSA state later when CU=1 && FR=1, so play it + * safe and save it first. + */ + if (change & ST0_CU1 && !(val & ST0_FR) && + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) + kvm_lose_fpu(vcpu); + + write_gc0_status(val); + } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) { + u32 old_cause = read_gc0_cause(); + u32 change = old_cause ^ val; + + /* DC bit enabling/disabling timer? 
*/ + if (change & CAUSEF_DC) { + if (val & CAUSEF_DC) + kvm_mips_count_disable_cause(vcpu); + else + kvm_mips_count_enable_cause(vcpu); + } + + /* Only certain bits are RW to the guest */ + change &= (CAUSEF_DC | CAUSEF_IV | CAUSEF_WP | + CAUSEF_IP0 | CAUSEF_IP1); + + /* WP can only be cleared */ + change &= ~CAUSEF_WP | old_cause; + + write_gc0_cause(old_cause ^ change); + } else if ((rd == MIPS_CP0_STATUS) && (sel == 1)) { /* IntCtl */ + write_gc0_intctl(val); + } else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) { + old_val = read_gc0_config5(); + change = val ^ old_val; + /* Handle changes in FPU/MSA modes */ + preempt_disable(); + + /* + * Propagate FRE changes immediately if the FPU + * context is already loaded. + */ + if (change & MIPS_CONF5_FRE && + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) + change_c0_config5(MIPS_CONF5_FRE, val); + + preempt_enable(); + + val = old_val ^ + (change & kvm_vz_config5_guest_wrmask(vcpu)); + write_gc0_config5(val); + } else { + kvm_err("Handle GSFC, unsupported field change @ %p: %#x\n", + opc, inst.word); + er = EMULATE_FAIL; + } + + if (er != EMULATE_FAIL) + er = update_pc(vcpu, cause); + } else { + kvm_err("Handle GSFC, unrecognized instruction @ %p: %#x\n", + opc, inst.word); + er = EMULATE_FAIL; + } + + return er; +} + +static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + unsigned long curr_pc; + int err; + + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + er = kvm_mips_emul_hypcall(vcpu, inst); + if (er == EMULATE_FAIL) + vcpu->arch.pc = curr_pc; + + return er; +} + +static enum emulation_result kvm_trap_vz_no_handler_guest_exit(u32 gexccode, + u32 cause, + u32 *opc, + struct kvm_vcpu *vcpu) +{ + u32 inst; + + /* + * Fetch the instruction. 
+ */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Guest Exception Code: %d not yet handled @ PC: %p, inst: 0x%08x Status: %#x\n", + gexccode, opc, inst, read_gc0_status()); + + return EMULATE_FAIL; +} + +static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) +{ + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + enum emulation_result er = EMULATE_DONE; + u32 gexccode = (vcpu->arch.host_cp0_guestctl0 & + MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT; + int ret = RESUME_GUEST; + + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_GEXCCODE_BASE + gexccode); + switch (gexccode) { + case MIPS_GCTL0_GEXC_GPSI: + ++vcpu->stat.vz_gpsi_exits; + er = kvm_trap_vz_handle_gpsi(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_GSFC: + ++vcpu->stat.vz_gsfc_exits; + er = kvm_trap_vz_handle_gsfc(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_HC: + ++vcpu->stat.vz_hc_exits; + er = kvm_trap_vz_handle_hc(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_GRR: + ++vcpu->stat.vz_grr_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + case MIPS_GCTL0_GEXC_GVA: + ++vcpu->stat.vz_gva_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + case MIPS_GCTL0_GEXC_GHFC: + ++vcpu->stat.vz_ghfc_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + case MIPS_GCTL0_GEXC_GPA: + ++vcpu->stat.vz_gpa_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + default: + ++vcpu->stat.vz_resvd_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_HYPERCALL) { + ret = kvm_mips_handle_hypcall(vcpu); + } else { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +/** + * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use a coprocessor which hasn't been allowed + * by the root context. + */ +static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 cause = vcpu->arch.host_cp0_cause; + enum emulation_result er = EMULATE_FAIL; + int ret = RESUME_GUEST; + + if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) { + /* + * If guest FPU not present, the FPU operation should have been + * treated as a reserved instruction! + * If FPU already in use, we shouldn't get this at all. + */ + if (WARN_ON(!kvm_mips_guest_has_fpu(&vcpu->arch) || + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) { + preempt_enable(); + return EMULATE_FAIL; + } + + kvm_own_fpu(vcpu); + er = EMULATE_DONE; + } + /* other coprocessors not handled */ + + switch (er) { + case EMULATE_DONE: + ret = RESUME_GUEST; + break; + + case EMULATE_FAIL: + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + break; + + default: + BUG(); + } + return ret; +} + +/** + * kvm_trap_vz_handle_msa_disabled() - Guest used MSA while disabled in root. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use MSA when it is disabled in the root + * context. + */ +static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + /* + * If MSA not present or not exposed to guest or FR=0, the MSA operation + * should have been treated as a reserved instruction! + * Same if CU1=1, FR=0. 
+ * If MSA already in use, we shouldn't get this at all. + */ + if (!kvm_mips_guest_has_msa(&vcpu->arch) || + (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 || + !(read_gc0_config5() & MIPS_CONF5_MSAEN) || + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + kvm_own_msa(vcpu); + + return RESUME_GUEST; +} + +static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + ulong badvaddr = vcpu->arch.host_cp0_badvaddr; + union mips_instruction inst; + enum emulation_result er = EMULATE_DONE; + int err, ret = RESUME_GUEST; + + if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, false)) { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Fetch the instruction */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Treat as MMIO */ + er = kvm_mips_emulate_load(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n", + opc, badvaddr); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + ulong badvaddr = vcpu->arch.host_cp0_badvaddr; + union mips_instruction inst; + enum emulation_result er = EMULATE_DONE; + int err; + int ret = RESUME_GUEST; + + /* Just try the access again if we couldn't do the translation */ + if (kvm_vz_badvaddr_to_gpa(vcpu, badvaddr, &badvaddr)) + return RESUME_GUEST; + vcpu->arch.host_cp0_badvaddr = badvaddr; + + if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, true)) { + /* Fetch the instruction */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Treat as MMIO */ + er = kvm_mips_emulate_store(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n", + opc, badvaddr); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +static u64 kvm_vz_get_one_regs[] = { + KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_ENTRYLO0, + KVM_REG_MIPS_CP0_ENTRYLO1, + KVM_REG_MIPS_CP0_CONTEXT, + KVM_REG_MIPS_CP0_PAGEMASK, + KVM_REG_MIPS_CP0_PAGEGRAIN, + KVM_REG_MIPS_CP0_WIRED, + KVM_REG_MIPS_CP0_HWRENA, + KVM_REG_MIPS_CP0_BADVADDR, + KVM_REG_MIPS_CP0_COUNT, + KVM_REG_MIPS_CP0_ENTRYHI, + KVM_REG_MIPS_CP0_COMPARE, + KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_INTCTL, + KVM_REG_MIPS_CP0_CAUSE, + KVM_REG_MIPS_CP0_EPC, + KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_EBASE, + KVM_REG_MIPS_CP0_CONFIG, + KVM_REG_MIPS_CP0_CONFIG1, + 
KVM_REG_MIPS_CP0_CONFIG2, + KVM_REG_MIPS_CP0_CONFIG3, + KVM_REG_MIPS_CP0_CONFIG4, + KVM_REG_MIPS_CP0_CONFIG5, +#ifdef CONFIG_64BIT + KVM_REG_MIPS_CP0_XCONTEXT, +#endif + KVM_REG_MIPS_CP0_ERROREPC, + + KVM_REG_MIPS_COUNT_CTL, + KVM_REG_MIPS_COUNT_RESUME, + KVM_REG_MIPS_COUNT_HZ, +}; + +static u64 kvm_vz_get_one_regs_kscratch[] = { + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, +}; + +static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) +{ + unsigned long ret; + + ret = ARRAY_SIZE(kvm_vz_get_one_regs); + if (cpu_guest_has_userlocal) + ++ret; + ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); + + return ret; +} + +static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) +{ + u64 index; + unsigned int i; + + if (copy_to_user(indices, kvm_vz_get_one_regs, + sizeof(kvm_vz_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs); + + if (cpu_guest_has_userlocal) { + index = KVM_REG_MIPS_CP0_USERLOCAL; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + for (i = 0; i < 6; ++i) { + if (!cpu_guest_has_kscr(i + 2)) + continue; + + if (copy_to_user(indices, &kvm_vz_get_one_regs_kscratch[i], + sizeof(kvm_vz_get_one_regs_kscratch[i]))) + return -EFAULT; + ++indices; + } + + return 0; +} + +static inline s64 entrylo_kvm_to_user(unsigned long v) +{ + s64 mask, ret = v; + + if (BITS_PER_LONG == 32) { + /* + * KVM API exposes 64-bit version of the register, so move the + * RI/XI bits up into place. + */ + mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI; + ret &= ~mask; + ret |= ((s64)v & mask) << 32; + } + return ret; +} + +static inline unsigned long entrylo_user_to_kvm(s64 v) +{ + unsigned long mask, ret = v; + + if (BITS_PER_LONG == 32) { + /* + * KVM API exposes 64-bit versiono of the register, so move the + * RI/XI bits down into place. 
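+	 * For example (an illustrative value, not from the original comment):
+	 * the 64-bit user view carries RI/XI in bits 63:62, while a 32-bit
+	 * host keeps them in bits 31:30, so a user value of 0xc000000000000fd6
+	 * becomes the host value 0xc0000fd6 after the shuffle below.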
+ */ + mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI; + ret &= ~mask; + ret |= (v >> 32) & mask; + } + return ret; +} + +static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + s64 *v) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned int idx; + + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + *v = (long)read_gc0_index(); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + *v = entrylo_kvm_to_user(read_gc0_entrylo0()); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + *v = entrylo_kvm_to_user(read_gc0_entrylo1()); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + *v = (long)read_gc0_context(); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + if (!cpu_guest_has_userlocal) + return -EINVAL; + *v = read_gc0_userlocal(); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + *v = (long)read_gc0_pagemask(); + break; + case KVM_REG_MIPS_CP0_PAGEGRAIN: + *v = (long)read_gc0_pagegrain(); + break; + case KVM_REG_MIPS_CP0_WIRED: + *v = (long)read_gc0_wired(); + break; + case KVM_REG_MIPS_CP0_HWRENA: + *v = (long)read_gc0_hwrena(); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + *v = (long)read_gc0_badvaddr(); + break; + case KVM_REG_MIPS_CP0_COUNT: + *v = kvm_mips_read_count(vcpu); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + *v = (long)read_gc0_entryhi(); + break; + case KVM_REG_MIPS_CP0_COMPARE: + *v = (long)read_gc0_compare(); + break; + case KVM_REG_MIPS_CP0_STATUS: + *v = (long)read_gc0_status(); + break; + case KVM_REG_MIPS_CP0_INTCTL: + *v = read_gc0_intctl(); + break; + case KVM_REG_MIPS_CP0_CAUSE: + *v = (long)read_gc0_cause(); + break; + case KVM_REG_MIPS_CP0_EPC: + *v = (long)read_gc0_epc(); + break; + case KVM_REG_MIPS_CP0_PRID: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + case KVM_REG_MIPS_CP0_EBASE: + *v = kvm_vz_read_gc0_ebase(); + break; + case KVM_REG_MIPS_CP0_CONFIG: + *v = read_gc0_config(); + break; + case KVM_REG_MIPS_CP0_CONFIG1: + if (!cpu_guest_has_conf1) + return -EINVAL; + *v = read_gc0_config1(); + break; + case KVM_REG_MIPS_CP0_CONFIG2: + if (!cpu_guest_has_conf2) + return -EINVAL; + *v = read_gc0_config2(); + break; + case KVM_REG_MIPS_CP0_CONFIG3: + if (!cpu_guest_has_conf3) + return -EINVAL; + *v = read_gc0_config3(); + break; + case KVM_REG_MIPS_CP0_CONFIG4: + if (!cpu_guest_has_conf4) + return -EINVAL; + *v = read_gc0_config4(); + break; + case KVM_REG_MIPS_CP0_CONFIG5: + if (!cpu_guest_has_conf5) + return -EINVAL; + *v = read_gc0_config5(); + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXT: + *v = read_gc0_xcontext(); + break; +#endif + case KVM_REG_MIPS_CP0_ERROREPC: + *v = (long)read_gc0_errorepc(); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... 
KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!cpu_guest_has_kscr(idx)) + return -EINVAL; + switch (idx) { + case 2: + *v = (long)read_gc0_kscratch1(); + break; + case 3: + *v = (long)read_gc0_kscratch2(); + break; + case 4: + *v = (long)read_gc0_kscratch3(); + break; + case 5: + *v = (long)read_gc0_kscratch4(); + break; + case 6: + *v = (long)read_gc0_kscratch5(); + break; + case 7: + *v = (long)read_gc0_kscratch6(); + break; + } + break; + case KVM_REG_MIPS_COUNT_CTL: + *v = vcpu->arch.count_ctl; + break; + case KVM_REG_MIPS_COUNT_RESUME: + *v = ktime_to_ns(vcpu->arch.count_resume); + break; + case KVM_REG_MIPS_COUNT_HZ: + *v = vcpu->arch.count_hz; + break; + default: + return -EINVAL; + } + return 0; +} + +static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + s64 v) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned int idx; + int ret = 0; + unsigned int cur, change; + + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + write_gc0_index(v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + write_gc0_entrylo0(entrylo_user_to_kvm(v)); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + write_gc0_entrylo1(entrylo_user_to_kvm(v)); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + write_gc0_context(v); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + if (!cpu_guest_has_userlocal) + return -EINVAL; + write_gc0_userlocal(v); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + write_gc0_pagemask(v); + break; + case KVM_REG_MIPS_CP0_PAGEGRAIN: + write_gc0_pagegrain(v); + break; + case KVM_REG_MIPS_CP0_WIRED: + change_gc0_wired(MIPSR6_WIRED_WIRED, v); + break; + case KVM_REG_MIPS_CP0_HWRENA: + write_gc0_hwrena(v); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + write_gc0_badvaddr(v); + break; + case KVM_REG_MIPS_CP0_COUNT: + kvm_mips_write_count(vcpu, v); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + write_gc0_entryhi(v); + break; + case KVM_REG_MIPS_CP0_COMPARE: + kvm_mips_write_compare(vcpu, v, false); + break; + case KVM_REG_MIPS_CP0_STATUS: + write_gc0_status(v); + break; + case KVM_REG_MIPS_CP0_INTCTL: + write_gc0_intctl(v); + break; + case KVM_REG_MIPS_CP0_CAUSE: + /* + * If the timer is stopped or started (DC bit) it must look + * atomic with changes to the timer interrupt pending bit (TI). + * A timer interrupt should not happen in between. 
+ */ + if ((read_gc0_cause() ^ v) & CAUSEF_DC) { + if (v & CAUSEF_DC) { + /* disable timer first */ + kvm_mips_count_disable_cause(vcpu); + change_gc0_cause((u32)~CAUSEF_DC, v); + } else { + /* enable timer last */ + change_gc0_cause((u32)~CAUSEF_DC, v); + kvm_mips_count_enable_cause(vcpu); + } + } else { + write_gc0_cause(v); + } + break; + case KVM_REG_MIPS_CP0_EPC: + write_gc0_epc(v); + break; + case KVM_REG_MIPS_CP0_PRID: + kvm_write_c0_guest_prid(cop0, v); + break; + case KVM_REG_MIPS_CP0_EBASE: + kvm_vz_write_gc0_ebase(v); + break; + case KVM_REG_MIPS_CP0_CONFIG: + cur = read_gc0_config(); + change = (cur ^ v) & kvm_vz_config_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG1: + if (!cpu_guest_has_conf1) + break; + cur = read_gc0_config1(); + change = (cur ^ v) & kvm_vz_config1_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config1(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG2: + if (!cpu_guest_has_conf2) + break; + cur = read_gc0_config2(); + change = (cur ^ v) & kvm_vz_config2_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config2(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG3: + if (!cpu_guest_has_conf3) + break; + cur = read_gc0_config3(); + change = (cur ^ v) & kvm_vz_config3_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config3(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG4: + if (!cpu_guest_has_conf4) + break; + cur = read_gc0_config4(); + change = (cur ^ v) & kvm_vz_config4_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config4(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG5: + if (!cpu_guest_has_conf5) + break; + cur = read_gc0_config5(); + change = (cur ^ v) & kvm_vz_config5_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config5(v); + } + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXT: + write_gc0_xcontext(v); + break; +#endif + case KVM_REG_MIPS_CP0_ERROREPC: + write_gc0_errorepc(v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... 
KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!cpu_guest_has_kscr(idx)) + return -EINVAL; + switch (idx) { + case 2: + write_gc0_kscratch1(v); + break; + case 3: + write_gc0_kscratch2(v); + break; + case 4: + write_gc0_kscratch3(v); + break; + case 5: + write_gc0_kscratch4(v); + break; + case 6: + write_gc0_kscratch5(v); + break; + case 7: + write_gc0_kscratch6(v); + break; + } + break; + case KVM_REG_MIPS_COUNT_CTL: + ret = kvm_mips_set_count_ctl(vcpu, v); + break; + case KVM_REG_MIPS_COUNT_RESUME: + ret = kvm_mips_set_count_resume(vcpu, v); + break; + case KVM_REG_MIPS_COUNT_HZ: + ret = kvm_mips_set_count_hz(vcpu, v); + break; + default: + return -EINVAL; + } + return ret; +} + +#define guestid_cache(cpu) (cpu_data[cpu].guestid_cache) +static void kvm_vz_get_new_guestid(unsigned long cpu, struct kvm_vcpu *vcpu) +{ + unsigned long guestid = guestid_cache(cpu); + + if (!(++guestid & GUESTID_MASK)) { + if (cpu_has_vtag_icache) + flush_icache_all(); + + if (!guestid) /* fix version if needed */ + guestid = GUESTID_FIRST_VERSION; + + ++guestid; /* guestid 0 reserved for root */ + + /* start new guestid cycle */ + kvm_vz_local_flush_roottlb_all_guests(); + kvm_vz_local_flush_guesttlb_all(); + } + + guestid_cache(cpu) = guestid; +} + +/* Returns 1 if the guest TLB may be clobbered */ +static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu) +{ + int ret = 0; + int i; + + if (!vcpu->requests) + return 0; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + if (cpu_has_guestid) { + /* Drop all GuestIDs for this VCPU */ + for_each_possible_cpu(i) + vcpu->arch.vzguestid[i] = 0; + /* This will clobber guest TLB contents too */ + ret = 1; + } + /* + * For Root ASID Dealias (RAD) we don't do anything here, but we + * still need the request to ensure we recheck asid_flush_mask. + * We can still return 0 as only the root TLB will be affected + * by a root ASID flush. + */ + } + + return ret; +} + +static void kvm_vz_vcpu_save_wired(struct kvm_vcpu *vcpu) +{ + unsigned int wired = read_gc0_wired(); + struct kvm_mips_tlb *tlbs; + int i; + + /* Expand the wired TLB array if necessary */ + wired &= MIPSR6_WIRED_WIRED; + if (wired > vcpu->arch.wired_tlb_limit) { + tlbs = krealloc(vcpu->arch.wired_tlb, wired * + sizeof(*vcpu->arch.wired_tlb), GFP_ATOMIC); + if (WARN_ON(!tlbs)) { + /* Save whatever we can */ + wired = vcpu->arch.wired_tlb_limit; + } else { + vcpu->arch.wired_tlb = tlbs; + vcpu->arch.wired_tlb_limit = wired; + } + } + + if (wired) + /* Save wired entries from the guest TLB */ + kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0, wired); + /* Invalidate any dropped entries since last time */ + for (i = wired; i < vcpu->arch.wired_tlb_used; ++i) { + vcpu->arch.wired_tlb[i].tlb_hi = UNIQUE_GUEST_ENTRYHI(i); + vcpu->arch.wired_tlb[i].tlb_lo[0] = 0; + vcpu->arch.wired_tlb[i].tlb_lo[1] = 0; + vcpu->arch.wired_tlb[i].tlb_mask = 0; + } + vcpu->arch.wired_tlb_used = wired; +} + +static void kvm_vz_vcpu_load_wired(struct kvm_vcpu *vcpu) +{ + /* Load wired entries into the guest TLB */ + if (vcpu->arch.wired_tlb) + kvm_vz_load_guesttlb(vcpu->arch.wired_tlb, 0, + vcpu->arch.wired_tlb_used); +} + +static void kvm_vz_vcpu_load_tlb(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvm *kvm = vcpu->kvm; + struct mm_struct *gpa_mm = &kvm->arch.gpa_mm; + bool migrated; + + /* + * Are we entering guest context on a different CPU to last time? + * If so, the VCPU's guest TLB state on this CPU may be stale. 
+ */ + migrated = (vcpu->arch.last_exec_cpu != cpu); + vcpu->arch.last_exec_cpu = cpu; + + /* + * A vcpu's GuestID is set in GuestCtl1.ID when the vcpu is loaded and + * remains set until another vcpu is loaded in. As a rule GuestRID + * remains zeroed when in root context unless the kernel is busy + * manipulating guest tlb entries. + */ + if (cpu_has_guestid) { + /* + * Check if our GuestID is of an older version and thus invalid. + * + * We also discard the stored GuestID if we've executed on + * another CPU, as the guest mappings may have changed without + * hypervisor knowledge. + */ + if (migrated || + (vcpu->arch.vzguestid[cpu] ^ guestid_cache(cpu)) & + GUESTID_VERSION_MASK) { + kvm_vz_get_new_guestid(cpu, vcpu); + vcpu->arch.vzguestid[cpu] = guestid_cache(cpu); + trace_kvm_guestid_change(vcpu, + vcpu->arch.vzguestid[cpu]); + } + + /* Restore GuestID */ + change_c0_guestctl1(GUESTID_MASK, vcpu->arch.vzguestid[cpu]); + } else { + /* + * The Guest TLB only stores a single guest's TLB state, so + * flush it if another VCPU has executed on this CPU. + * + * We also flush if we've executed on another CPU, as the guest + * mappings may have changed without hypervisor knowledge. + */ + if (migrated || last_exec_vcpu[cpu] != vcpu) + kvm_vz_local_flush_guesttlb_all(); + last_exec_vcpu[cpu] = vcpu; + + /* + * Root ASID dealiases guest GPA mappings in the root TLB. + * Allocate new root ASID if needed. + */ + if (cpumask_test_and_clear_cpu(cpu, &kvm->arch.asid_flush_mask) + || (cpu_context(cpu, gpa_mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(gpa_mm, cpu); + } +} + +static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + bool migrated, all; + + /* + * Have we migrated to a different CPU? + * If so, any old guest TLB state may be stale. + */ + migrated = (vcpu->arch.last_sched_cpu != cpu); + + /* + * Was this the last VCPU to run on this CPU? + * If not, any old guest state from this VCPU will have been clobbered. + */ + all = migrated || (last_vcpu[cpu] != vcpu); + last_vcpu[cpu] = vcpu; + + /* + * Restore CP0_Wired unconditionally as we clear it after use, and + * restore wired guest TLB entries (while in guest context). + */ + kvm_restore_gc0_wired(cop0); + if (current->flags & PF_VCPU) { + tlbw_use_hazard(); + kvm_vz_vcpu_load_tlb(vcpu, cpu); + kvm_vz_vcpu_load_wired(vcpu); + } + + /* + * Restore timer state regardless, as e.g. Cause.TI can change over time + * if left unmaintained. + */ + kvm_vz_restore_timer(vcpu); + + /* Don't bother restoring registers multiple times unless necessary */ + if (!all) + return 0; + + /* + * Restore config registers first, as some implementations restrict + * writes to other registers when the corresponding feature bits aren't + * set. For example Status.CU1 cannot be set unless Config1.FP is set. 
+ */ + kvm_restore_gc0_config(cop0); + if (cpu_guest_has_conf1) + kvm_restore_gc0_config1(cop0); + if (cpu_guest_has_conf2) + kvm_restore_gc0_config2(cop0); + if (cpu_guest_has_conf3) + kvm_restore_gc0_config3(cop0); + if (cpu_guest_has_conf4) + kvm_restore_gc0_config4(cop0); + if (cpu_guest_has_conf5) + kvm_restore_gc0_config5(cop0); + if (cpu_guest_has_conf6) + kvm_restore_gc0_config6(cop0); + if (cpu_guest_has_conf7) + kvm_restore_gc0_config7(cop0); + + kvm_restore_gc0_index(cop0); + kvm_restore_gc0_entrylo0(cop0); + kvm_restore_gc0_entrylo1(cop0); + kvm_restore_gc0_context(cop0); +#ifdef CONFIG_64BIT + kvm_restore_gc0_xcontext(cop0); +#endif + kvm_restore_gc0_pagemask(cop0); + kvm_restore_gc0_pagegrain(cop0); + kvm_restore_gc0_hwrena(cop0); + kvm_restore_gc0_badvaddr(cop0); + kvm_restore_gc0_entryhi(cop0); + kvm_restore_gc0_status(cop0); + kvm_restore_gc0_intctl(cop0); + kvm_restore_gc0_epc(cop0); + kvm_vz_write_gc0_ebase(kvm_read_sw_gc0_ebase(cop0)); + if (cpu_guest_has_userlocal) + kvm_restore_gc0_userlocal(cop0); + + kvm_restore_gc0_errorepc(cop0); + + /* restore KScratch registers if enabled in guest */ + if (cpu_guest_has_conf4) { + if (cpu_guest_has_kscr(2)) + kvm_restore_gc0_kscratch1(cop0); + if (cpu_guest_has_kscr(3)) + kvm_restore_gc0_kscratch2(cop0); + if (cpu_guest_has_kscr(4)) + kvm_restore_gc0_kscratch3(cop0); + if (cpu_guest_has_kscr(5)) + kvm_restore_gc0_kscratch4(cop0); + if (cpu_guest_has_kscr(6)) + kvm_restore_gc0_kscratch5(cop0); + if (cpu_guest_has_kscr(7)) + kvm_restore_gc0_kscratch6(cop0); + } + + /* restore Root.GuestCtl2 from unused Guest guestctl2 register */ + if (cpu_has_guestctl2) + write_c0_guestctl2( + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]); + + return 0; +} + +static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + if (current->flags & PF_VCPU) + kvm_vz_vcpu_save_wired(vcpu); + + kvm_lose_fpu(vcpu); + + kvm_save_gc0_index(cop0); + kvm_save_gc0_entrylo0(cop0); + kvm_save_gc0_entrylo1(cop0); + kvm_save_gc0_context(cop0); +#ifdef CONFIG_64BIT + kvm_save_gc0_xcontext(cop0); +#endif + kvm_save_gc0_pagemask(cop0); + kvm_save_gc0_pagegrain(cop0); + kvm_save_gc0_wired(cop0); + /* allow wired TLB entries to be overwritten */ + clear_gc0_wired(MIPSR6_WIRED_WIRED); + kvm_save_gc0_hwrena(cop0); + kvm_save_gc0_badvaddr(cop0); + kvm_save_gc0_entryhi(cop0); + kvm_save_gc0_status(cop0); + kvm_save_gc0_intctl(cop0); + kvm_save_gc0_epc(cop0); + kvm_write_sw_gc0_ebase(cop0, kvm_vz_read_gc0_ebase()); + if (cpu_guest_has_userlocal) + kvm_save_gc0_userlocal(cop0); + + /* only save implemented config registers */ + kvm_save_gc0_config(cop0); + if (cpu_guest_has_conf1) + kvm_save_gc0_config1(cop0); + if (cpu_guest_has_conf2) + kvm_save_gc0_config2(cop0); + if (cpu_guest_has_conf3) + kvm_save_gc0_config3(cop0); + if (cpu_guest_has_conf4) + kvm_save_gc0_config4(cop0); + if (cpu_guest_has_conf5) + kvm_save_gc0_config5(cop0); + if (cpu_guest_has_conf6) + kvm_save_gc0_config6(cop0); + if (cpu_guest_has_conf7) + kvm_save_gc0_config7(cop0); + + kvm_save_gc0_errorepc(cop0); + + /* save KScratch registers if enabled in guest */ + if (cpu_guest_has_conf4) { + if (cpu_guest_has_kscr(2)) + kvm_save_gc0_kscratch1(cop0); + if (cpu_guest_has_kscr(3)) + kvm_save_gc0_kscratch2(cop0); + if (cpu_guest_has_kscr(4)) + kvm_save_gc0_kscratch3(cop0); + if (cpu_guest_has_kscr(5)) + kvm_save_gc0_kscratch4(cop0); + if (cpu_guest_has_kscr(6)) + kvm_save_gc0_kscratch5(cop0); + if (cpu_guest_has_kscr(7)) + kvm_save_gc0_kscratch6(cop0); 
+ } + + kvm_vz_save_timer(vcpu); + + /* save Root.GuestCtl2 in unused Guest guestctl2 register */ + if (cpu_has_guestctl2) + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = + read_c0_guestctl2(); + + return 0; +} + +/** + * kvm_vz_resize_guest_vtlb() - Attempt to resize guest VTLB. + * @size: Number of guest VTLB entries (0 < @size <= root VTLB entries). + * + * Attempt to resize the guest VTLB by writing guest Config registers. This is + * necessary for cores with a shared root/guest TLB to avoid overlap with wired + * entries in the root VTLB. + * + * Returns: The resulting guest VTLB size. + */ +static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size) +{ + unsigned int config4 = 0, ret = 0, limit; + + /* Write MMUSize - 1 into guest Config registers */ + if (cpu_guest_has_conf1) + change_gc0_config1(MIPS_CONF1_TLBS, + (size - 1) << MIPS_CONF1_TLBS_SHIFT); + if (cpu_guest_has_conf4) { + config4 = read_gc0_config4(); + if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) { + config4 &= ~MIPS_CONF4_VTLBSIZEEXT; + config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) << + MIPS_CONF4_VTLBSIZEEXT_SHIFT; + } else if ((config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) { + config4 &= ~MIPS_CONF4_MMUSIZEEXT; + config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) << + MIPS_CONF4_MMUSIZEEXT_SHIFT; + } + write_gc0_config4(config4); + } + + /* + * Set Guest.Wired.Limit = 0 (no limit up to Guest.MMUSize-1), unless it + * would exceed Root.Wired.Limit (clearing Guest.Wired.Wired so write + * not dropped) + */ + if (cpu_has_mips_r6) { + limit = (read_c0_wired() & MIPSR6_WIRED_LIMIT) >> + MIPSR6_WIRED_LIMIT_SHIFT; + if (size - 1 <= limit) + limit = 0; + write_gc0_wired(limit << MIPSR6_WIRED_LIMIT_SHIFT); + } + + /* Read back MMUSize - 1 */ + back_to_back_c0_hazard(); + if (cpu_guest_has_conf1) + ret = (read_gc0_config1() & MIPS_CONF1_TLBS) >> + MIPS_CONF1_TLBS_SHIFT; + if (config4) { + if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) + ret |= ((config4 & MIPS_CONF4_VTLBSIZEEXT) >> + MIPS_CONF4_VTLBSIZEEXT_SHIFT) << + MIPS_CONF1_TLBS_SIZE; + else if ((config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) + ret |= ((config4 & MIPS_CONF4_MMUSIZEEXT) >> + MIPS_CONF4_MMUSIZEEXT_SHIFT) << + MIPS_CONF1_TLBS_SIZE; + } + return ret + 1; +} + +static int kvm_vz_hardware_enable(void) +{ + unsigned int mmu_size, guest_mmu_size, ftlb_size; + + /* + * ImgTec cores tend to use a shared root/guest TLB. To avoid overlap of + * root wired and guest entries, the guest TLB may need resizing. + */ + mmu_size = current_cpu_data.tlbsizevtlb; + ftlb_size = current_cpu_data.tlbsize - mmu_size; + + /* Try switching to maximum guest VTLB size for flush */ + guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + kvm_vz_local_flush_guesttlb_all(); + + /* + * Reduce to make space for root wired entries and at least 2 root + * non-wired entries. This does assume that long-term wired entries + * won't be added later. + */ + guest_mmu_size = mmu_size - num_wired_entries() - 2; + guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + + /* + * Write the VTLB size, but if another CPU has already written, check it + * matches or we won't provide a consistent view to the guest. If this + * ever happens it suggests an asymmetric number of wired entries. 
+ */ + if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) && + WARN(guest_mmu_size != kvm_vz_guest_vtlb_size, + "Available guest VTLB size mismatch")) + return -EINVAL; + + /* + * Enable virtualization features granting guest direct control of + * certain features: + * CP0=1: Guest coprocessor 0 context. + * AT=Guest: Guest MMU. + * CG=1: Hit (virtual address) CACHE operations (optional). + * CF=1: Guest Config registers. + * CGI=1: Indexed flush CACHE operations (optional). + */ + write_c0_guestctl0(MIPS_GCTL0_CP0 | + (MIPS_GCTL0_AT_GUEST << MIPS_GCTL0_AT_SHIFT) | + MIPS_GCTL0_CG | MIPS_GCTL0_CF); + if (cpu_has_guestctl0ext) + set_c0_guestctl0ext(MIPS_GCTL0EXT_CGI); + + if (cpu_has_guestid) { + write_c0_guestctl1(0); + kvm_vz_local_flush_roottlb_all_guests(); + + GUESTID_MASK = current_cpu_data.guestid_mask; + GUESTID_FIRST_VERSION = GUESTID_MASK + 1; + GUESTID_VERSION_MASK = ~GUESTID_MASK; + + current_cpu_data.guestid_cache = GUESTID_FIRST_VERSION; + } + + /* clear any pending injected virtual guest interrupts */ + if (cpu_has_guestctl2) + clear_c0_guestctl2(0x3f << 10); + + return 0; +} + +static void kvm_vz_hardware_disable(void) +{ + kvm_vz_local_flush_guesttlb_all(); + + if (cpu_has_guestid) { + write_c0_guestctl1(0); + kvm_vz_local_flush_roottlb_all_guests(); + } +} + +static int kvm_vz_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_MIPS_VZ: + /* we wouldn't be here unless cpu_has_vz */ + r = 1; + break; +#ifdef CONFIG_64BIT + case KVM_CAP_MIPS_64BIT: + /* We support 64-bit registers/operations and addresses */ + r = 2; + break; +#endif + default: + r = 0; + break; + } + + return r; +} + +static int kvm_vz_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + + for_each_possible_cpu(i) + vcpu->arch.vzguestid[i] = 0; + + return 0; +} + +static void kvm_vz_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + int cpu; + + /* + * If the VCPU is freed and reused as another VCPU, we don't want the + * matching pointer wrongly hanging around in last_vcpu[] or + * last_exec_vcpu[]. + */ + for_each_possible_cpu(cpu) { + if (last_vcpu[cpu] == vcpu) + last_vcpu[cpu] = NULL; + if (last_exec_vcpu[cpu] == vcpu) + last_exec_vcpu[cpu] = NULL; + } +} + +static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned long count_hz = 100*1000*1000; /* default to 100 MHz */ + + /* + * Start off the timer at the same frequency as the host timer, but the + * soft timer doesn't handle frequencies greater than 1GHz yet. + */ + if (mips_hpt_frequency && mips_hpt_frequency <= NSEC_PER_SEC) + count_hz = mips_hpt_frequency; + kvm_mips_init_count(vcpu, count_hz); + + /* + * Initialize guest register state to valid architectural reset state. + */ + + /* PageGrain */ + if (cpu_has_mips_r6) + kvm_write_sw_gc0_pagegrain(cop0, PG_RIE | PG_XIE | PG_IEC); + /* Wired */ + if (cpu_has_mips_r6) + kvm_write_sw_gc0_wired(cop0, + read_gc0_wired() & MIPSR6_WIRED_LIMIT); + /* Status */ + kvm_write_sw_gc0_status(cop0, ST0_BEV | ST0_ERL); + if (cpu_has_mips_r6) + kvm_change_sw_gc0_status(cop0, ST0_FR, read_gc0_status()); + /* IntCtl */ + kvm_write_sw_gc0_intctl(cop0, read_gc0_intctl() & + (INTCTLF_IPFDC | INTCTLF_IPPCI | INTCTLF_IPTI)); + /* PRId */ + kvm_write_sw_gc0_prid(cop0, boot_cpu_data.processor_id); + /* EBase */ + kvm_write_sw_gc0_ebase(cop0, (s32)0x80000000 | vcpu->vcpu_id); + /* Config */ + kvm_save_gc0_config(cop0); + /* architecturally writable (e.g. 
from guest) */ + kvm_change_sw_gc0_config(cop0, CONF_CM_CMASK, + _page_cachable_default >> _CACHE_SHIFT); + /* architecturally read only, but maybe writable from root */ + kvm_change_sw_gc0_config(cop0, MIPS_CONF_MT, read_c0_config()); + if (cpu_guest_has_conf1) { + kvm_set_sw_gc0_config(cop0, MIPS_CONF_M); + /* Config1 */ + kvm_save_gc0_config1(cop0); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config1(cop0, MIPS_CONF1_C2 | + MIPS_CONF1_MD | + MIPS_CONF1_PC | + MIPS_CONF1_WR | + MIPS_CONF1_CA | + MIPS_CONF1_FP); + } + if (cpu_guest_has_conf2) { + kvm_set_sw_gc0_config1(cop0, MIPS_CONF_M); + /* Config2 */ + kvm_save_gc0_config2(cop0); + } + if (cpu_guest_has_conf3) { + kvm_set_sw_gc0_config2(cop0, MIPS_CONF_M); + /* Config3 */ + kvm_save_gc0_config3(cop0); + /* architecturally writable (e.g. from guest) */ + kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_ISA_OE); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_MSA | + MIPS_CONF3_BPG | + MIPS_CONF3_ULRI | + MIPS_CONF3_DSP | + MIPS_CONF3_CTXTC | + MIPS_CONF3_ITL | + MIPS_CONF3_LPA | + MIPS_CONF3_VEIC | + MIPS_CONF3_VINT | + MIPS_CONF3_SP | + MIPS_CONF3_CDMM | + MIPS_CONF3_MT | + MIPS_CONF3_SM | + MIPS_CONF3_TL); + } + if (cpu_guest_has_conf4) { + kvm_set_sw_gc0_config3(cop0, MIPS_CONF_M); + /* Config4 */ + kvm_save_gc0_config4(cop0); + } + if (cpu_guest_has_conf5) { + kvm_set_sw_gc0_config4(cop0, MIPS_CONF_M); + /* Config5 */ + kvm_save_gc0_config5(cop0); + /* architecturally writable (e.g. from guest) */ + kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_K | + MIPS_CONF5_CV | + MIPS_CONF5_MSAEN | + MIPS_CONF5_UFE | + MIPS_CONF5_FRE | + MIPS_CONF5_SBRI | + MIPS_CONF5_UFR); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP); + } + + /* start with no pending virtual guest interrupts */ + if (cpu_has_guestctl2) + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0; + + /* Put PC at reset vector */ + vcpu->arch.pc = CKSEG1ADDR(0x1fc00000); + + return 0; +} + +static void kvm_vz_flush_shadow_all(struct kvm *kvm) +{ + if (cpu_has_guestid) { + /* Flush GuestID for each VCPU individually */ + kvm_flush_remote_tlbs(kvm); + } else { + /* + * For each CPU there is a single GPA ASID used by all VCPUs in + * the VM, so it doesn't make sense for the VCPUs to handle + * invalidation of these ASIDs individually. + * + * Instead mark all CPUs as needing ASID invalidation in + * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to + * kick any running VCPUs so they check asid_flush_mask. 
+ */ + cpumask_setall(&kvm->arch.asid_flush_mask); + kvm_flush_remote_tlbs(kvm); + } +} + +static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + kvm_vz_flush_shadow_all(kvm); +} + +static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + int preserve_guest_tlb; + + preserve_guest_tlb = kvm_vz_check_requests(vcpu, cpu); + + if (preserve_guest_tlb) + kvm_vz_vcpu_save_wired(vcpu); + + kvm_vz_vcpu_load_tlb(vcpu, cpu); + + if (preserve_guest_tlb) + kvm_vz_vcpu_load_wired(vcpu); +} + +static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + int r; + + /* Check if we have any exceptions/interrupts pending */ + kvm_mips_deliver_interrupts(vcpu, read_gc0_cause()); + + kvm_vz_check_requests(vcpu, cpu); + kvm_vz_vcpu_load_tlb(vcpu, cpu); + kvm_vz_vcpu_load_wired(vcpu); + + r = vcpu->arch.vcpu_run(run, vcpu); + + kvm_vz_vcpu_save_wired(vcpu); + + return r; +} + +static struct kvm_mips_callbacks kvm_vz_callbacks = { + .handle_cop_unusable = kvm_trap_vz_handle_cop_unusable, + .handle_tlb_mod = kvm_trap_vz_handle_tlb_st_miss, + .handle_tlb_ld_miss = kvm_trap_vz_handle_tlb_ld_miss, + .handle_tlb_st_miss = kvm_trap_vz_handle_tlb_st_miss, + .handle_addr_err_st = kvm_trap_vz_no_handler, + .handle_addr_err_ld = kvm_trap_vz_no_handler, + .handle_syscall = kvm_trap_vz_no_handler, + .handle_res_inst = kvm_trap_vz_no_handler, + .handle_break = kvm_trap_vz_no_handler, + .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, + .handle_guest_exit = kvm_trap_vz_handle_guest_exit, + + .hardware_enable = kvm_vz_hardware_enable, + .hardware_disable = kvm_vz_hardware_disable, + .check_extension = kvm_vz_check_extension, + .vcpu_init = kvm_vz_vcpu_init, + .vcpu_uninit = kvm_vz_vcpu_uninit, + .vcpu_setup = kvm_vz_vcpu_setup, + .flush_shadow_all = kvm_vz_flush_shadow_all, + .flush_shadow_memslot = kvm_vz_flush_shadow_memslot, + .gva_to_gpa = kvm_vz_gva_to_gpa_cb, + .queue_timer_int = kvm_vz_queue_timer_int_cb, + .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb, + .queue_io_int = kvm_vz_queue_io_int_cb, + .dequeue_io_int = kvm_vz_dequeue_io_int_cb, + .irq_deliver = kvm_vz_irq_deliver_cb, + .irq_clear = kvm_vz_irq_clear_cb, + .num_regs = kvm_vz_num_regs, + .copy_reg_indices = kvm_vz_copy_reg_indices, + .get_one_reg = kvm_vz_get_one_reg, + .set_one_reg = kvm_vz_set_one_reg, + .vcpu_load = kvm_vz_vcpu_load, + .vcpu_put = kvm_vz_vcpu_put, + .vcpu_run = kvm_vz_vcpu_run, + .vcpu_reenter = kvm_vz_vcpu_reenter, +}; + +int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) +{ + if (!cpu_has_vz) + return -ENODEV; + + /* + * VZ requires at least 2 KScratch registers, so it should have been + * possible to allocate pgd_reg. + */ + if (WARN(pgd_reg == -1, + "pgd_reg not allocated even though cpu_has_vz\n")) + return -ENODEV; + + pr_info("Starting KVM with MIPS VZ extensions\n"); + + *install_callbacks = &kvm_vz_callbacks; + return 0; +} From 56f7a4405047353684e0d083d7426594daa5e344 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:32 +0000 Subject: [PATCH 038/227] KVM: MIPS: Add VZ support to build system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the MIPS Virtualization (VZ) ASE to the MIPS KVM build system. 
For now KVM can only be configured for T&E or VZ and not both, but the design of the user facing APIs support the possibility of having both available, so this could change in future. Note that support for various optional guest features (some of which can't be turned off) are implemented in immediately following commits, so although it should now be possible to build VZ support, it may not work yet on your hardware. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/Kconfig | 27 +++++++++++++++++++++++++-- arch/mips/kvm/Makefile | 8 +++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 65067327db122b..50a722dfb236d7 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -26,11 +26,34 @@ config KVM select SRCU ---help--- Support for hosting Guest kernels. - Currently supported on MIPS32 processors. + +choice + prompt "Virtualization mode" + depends on KVM + default KVM_MIPS_TE + +config KVM_MIPS_TE + bool "Trap & Emulate" + ---help--- + Use trap and emulate to virtualize 32-bit guests in user mode. This + does not require any special hardware Virtualization support beyond + standard MIPS32/64 r2 or later, but it does require the guest kernel + to be configured with CONFIG_KVM_GUEST=y so that it resides in the + user address segment. + +config KVM_MIPS_VZ + bool "MIPS Virtualization (VZ) ASE" + ---help--- + Use the MIPS Virtualization (VZ) ASE to virtualize guests. This + supports running unmodified guest kernels (with CONFIG_KVM_GUEST=n), + but requires hardware support. + +endchoice config KVM_MIPS_DYN_TRANS bool "KVM/MIPS: Dynamic binary translation to reduce traps" - depends on KVM + depends on KVM_MIPS_TE + default y ---help--- When running in Trap & Emulate mode patch privileged instructions to reduce the number of traps. diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index e56403c8a3f5f9..45d90f5d5177a0 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -9,9 +9,15 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ - dyntrans.o trap_emul.o fpu.o + fpu.o kvm-objs += hypcall.o kvm-objs += mmu.o +ifdef CONFIG_KVM_MIPS_VZ +kvm-objs += vz.o +else +kvm-objs += dyntrans.o +kvm-objs += trap_emul.o +endif obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o From edc89260d474fd044e77b7039bd2a697dddcac6c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:33 +0000 Subject: [PATCH 039/227] KVM: MIPS/VZ: Support guest CP0_BadInstr[P] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for VZ guest CP0_BadInstr and CP0_BadInstrP registers, as found on most VZ capable cores. These guest registers need context switching, and exposing via the KVM ioctl API when they are present. 
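As a rough illustration only (not part of this patch), userspace could read the new register through the existing KVM_GET_ONE_REG ioctl once it is exposed; the hard-coded id below corresponds to the rd=8, sel=1, 32-bit CP0 encoding described in Documentation/virtual/kvm/api.txt, and the VCPU file descriptor is assumed to have been created elsewhere:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Mirrors the in-kernel MIPS_CP0_32(8, 1) encoding for CP0_BadInstr. */
#define REG_MIPS_CP0_BADINSTR	0x7020000000010041ULL

static int dump_badinstr(int vcpu_fd)
{
	uint32_t badinstr = 0;
	struct kvm_one_reg reg = {
		.id   = REG_MIPS_CP0_BADINSTR,
		.addr = (uintptr_t)&badinstr,
	};

	/* Expected to fail with EINVAL on cores without guest BadInstr. */
	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) {
		perror("KVM_GET_ONE_REG(BadInstr)");
		return -1;
	}
	printf("guest CP0_BadInstr = 0x%08x\n", badinstr);
	return 0;
}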
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 2 ++ arch/mips/include/asm/kvm_host.h | 4 +++ arch/mips/kvm/vz.c | 46 +++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d2827864827fca..5ef4fa1de7d4d5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2079,6 +2079,8 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_WIRED | 32 MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 + MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 + MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 MIPS | KVM_REG_MIPS_CP0_COUNT | 32 MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bc3f9dedaac802..3a9ca33263158e 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -40,6 +40,8 @@ #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) +#define KVM_REG_MIPS_CP0_BADINSTR MIPS_CP0_32(8, 1) +#define KVM_REG_MIPS_CP0_BADINSTRP MIPS_CP0_32(8, 2) #define KVM_REG_MIPS_CP0_COUNT MIPS_CP0_32(9, 0) #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) @@ -669,6 +671,8 @@ __BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) __BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) __BUILD_KVM_RW_HW(hwrena, 32, MIPS_CP0_HWRENA, 0) __BUILD_KVM_RW_HW(badvaddr, l, MIPS_CP0_BAD_VADDR, 0) +__BUILD_KVM_RW_HW(badinstr, 32, MIPS_CP0_BAD_VADDR, 1) +__BUILD_KVM_RW_HW(badinstrp, 32, MIPS_CP0_BAD_VADDR, 2) __BUILD_KVM_RW_SW(count, 32, MIPS_CP0_COUNT, 0) __BUILD_KVM_RW_HW(entryhi, l, MIPS_CP0_TLB_HI, 0) __BUILD_KVM_RW_HW(compare, 32, MIPS_CP0_COMPARE, 0) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index cfed234be1e384..ec909fcd08ce98 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -1221,6 +1221,10 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) ret = ARRAY_SIZE(kvm_vz_get_one_regs); if (cpu_guest_has_userlocal) ++ret; + if (cpu_guest_has_badinstr) + ++ret; + if (cpu_guest_has_badinstrp) + ++ret; ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); return ret; @@ -1242,6 +1246,18 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) return -EFAULT; ++indices; } + if (cpu_guest_has_badinstr) { + index = KVM_REG_MIPS_CP0_BADINSTR; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + if (cpu_guest_has_badinstrp) { + index = KVM_REG_MIPS_CP0_BADINSTRP; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } for (i = 0; i < 6; ++i) { if (!cpu_guest_has_kscr(i + 2)) continue; @@ -1327,6 +1343,16 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_BADVADDR: *v = (long)read_gc0_badvaddr(); break; + case KVM_REG_MIPS_CP0_BADINSTR: + if (!cpu_guest_has_badinstr) + return -EINVAL; + *v = read_gc0_badinstr(); + break; + case KVM_REG_MIPS_CP0_BADINSTRP: + if (!cpu_guest_has_badinstrp) + return -EINVAL; + *v = read_gc0_badinstrp(); + break; case KVM_REG_MIPS_CP0_COUNT: *v = kvm_mips_read_count(vcpu); break; @@ -1472,6 +1498,16 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, case 
KVM_REG_MIPS_CP0_BADVADDR: write_gc0_badvaddr(v); break; + case KVM_REG_MIPS_CP0_BADINSTR: + if (!cpu_guest_has_badinstr) + return -EINVAL; + write_gc0_badinstr(v); + break; + case KVM_REG_MIPS_CP0_BADINSTRP: + if (!cpu_guest_has_badinstrp) + return -EINVAL; + write_gc0_badinstrp(v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; @@ -1871,6 +1907,11 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_gc0_kscratch6(cop0); } + if (cpu_guest_has_badinstr) + kvm_restore_gc0_badinstr(cop0); + if (cpu_guest_has_badinstrp) + kvm_restore_gc0_badinstrp(cop0); + /* restore Root.GuestCtl2 from unused Guest guestctl2 register */ if (cpu_has_guestctl2) write_c0_guestctl2( @@ -1945,6 +1986,11 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_gc0_kscratch6(cop0); } + if (cpu_guest_has_badinstr) + kvm_save_gc0_badinstr(cop0); + if (cpu_guest_has_badinstrp) + kvm_save_gc0_badinstrp(cop0); + kvm_vz_save_timer(vcpu); /* save Root.GuestCtl2 in unused Guest guestctl2 register */ From dffe042fd8b2cc174966b929b379503af2c97933 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:34 +0000 Subject: [PATCH 040/227] KVM: MIPS/VZ: Support guest CP0_[X]ContextConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for VZ guest CP0_ContextConfig and CP0_XContextConfig (MIPS64 only) registers, as found on P5600 and P6600 cores. These guest registers need initialising, context switching, and exposing via the KVM ioctl API when they are present. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 2 + arch/mips/include/asm/kvm_host.h | 4 ++ arch/mips/kvm/vz.c | 62 ++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5ef4fa1de7d4d5..5f53bfdc0d84eb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2073,7 +2073,9 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 + MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 MIPS | KVM_REG_MIPS_CP0_WIRED | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 3a9ca33263158e..5066d89f22270d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -34,7 +34,9 @@ #define KVM_REG_MIPS_CP0_ENTRYLO0 MIPS_CP0_64(2, 0) #define KVM_REG_MIPS_CP0_ENTRYLO1 MIPS_CP0_64(3, 0) #define KVM_REG_MIPS_CP0_CONTEXT MIPS_CP0_64(4, 0) +#define KVM_REG_MIPS_CP0_CONTEXTCONFIG MIPS_CP0_32(4, 1) #define KVM_REG_MIPS_CP0_USERLOCAL MIPS_CP0_64(4, 2) +#define KVM_REG_MIPS_CP0_XCONTEXTCONFIG MIPS_CP0_64(4, 3) #define KVM_REG_MIPS_CP0_PAGEMASK MIPS_CP0_32(5, 0) #define KVM_REG_MIPS_CP0_PAGEGRAIN MIPS_CP0_32(5, 1) #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) @@ -665,7 +667,9 @@ __BUILD_KVM_RW_HW(index, 32, MIPS_CP0_TLB_INDEX, 0) __BUILD_KVM_RW_HW(entrylo0, l, MIPS_CP0_TLB_LO0, 0) __BUILD_KVM_RW_HW(entrylo1, l, MIPS_CP0_TLB_LO1, 0) __BUILD_KVM_RW_HW(context, l, MIPS_CP0_TLB_CONTEXT, 0) +__BUILD_KVM_RW_HW(contextconfig, 32, 
MIPS_CP0_TLB_CONTEXT, 1) __BUILD_KVM_RW_HW(userlocal, l, MIPS_CP0_TLB_CONTEXT, 2) +__BUILD_KVM_RW_HW(xcontextconfig, l, MIPS_CP0_TLB_CONTEXT, 3) __BUILD_KVM_RW_HW(pagemask, l, MIPS_CP0_TLB_PG_MASK, 0) __BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) __BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index ec909fcd08ce98..97e7a788bf4a9d 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -131,7 +131,7 @@ static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu) * Config: M, [MT] * Config1: M, [MMUSize-1, C2, MD, PC, WR, CA], FP * Config2: M - * Config3: M, MSAP, [BPG], ULRI, [DSP2P, DSPP, CTXTC, ITL, LPA, VEIC, + * Config3: M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC, * VInt, SP, CDMM, MT, SM, TL] * Config4: M, [VTLBSizeExt, MMUSizeExt] * Config5: [MRP] @@ -161,7 +161,7 @@ static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu) static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu) { unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M | - MIPS_CONF3_ULRI; + MIPS_CONF3_ULRI | MIPS_CONF3_CTXTC; /* Permit MSA to be present if MSA is supported */ if (kvm_mips_guest_can_have_msa(&vcpu->arch)) @@ -1205,6 +1205,13 @@ static u64 kvm_vz_get_one_regs[] = { KVM_REG_MIPS_COUNT_HZ, }; +static u64 kvm_vz_get_one_regs_contextconfig[] = { + KVM_REG_MIPS_CP0_CONTEXTCONFIG, +#ifdef CONFIG_64BIT + KVM_REG_MIPS_CP0_XCONTEXTCONFIG, +#endif +}; + static u64 kvm_vz_get_one_regs_kscratch[] = { KVM_REG_MIPS_CP0_KSCRATCH1, KVM_REG_MIPS_CP0_KSCRATCH2, @@ -1225,6 +1232,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) ++ret; if (cpu_guest_has_badinstrp) ++ret; + if (cpu_guest_has_contextconfig) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); return ret; @@ -1258,6 +1267,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) return -EFAULT; ++indices; } + if (cpu_guest_has_contextconfig) { + if (copy_to_user(indices, kvm_vz_get_one_regs_contextconfig, + sizeof(kvm_vz_get_one_regs_contextconfig))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); + } for (i = 0; i < 6; ++i) { if (!cpu_guest_has_kscr(i + 2)) continue; @@ -1323,11 +1338,23 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_CONTEXT: *v = (long)read_gc0_context(); break; + case KVM_REG_MIPS_CP0_CONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + *v = read_gc0_contextconfig(); + break; case KVM_REG_MIPS_CP0_USERLOCAL: if (!cpu_guest_has_userlocal) return -EINVAL; *v = read_gc0_userlocal(); break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + *v = read_gc0_xcontextconfig(); + break; +#endif case KVM_REG_MIPS_CP0_PAGEMASK: *v = (long)read_gc0_pagemask(); break; @@ -1478,11 +1505,23 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_CONTEXT: write_gc0_context(v); break; + case KVM_REG_MIPS_CP0_CONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + write_gc0_contextconfig(v); + break; case KVM_REG_MIPS_CP0_USERLOCAL: if (!cpu_guest_has_userlocal) return -EINVAL; write_gc0_userlocal(v); break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + write_gc0_xcontextconfig(v); + break; +#endif case KVM_REG_MIPS_CP0_PAGEMASK: write_gc0_pagemask(v); 
break; @@ -1874,8 +1913,12 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_gc0_entrylo0(cop0); kvm_restore_gc0_entrylo1(cop0); kvm_restore_gc0_context(cop0); + if (cpu_guest_has_contextconfig) + kvm_restore_gc0_contextconfig(cop0); #ifdef CONFIG_64BIT kvm_restore_gc0_xcontext(cop0); + if (cpu_guest_has_contextconfig) + kvm_restore_gc0_xcontextconfig(cop0); #endif kvm_restore_gc0_pagemask(cop0); kvm_restore_gc0_pagegrain(cop0); @@ -1933,8 +1976,12 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_gc0_entrylo0(cop0); kvm_save_gc0_entrylo1(cop0); kvm_save_gc0_context(cop0); + if (cpu_guest_has_contextconfig) + kvm_save_gc0_contextconfig(cop0); #ifdef CONFIG_64BIT kvm_save_gc0_xcontext(cop0); + if (cpu_guest_has_contextconfig) + kvm_save_gc0_xcontextconfig(cop0); #endif kvm_save_gc0_pagemask(cop0); kvm_save_gc0_pagegrain(cop0); @@ -2298,6 +2345,17 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP); } + if (cpu_guest_has_contextconfig) { + /* ContextConfig */ + kvm_write_sw_gc0_contextconfig(cop0, 0x007ffff0); +#ifdef CONFIG_64BIT + /* XContextConfig */ + /* bits SEGBITS-13+3:4 set */ + kvm_write_sw_gc0_xcontextconfig(cop0, + ((1ull << (cpu_vmbits - 13)) - 1) << 4); +#endif + } + /* start with no pending virtual guest interrupts */ if (cpu_has_guestctl2) cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0; From 4b7de028e66cc499ab85b5003af944225c8c87e9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:35 +0000 Subject: [PATCH 041/227] KVM: MIPS/VZ: Support guest segmentation control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for VZ guest CP0_SegCtl0, CP0_SegCtl1, and CP0_SegCtl2 registers, as found on P5600 and P6600 cores. These guest registers need initialising, context switching, and exposing via the KVM ioctl API when they are present. They also require the GVA -> GPA translation code for handling a GVA root exception to be updated to interpret the segmentation registers and decode the faulting instruction enough to detect EVA memory access instructions. 
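The translation hunks below key off the 3-bit access mode (AM) field of each segment configuration. As a plain-C illustration of what that mode encodes (a standalone sketch, not code from this patch), the mapped/unmapped decision reduces to the following, which the new is_eva_am_mapped() helper derives more compactly by shifting a magic constant by AM:

#include <stdbool.h>

/* 3-bit segment access modes from the segmentation control architecture. */
enum seg_am {
	SEG_AM_UK	= 0,	/* kernel only, unmapped */
	SEG_AM_MK	= 1,	/* kernel only, mapped */
	SEG_AM_MSK	= 2,	/* supervisor and kernel, mapped */
	SEG_AM_MUSK	= 3,	/* user, supervisor and kernel, mapped */
	SEG_AM_MUSUK	= 4,	/* mapped for user/supervisor, unmapped for kernel */
	SEG_AM_USK	= 5,	/* supervisor and kernel, unmapped */
	SEG_AM_UUSK	= 7,	/* all modes, unmapped */
};

/* TLB mapped in every operating mode (MK, MSK, MUSK)? */
static bool seg_am_always_mapped(enum seg_am am)
{
	return am == SEG_AM_MK || am == SEG_AM_MSK || am == SEG_AM_MUSK;
}

/* TLB mapped only outside kernel mode (MUSUK), hence the EVA special case? */
static bool seg_am_user_only_mapped(enum seg_am am)
{
	return am == SEG_AM_MUSUK;
}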
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 3 + arch/mips/include/asm/kvm_host.h | 6 + arch/mips/kvm/vz.c | 242 +++++++++++++++++++++++++++++- 3 files changed, 250 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5f53bfdc0d84eb..45194363a16048 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2078,6 +2078,9 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 + MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 MIPS | KVM_REG_MIPS_CP0_WIRED | 32 MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5066d89f22270d..b2129c031df77f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -39,6 +39,9 @@ #define KVM_REG_MIPS_CP0_XCONTEXTCONFIG MIPS_CP0_64(4, 3) #define KVM_REG_MIPS_CP0_PAGEMASK MIPS_CP0_32(5, 0) #define KVM_REG_MIPS_CP0_PAGEGRAIN MIPS_CP0_32(5, 1) +#define KVM_REG_MIPS_CP0_SEGCTL0 MIPS_CP0_64(5, 2) +#define KVM_REG_MIPS_CP0_SEGCTL1 MIPS_CP0_64(5, 3) +#define KVM_REG_MIPS_CP0_SEGCTL2 MIPS_CP0_64(5, 4) #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) @@ -672,6 +675,9 @@ __BUILD_KVM_RW_HW(userlocal, l, MIPS_CP0_TLB_CONTEXT, 2) __BUILD_KVM_RW_HW(xcontextconfig, l, MIPS_CP0_TLB_CONTEXT, 3) __BUILD_KVM_RW_HW(pagemask, l, MIPS_CP0_TLB_PG_MASK, 0) __BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) +__BUILD_KVM_RW_HW(segctl0, l, MIPS_CP0_TLB_PG_MASK, 2) +__BUILD_KVM_RW_HW(segctl1, l, MIPS_CP0_TLB_PG_MASK, 3) +__BUILD_KVM_RW_HW(segctl2, l, MIPS_CP0_TLB_PG_MASK, 4) __BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) __BUILD_KVM_RW_HW(hwrena, 32, MIPS_CP0_HWRENA, 0) __BUILD_KVM_RW_HW(badvaddr, l, MIPS_CP0_BAD_VADDR, 0) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 97e7a788bf4a9d..f32c1ab3f724d0 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -411,6 +411,117 @@ static void kvm_vz_save_timer(struct kvm_vcpu *vcpu) kvm_write_sw_gc0_compare(cop0, compare); } +/** + * is_eva_access() - Find whether an instruction is an EVA memory accessor. + * @inst: 32-bit instruction encoding. + * + * Finds whether @inst encodes an EVA memory access instruction, which would + * indicate that emulation of it should access the user mode address space + * instead of the kernel mode address space. This matters for MUSUK segments + * which are TLB mapped for user mode but unmapped for kernel mode. + * + * Returns: Whether @inst encodes an EVA accessor instruction. + */ +static bool is_eva_access(union mips_instruction inst) +{ + if (inst.spec3_format.opcode != spec3_op) + return false; + + switch (inst.spec3_format.func) { + case lwle_op: + case lwre_op: + case cachee_op: + case sbe_op: + case she_op: + case sce_op: + case swe_op: + case swle_op: + case swre_op: + case prefe_op: + case lbue_op: + case lhue_op: + case lbe_op: + case lhe_op: + case lle_op: + case lwe_op: + return true; + default: + return false; + } +} + +/** + * is_eva_am_mapped() - Find whether an access mode is mapped. 
+ * @vcpu: KVM VCPU state. + * @am: 3-bit encoded access mode. + * @eu: Segment becomes unmapped and uncached when Status.ERL=1. + * + * Decode @am to find whether it encodes a mapped segment for the current VCPU + * state. Where necessary @eu and the actual instruction causing the fault are + * taken into account to make the decision. + * + * Returns: Whether the VCPU faulted on a TLB mapped address. + */ +static bool is_eva_am_mapped(struct kvm_vcpu *vcpu, unsigned int am, bool eu) +{ + u32 am_lookup; + int err; + + /* + * Interpret access control mode. We assume address errors will already + * have been caught by the guest, leaving us with: + * AM UM SM KM 31..24 23..16 + * UK 0 000 Unm 0 0 + * MK 1 001 TLB 1 + * MSK 2 010 TLB TLB 1 + * MUSK 3 011 TLB TLB TLB 1 + * MUSUK 4 100 TLB TLB Unm 0 1 + * USK 5 101 Unm Unm 0 0 + * - 6 110 0 0 + * UUSK 7 111 Unm Unm Unm 0 0 + * + * We shift a magic value by AM across the sign bit to find if always + * TLB mapped, and if not shift by 8 again to find if it depends on KM. + */ + am_lookup = 0x70080000 << am; + if ((s32)am_lookup < 0) { + /* + * MK, MSK, MUSK + * Always TLB mapped, unless SegCtl.EU && ERL + */ + if (!eu || !(read_gc0_status() & ST0_ERL)) + return true; + } else { + am_lookup <<= 8; + if ((s32)am_lookup < 0) { + union mips_instruction inst; + unsigned int status; + u32 *opc; + + /* + * MUSUK + * TLB mapped if not in kernel mode + */ + status = read_gc0_status(); + if (!(status & (ST0_EXL | ST0_ERL)) && + (status & ST0_KSU)) + return true; + /* + * EVA access instructions in kernel + * mode access user address space. + */ + opc = (u32 *)vcpu->arch.pc; + if (vcpu->arch.host_cp0_cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (!err && is_eva_access(inst)) + return true; + } + } + + return false; +} + /** * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA. * @vcpu: KVM VCPU state. @@ -427,10 +538,58 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, unsigned long *gpa) { u32 gva32 = gva; + unsigned long segctl; if ((long)gva == (s32)gva32) { /* Handle canonical 32-bit virtual address */ - if ((s32)gva32 < (s32)0xc0000000) { + if (cpu_guest_has_segments) { + unsigned long mask, pa; + + switch (gva32 >> 29) { + case 0: + case 1: /* CFG5 (1GB) */ + segctl = read_gc0_segctl2() >> 16; + mask = (unsigned long)0xfc0000000ull; + break; + case 2: + case 3: /* CFG4 (1GB) */ + segctl = read_gc0_segctl2(); + mask = (unsigned long)0xfc0000000ull; + break; + case 4: /* CFG3 (512MB) */ + segctl = read_gc0_segctl1() >> 16; + mask = (unsigned long)0xfe0000000ull; + break; + case 5: /* CFG2 (512MB) */ + segctl = read_gc0_segctl1(); + mask = (unsigned long)0xfe0000000ull; + break; + case 6: /* CFG1 (512MB) */ + segctl = read_gc0_segctl0() >> 16; + mask = (unsigned long)0xfe0000000ull; + break; + case 7: /* CFG0 (512MB) */ + segctl = read_gc0_segctl0(); + mask = (unsigned long)0xfe0000000ull; + break; + default: + /* + * GCC 4.9 isn't smart enough to figure out that + * segctl and mask are always initialised. 
+ */ + unreachable(); + } + + if (is_eva_am_mapped(vcpu, (segctl >> 4) & 0x7, + segctl & 0x0008)) + goto tlb_mapped; + + /* Unmapped, find guest physical address */ + pa = (segctl << 20) & mask; + pa |= gva32 & ~mask; + *gpa = pa; + return 0; + } else if ((s32)gva32 < (s32)0xc0000000) { /* legacy unmapped KSeg0 or KSeg1 */ *gpa = gva32 & 0x1fffffff; return 0; @@ -438,6 +597,20 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, #ifdef CONFIG_64BIT } else if ((gva & 0xc000000000000000) == 0x8000000000000000) { /* XKPHYS */ + if (cpu_guest_has_segments) { + /* + * Each of the 8 regions can be overridden by SegCtl2.XR + * to use SegCtl1.XAM. + */ + segctl = read_gc0_segctl2(); + if (segctl & (1ull << (56 + ((gva >> 59) & 0x7)))) { + segctl = read_gc0_segctl1(); + if (is_eva_am_mapped(vcpu, (segctl >> 59) & 0x7, + 0)) + goto tlb_mapped; + } + + } /* * Traditionally fully unmapped. * Bits 61:59 specify the CCA, which we can just mask off here. @@ -449,6 +622,7 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, #endif } +tlb_mapped: return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa); } @@ -1212,6 +1386,12 @@ static u64 kvm_vz_get_one_regs_contextconfig[] = { #endif }; +static u64 kvm_vz_get_one_regs_segments[] = { + KVM_REG_MIPS_CP0_SEGCTL0, + KVM_REG_MIPS_CP0_SEGCTL1, + KVM_REG_MIPS_CP0_SEGCTL2, +}; + static u64 kvm_vz_get_one_regs_kscratch[] = { KVM_REG_MIPS_CP0_KSCRATCH1, KVM_REG_MIPS_CP0_KSCRATCH2, @@ -1234,6 +1414,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) ++ret; if (cpu_guest_has_contextconfig) ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); + if (cpu_guest_has_segments) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments); ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); return ret; @@ -1273,6 +1455,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) return -EFAULT; indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); } + if (cpu_guest_has_segments) { + if (copy_to_user(indices, kvm_vz_get_one_regs_segments, + sizeof(kvm_vz_get_one_regs_segments))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments); + } for (i = 0; i < 6; ++i) { if (!cpu_guest_has_kscr(i + 2)) continue; @@ -1361,6 +1549,21 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_PAGEGRAIN: *v = (long)read_gc0_pagegrain(); break; + case KVM_REG_MIPS_CP0_SEGCTL0: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl0(); + break; + case KVM_REG_MIPS_CP0_SEGCTL1: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl1(); + break; + case KVM_REG_MIPS_CP0_SEGCTL2: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl2(); + break; case KVM_REG_MIPS_CP0_WIRED: *v = (long)read_gc0_wired(); break; @@ -1528,6 +1731,21 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_PAGEGRAIN: write_gc0_pagegrain(v); break; + case KVM_REG_MIPS_CP0_SEGCTL0: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl0(v); + break; + case KVM_REG_MIPS_CP0_SEGCTL1: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl1(v); + break; + case KVM_REG_MIPS_CP0_SEGCTL2: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl2(v); + break; case KVM_REG_MIPS_CP0_WIRED: change_gc0_wired(MIPSR6_WIRED_WIRED, v); break; @@ -1955,6 +2173,12 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (cpu_guest_has_badinstrp) kvm_restore_gc0_badinstrp(cop0); + if (cpu_guest_has_segments) { + 
kvm_restore_gc0_segctl0(cop0); + kvm_restore_gc0_segctl1(cop0); + kvm_restore_gc0_segctl2(cop0); + } + /* restore Root.GuestCtl2 from unused Guest guestctl2 register */ if (cpu_has_guestctl2) write_c0_guestctl2( @@ -2038,6 +2262,12 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) if (cpu_guest_has_badinstrp) kvm_save_gc0_badinstrp(cop0); + if (cpu_guest_has_segments) { + kvm_save_gc0_segctl0(cop0); + kvm_save_gc0_segctl1(cop0); + kvm_save_gc0_segctl2(cop0); + } + kvm_vz_save_timer(vcpu); /* save Root.GuestCtl2 in unused Guest guestctl2 register */ @@ -2356,6 +2586,16 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) #endif } + /* Implementation dependent, use the legacy layout */ + if (cpu_guest_has_segments) { + /* SegCtl0, SegCtl1, SegCtl2 */ + kvm_write_sw_gc0_segctl0(cop0, 0x00200010); + kvm_write_sw_gc0_segctl1(cop0, 0x00000002 | + (_page_cachable_default >> _CACHE_SHIFT) << + (16 + MIPS_SEGCFG_C_SHIFT)); + kvm_write_sw_gc0_segctl2(cop0, 0x00380438); + } + /* start with no pending virtual guest interrupts */ if (cpu_has_guestctl2) cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0; From 5a2f352f343ac9b4e869ff29b3a0a833c870303b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:36 +0000 Subject: [PATCH 042/227] KVM: MIPS/VZ: Support guest hardware page table walker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for VZ guest CP0_PWBase, CP0_PWField, CP0_PWSize, and CP0_PWCtl registers for controlling the guest hardware page table walker (HTW) present on P5600 and P6600 cores. These guest registers need initialising on R6, context switching, and exposing via the KVM ioctl API when they are present. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 4 ++ arch/mips/include/asm/kvm_host.h | 8 ++++ arch/mips/kvm/vz.c | 80 +++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 45194363a16048..b108238dc9dcd6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2081,7 +2081,11 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 + MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 + MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 + MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 MIPS | KVM_REG_MIPS_CP0_WIRED | 32 + MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b2129c031df77f..8d016ab3a8b97b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -42,7 +42,11 @@ #define KVM_REG_MIPS_CP0_SEGCTL0 MIPS_CP0_64(5, 2) #define KVM_REG_MIPS_CP0_SEGCTL1 MIPS_CP0_64(5, 3) #define KVM_REG_MIPS_CP0_SEGCTL2 MIPS_CP0_64(5, 4) +#define KVM_REG_MIPS_CP0_PWBASE MIPS_CP0_64(5, 5) +#define KVM_REG_MIPS_CP0_PWFIELD MIPS_CP0_64(5, 6) +#define KVM_REG_MIPS_CP0_PWSIZE MIPS_CP0_64(5, 7) #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) +#define KVM_REG_MIPS_CP0_PWCTL MIPS_CP0_32(6, 6) #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) #define KVM_REG_MIPS_CP0_BADINSTR MIPS_CP0_32(8, 1) @@ -678,7 
+682,11 @@ __BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) __BUILD_KVM_RW_HW(segctl0, l, MIPS_CP0_TLB_PG_MASK, 2) __BUILD_KVM_RW_HW(segctl1, l, MIPS_CP0_TLB_PG_MASK, 3) __BUILD_KVM_RW_HW(segctl2, l, MIPS_CP0_TLB_PG_MASK, 4) +__BUILD_KVM_RW_HW(pwbase, l, MIPS_CP0_TLB_PG_MASK, 5) +__BUILD_KVM_RW_HW(pwfield, l, MIPS_CP0_TLB_PG_MASK, 6) +__BUILD_KVM_RW_HW(pwsize, l, MIPS_CP0_TLB_PG_MASK, 7) __BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) +__BUILD_KVM_RW_HW(pwctl, 32, MIPS_CP0_TLB_WIRED, 6) __BUILD_KVM_RW_HW(hwrena, 32, MIPS_CP0_HWRENA, 0) __BUILD_KVM_RW_HW(badvaddr, l, MIPS_CP0_BAD_VADDR, 0) __BUILD_KVM_RW_HW(badinstr, 32, MIPS_CP0_BAD_VADDR, 1) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index f32c1ab3f724d0..fb12c5b4a75c50 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -1392,6 +1392,13 @@ static u64 kvm_vz_get_one_regs_segments[] = { KVM_REG_MIPS_CP0_SEGCTL2, }; +static u64 kvm_vz_get_one_regs_htw[] = { + KVM_REG_MIPS_CP0_PWBASE, + KVM_REG_MIPS_CP0_PWFIELD, + KVM_REG_MIPS_CP0_PWSIZE, + KVM_REG_MIPS_CP0_PWCTL, +}; + static u64 kvm_vz_get_one_regs_kscratch[] = { KVM_REG_MIPS_CP0_KSCRATCH1, KVM_REG_MIPS_CP0_KSCRATCH2, @@ -1416,6 +1423,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); if (cpu_guest_has_segments) ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments); + if (cpu_guest_has_htw) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw); ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); return ret; @@ -1461,6 +1470,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) return -EFAULT; indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments); } + if (cpu_guest_has_htw) { + if (copy_to_user(indices, kvm_vz_get_one_regs_htw, + sizeof(kvm_vz_get_one_regs_htw))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw); + } for (i = 0; i < 6; ++i) { if (!cpu_guest_has_kscr(i + 2)) continue; @@ -1564,9 +1579,29 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, return -EINVAL; *v = read_gc0_segctl2(); break; + case KVM_REG_MIPS_CP0_PWBASE: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwbase(); + break; + case KVM_REG_MIPS_CP0_PWFIELD: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwfield(); + break; + case KVM_REG_MIPS_CP0_PWSIZE: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwsize(); + break; case KVM_REG_MIPS_CP0_WIRED: *v = (long)read_gc0_wired(); break; + case KVM_REG_MIPS_CP0_PWCTL: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwctl(); + break; case KVM_REG_MIPS_CP0_HWRENA: *v = (long)read_gc0_hwrena(); break; @@ -1746,9 +1781,29 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, return -EINVAL; write_gc0_segctl2(v); break; + case KVM_REG_MIPS_CP0_PWBASE: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwbase(v); + break; + case KVM_REG_MIPS_CP0_PWFIELD: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwfield(v); + break; + case KVM_REG_MIPS_CP0_PWSIZE: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwsize(v); + break; case KVM_REG_MIPS_CP0_WIRED: change_gc0_wired(MIPSR6_WIRED_WIRED, v); break; + case KVM_REG_MIPS_CP0_PWCTL: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwctl(v); + break; case KVM_REG_MIPS_CP0_HWRENA: write_gc0_hwrena(v); break; @@ -2179,6 +2234,14 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_gc0_segctl2(cop0); } + /* restore HTW registers */ + if (cpu_guest_has_htw) { + kvm_restore_gc0_pwbase(cop0); + 
kvm_restore_gc0_pwfield(cop0); + kvm_restore_gc0_pwsize(cop0); + kvm_restore_gc0_pwctl(cop0); + } + /* restore Root.GuestCtl2 from unused Guest guestctl2 register */ if (cpu_has_guestctl2) write_c0_guestctl2( @@ -2268,6 +2331,15 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_gc0_segctl2(cop0); } + /* save HTW registers if enabled in guest */ + if (cpu_guest_has_htw && + kvm_read_sw_gc0_config3(cop0) & MIPS_CONF3_PW) { + kvm_save_gc0_pwbase(cop0); + kvm_save_gc0_pwfield(cop0); + kvm_save_gc0_pwsize(cop0); + kvm_save_gc0_pwctl(cop0); + } + kvm_vz_save_timer(vcpu); /* save Root.GuestCtl2 in unused Guest guestctl2 register */ @@ -2596,6 +2668,14 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) kvm_write_sw_gc0_segctl2(cop0, 0x00380438); } + /* reset HTW registers */ + if (cpu_guest_has_htw && cpu_has_mips_r6) { + /* PWField */ + kvm_write_sw_gc0_pwfield(cop0, 0x0c30c302); + /* PWSize */ + kvm_write_sw_gc0_pwsize(cop0, 1 << MIPS_PWSIZE_PTW_SHIFT); + } + /* start with no pending virtual guest interrupts */ if (cpu_has_guestctl2) cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0; From 273819a6a5ab493500c14889b161bf5bc2265736 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:37 +0000 Subject: [PATCH 043/227] KVM: MIPS/VZ: Support guest load-linked bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When restoring guest state after another VCPU has run, be sure to clear CP0_LLAddr.LLB in order to break any interrupted atomic critical section. Without this SMP guest atomics don't work when LLB is present as one guest can complete the atomic section started by another guest. MIPS VZ guest read of CP0_LLAddr causes Guest Privileged Sensitive Instruction (GPSI) exception due to the address being root physical. Handle this by reporting only the LLB bit, which contains the bit for whether a ll/sc atomic is in progress without any reason for failure. Similarly on P5600 a guest write to CP0_LLAddr also causes a GPSI exception. Handle this also by clearing the guest LLB bit from root mode. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/vz.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index fb12c5b4a75c50..450f946358aeda 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -730,6 +730,13 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, } else if (rd == MIPS_CP0_COMPARE && sel == 0) { /* Compare */ val = read_gc0_compare(); + } else if (rd == MIPS_CP0_LLADDR && + sel == 0) { /* LLAddr */ + if (cpu_guest_has_rw_llb) + val = read_gc0_lladdr() & + MIPS_LLADDR_LLB; + else + val = 0; } else if ((rd == MIPS_CP0_PRID && (sel == 0 || /* PRid */ sel == 2 || /* CDMMBase */ @@ -777,6 +784,15 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, kvm_mips_write_compare(vcpu, vcpu->arch.gprs[rt], true); + } else if (rd == MIPS_CP0_LLADDR && + sel == 0) { /* LLAddr */ + /* + * P5600 generates GPSI on guest MTC0 LLAddr. + * Only allow the guest to clear LLB. 
+ */ + if (cpu_guest_has_rw_llb && + !(val & MIPS_LLADDR_LLB)) + write_gc0_lladdr(0); } else if (rd == MIPS_CP0_ERRCTL && (sel == 0)) { /* ErrCtl */ /* ignore the written value */ @@ -2247,6 +2263,14 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) write_c0_guestctl2( cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]); + /* + * We should clear linked load bit to break interrupted atomics. This + * prevents a SC on the next VCPU from succeeding by matching a LL on + * the previous VCPU. + */ + if (cpu_guest_has_rw_llb) + write_gc0_lladdr(0); + return 0; } From d42a008f86ba3d715d31788fc5143a4de5685d33 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:38 +0000 Subject: [PATCH 044/227] KVM: MIPS/VZ: Emulate MAARs when necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add emulation of Memory Accessibility Attribute Registers (MAARs) when necessary. We can't actually do anything with whatever the guest provides, but it may not be possible to clear Guest.Config5.MRP so we have to emulate at least a pair of MAARs. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 5 ++ arch/mips/include/asm/kvm_host.h | 5 ++ arch/mips/include/uapi/asm/kvm.h | 20 +++++- arch/mips/kvm/trace.h | 2 + arch/mips/kvm/vz.c | 110 +++++++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b108238dc9dcd6..e601c8f01fd92a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2114,6 +2114,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 + MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 MIPS | KVM_REG_MIPS_COUNT_CTL | 64 MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 MIPS | KVM_REG_MIPS_COUNT_HZ | 64 @@ -2180,6 +2181,10 @@ hardware, host kernel, guest, and whether XPA is present in the guest, i.e. with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and the PFNX field starting at bit 30. 
+MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit +patterns: + 0x7030 0000 0001 01 + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 8d016ab3a8b97b..a662a80152b111 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -67,6 +67,7 @@ #define KVM_REG_MIPS_CP0_CONFIG4 MIPS_CP0_32(16, 4) #define KVM_REG_MIPS_CP0_CONFIG5 MIPS_CP0_32(16, 5) #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) +#define KVM_REG_MIPS_CP0_MAARI MIPS_CP0_64(17, 2) #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) #define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2) @@ -388,6 +389,9 @@ struct kvm_vcpu_arch { struct kvm_mips_tlb *wired_tlb; unsigned int wired_tlb_limit; unsigned int wired_tlb_used; + + /* emulated guest MAAR registers */ + unsigned long maar[6]; #endif /* Last CPU the VCPU state was loaded on */ @@ -708,6 +712,7 @@ __BUILD_KVM_RW_HW(config4, 32, MIPS_CP0_CONFIG, 4) __BUILD_KVM_RW_HW(config5, 32, MIPS_CP0_CONFIG, 5) __BUILD_KVM_RW_HW(config6, 32, MIPS_CP0_CONFIG, 6) __BUILD_KVM_RW_HW(config7, 32, MIPS_CP0_CONFIG, 7) +__BUILD_KVM_RW_SW(maari, l, MIPS_CP0_LLADDR, 2) __BUILD_KVM_RW_HW(xcontext, l, MIPS_CP0_TLB_XCONTEXT, 0) __BUILD_KVM_RW_HW(errorepc, l, MIPS_CP0_ERROR_PC, 0) __BUILD_KVM_RW_HW(kscratch1, l, MIPS_CP0_DESAVE, 2) diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index a8a0199bf7601a..3107095d7f0a46 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -54,9 +54,14 @@ struct kvm_fpu { * Register set = 0: GP registers from kvm_regs (see definitions below). * * Register set = 1: CP0 registers. - * bits[15..8] - Must be zero. - * bits[7..3] - Register 'rd' index. - * bits[2..0] - Register 'sel' index. + * bits[15..8] - COP0 register set. + * + * COP0 register set = 0: Main CP0 registers. + * bits[7..3] - Register 'rd' index. + * bits[2..0] - Register 'sel' index. + * + * COP0 register set = 1: MAARs. + * bits[7..0] - MAAR index. * * Register set = 2: KVM specific registers (see definitions below). * @@ -114,6 +119,15 @@ struct kvm_fpu { #define KVM_REG_MIPS_PC (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34) +/* + * KVM_REG_MIPS_CP0 - Coprocessor 0 registers. + */ + +#define KVM_REG_MIPS_MAAR (KVM_REG_MIPS_CP0 | (1 << 8)) +#define KVM_REG_MIPS_CP0_MAAR(n) (KVM_REG_MIPS_MAAR | \ + KVM_REG_SIZE_U64 | (n)) + + /* * KVM_REG_MIPS_KVM - KVM specific control registers. 
*/ diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index d80d37a1b82eb8..affde8a2c584ba 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -176,6 +176,8 @@ TRACE_EVENT(kvm_exit, { KVM_TRACE_COP0(16, 4), "Config4" }, \ { KVM_TRACE_COP0(16, 5), "Config5" }, \ { KVM_TRACE_COP0(16, 7), "Config7" }, \ + { KVM_TRACE_COP0(17, 1), "MAAR" }, \ + { KVM_TRACE_COP0(17, 2), "MAARI" }, \ { KVM_TRACE_COP0(26, 0), "ECC" }, \ { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \ { KVM_TRACE_COP0(31, 2), "KScratch1" }, \ diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 450f946358aeda..cbc6850cff021f 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -134,7 +134,7 @@ static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu) * Config3: M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC, * VInt, SP, CDMM, MT, SM, TL] * Config4: M, [VTLBSizeExt, MMUSizeExt] - * Config5: [MRP] + * Config5: MRP */ static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu) @@ -177,7 +177,7 @@ static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu) static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu) { - return kvm_vz_config5_guest_wrmask(vcpu); + return kvm_vz_config5_guest_wrmask(vcpu) | MIPS_CONF5_MRP; } static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva) @@ -685,6 +685,41 @@ static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu) return RESUME_HOST; } +static unsigned long mips_process_maar(unsigned int op, unsigned long val) +{ + /* Mask off unused bits */ + unsigned long mask = 0xfffff000 | MIPS_MAAR_S | MIPS_MAAR_VL; + + if (read_gc0_pagegrain() & PG_ELPA) + mask |= 0x00ffffff00000000ull; + if (cpu_guest_has_mvh) + mask |= MIPS_MAAR_VH; + + /* Set or clear VH */ + if (op == mtc_op) { + /* clear VH */ + val &= ~MIPS_MAAR_VH; + } else if (op == dmtc_op) { + /* set VH to match VL */ + val &= ~MIPS_MAAR_VH; + if (val & MIPS_MAAR_VL) + val |= MIPS_MAAR_VH; + } + + return val & mask; +} + +static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + val &= MIPS_MAARI_INDEX; + if (val == MIPS_MAARI_INDEX) + kvm_write_sw_gc0_maari(cop0, ARRAY_SIZE(vcpu->arch.maar) - 1); + else if (val < ARRAY_SIZE(vcpu->arch.maar)) + kvm_write_sw_gc0_maari(cop0, val); +} + static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, @@ -737,6 +772,15 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, MIPS_LLADDR_LLB; else val = 0; + } else if (rd == MIPS_CP0_LLADDR && + sel == 1 && /* MAAR */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + /* MAARI must be in range */ + BUG_ON(kvm_read_sw_gc0_maari(cop0) >= + ARRAY_SIZE(vcpu->arch.maar)); + val = vcpu->arch.maar[ + kvm_read_sw_gc0_maari(cop0)]; } else if ((rd == MIPS_CP0_PRID && (sel == 0 || /* PRid */ sel == 2 || /* CDMMBase */ @@ -746,6 +790,10 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, sel == 3)) || /* SRSMap */ (rd == MIPS_CP0_CONFIG && (sel == 7)) || /* Config7 */ + (rd == MIPS_CP0_LLADDR && + (sel == 2) && /* MAARI */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) || (rd == MIPS_CP0_ERRCTL && (sel == 0))) { /* ErrCtl */ val = cop0->reg[rd][sel]; @@ -793,6 +841,23 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, if (cpu_guest_has_rw_llb && !(val & MIPS_LLADDR_LLB)) write_gc0_lladdr(0); + } else if (rd == MIPS_CP0_LLADDR && + sel == 1 && /* MAAR */ + 
cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + val = mips_process_maar(inst.c0r_format.rs, + val); + + /* MAARI must be in range */ + BUG_ON(kvm_read_sw_gc0_maari(cop0) >= + ARRAY_SIZE(vcpu->arch.maar)); + vcpu->arch.maar[kvm_read_sw_gc0_maari(cop0)] = + val; + } else if (rd == MIPS_CP0_LLADDR && + (sel == 2) && /* MAARI */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + kvm_write_maari(vcpu, val); } else if (rd == MIPS_CP0_ERRCTL && (sel == 0)) { /* ErrCtl */ /* ignore the written value */ @@ -1441,6 +1506,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments); if (cpu_guest_has_htw) ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw); + if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) + ret += 1 + ARRAY_SIZE(vcpu->arch.maar); ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); return ret; @@ -1492,6 +1559,19 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) return -EFAULT; indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw); } + if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.maar); ++i) { + index = KVM_REG_MIPS_CP0_MAAR(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + + index = KVM_REG_MIPS_CP0_MAARI; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } for (i = 0; i < 6; ++i) { if (!cpu_guest_has_kscr(i + 2)) continue; @@ -1689,6 +1769,19 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, return -EINVAL; *v = read_gc0_config5(); break; + case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f): + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0); + if (idx >= ARRAY_SIZE(vcpu->arch.maar)) + return -EINVAL; + *v = vcpu->arch.maar[idx]; + break; + case KVM_REG_MIPS_CP0_MAARI: + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + *v = kvm_read_sw_gc0_maari(vcpu->arch.cop0); + break; #ifdef CONFIG_64BIT case KVM_REG_MIPS_CP0_XCONTEXT: *v = read_gc0_xcontext(); @@ -1938,6 +2031,19 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, write_gc0_config5(v); } break; + case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f): + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0); + if (idx >= ARRAY_SIZE(vcpu->arch.maar)) + return -EINVAL; + vcpu->arch.maar[idx] = mips_process_maar(dmtc_op, v); + break; + case KVM_REG_MIPS_CP0_MAARI: + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + kvm_write_maari(vcpu, v); + break; #ifdef CONFIG_64BIT case KVM_REG_MIPS_CP0_XCONTEXT: write_gc0_xcontext(v); From f4474d50c7d483dd4432d5b0891b0bb9ad0eefc9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:39 +0000 Subject: [PATCH 045/227] KVM: MIPS/VZ: Support hardware guest timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Transfer timer state to the VZ guest context (CP0_GTOffset & guest CP0_Count) when entering guest mode, enabling direct guest access to it, and transfer back to soft timer when saving guest register state. This usually allows guest code to directly read CP0_Count (via MFC0 and RDHWR) and read/write CP0_Compare, without trapping to the hypervisor for it to emulate the guest timer. 
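The relationship that makes this work (a minimal sketch, not part of the patch itself; soft_count below is just a stand-in for the CP0_Count value frozen from the hrtimer emulation) is that the guest's Count is the root Count plus CP0_GTOffset, so switching to the hard timer amounts to picking a GTOffset that lines the running root counter up with the frozen soft-timer count:

	u32 soft_count;	/* count frozen from the soft (hrtimer) emulation */

	/* Guest.CP0_Count == Root.CP0_Count + CP0_GTOffset */
	write_c0_gtoffset(soft_count - read_c0_count());
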
Writing to CP0_Count or CP0_Cause.DC is much less common and still triggers a hypervisor GPSI exception, in which case the timer state is transferred back to an hrtimer before emulating the write. We are careful to prevent small amounts of drift from building up due to undeterministic time intervals between reading of the ktime and reading of CP0_Count. Some drift is expected however, since the system clocksource may use a different timer to the local CP0_Count timer used by VZ. This is permitted to prevent guest CP0_Count from appearing to go backwards. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 14 +++ arch/mips/kvm/emulate.c | 81 ++++++++++++- arch/mips/kvm/mips.c | 6 +- arch/mips/kvm/vz.c | 197 +++++++++++++++++++++++++++++-- 4 files changed, 287 insertions(+), 11 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a662a80152b111..5c518c148f9d5a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1068,6 +1068,20 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu); void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); +/* fairly internal functions requiring some care to use */ +int kvm_mips_count_disabled(struct kvm_vcpu *vcpu); +ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count); +int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before, + u32 count, int min_drift); + +#ifdef CONFIG_KVM_MIPS_VZ +void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu); +void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu); +#else +static inline void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) {} +static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {} +#endif + enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index bb0449296cd621..2070864c8e489a 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -308,7 +308,7 @@ int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) * CP0_Cause.DC bit or the count_ctl.DC bit. * 0 otherwise (in which case CP0_Count timer is running). */ -static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) +int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -467,7 +467,7 @@ u32 kvm_mips_read_count(struct kvm_vcpu *vcpu) * * Returns: The ktime at the point of freeze. */ -static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) +ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) { ktime_t now; @@ -516,6 +516,82 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS); } +/** + * kvm_mips_restore_hrtimer() - Restore hrtimer after a gap, updating expiry. + * @vcpu: Virtual CPU. + * @before: Time before Count was saved, lower bound of drift calculation. + * @count: CP0_Count at point of restore. + * @min_drift: Minimum amount of drift permitted before correction. + * Must be <= 0. + * + * Restores the timer from a particular @count, accounting for drift. This can + * be used in conjunction with kvm_mips_freeze_timer() when a hardware timer is + * to be used for a period of time, but the exact ktime corresponding to the + * final Count that must be restored is not known. 
+ * + * It is gauranteed that a timer interrupt immediately after restore will be + * handled, but not if CP0_Compare is exactly at @count. That case should + * already be handled when the hardware timer state is saved. + * + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is not + * stopped). + * + * Returns: Amount of correction to count_bias due to drift. + */ +int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before, + u32 count, int min_drift) +{ + ktime_t now, count_time; + u32 now_count, before_count; + u64 delta; + int drift, ret = 0; + + /* Calculate expected count at before */ + before_count = vcpu->arch.count_bias + + kvm_mips_ktime_to_count(vcpu, before); + + /* + * Detect significantly negative drift, where count is lower than + * expected. Some negative drift is expected when hardware counter is + * set after kvm_mips_freeze_timer(), and it is harmless to allow the + * time to jump forwards a little, within reason. If the drift is too + * significant, adjust the bias to avoid a big Guest.CP0_Count jump. + */ + drift = count - before_count; + if (drift < min_drift) { + count_time = before; + vcpu->arch.count_bias += drift; + ret = drift; + goto resume; + } + + /* Calculate expected count right now */ + now = ktime_get(); + now_count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now); + + /* + * Detect positive drift, where count is higher than expected, and + * adjust the bias to avoid guest time going backwards. + */ + drift = count - now_count; + if (drift > 0) { + count_time = now; + vcpu->arch.count_bias += drift; + ret = drift; + goto resume; + } + + /* Subtract nanosecond delta to find ktime when count was read */ + delta = (u64)(u32)(now_count - count); + delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); + count_time = ktime_sub_ns(now, delta); + +resume: + /* Resume using the calculated ktime */ + kvm_mips_resume_hrtimer(vcpu, count_time, count); + return ret; +} + /** * kvm_mips_write_count() - Modify the count and update timer. * @vcpu: Virtual CPU. @@ -897,6 +973,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) ++vcpu->stat.wait_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT); if (!vcpu->arch.pending_exceptions) { + kvm_vz_lose_htimer(vcpu); vcpu->arch.wait = 1; kvm_vcpu_block(vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1bb5d01d5dec05..e8ddb128b8adc8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1094,7 +1094,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_mips_pending_timer(vcpu); + return kvm_mips_pending_timer(vcpu) || + kvm_read_c0_guest_cause(vcpu->arch.cop0) & C_TI; } int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) @@ -1382,6 +1383,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) skip_emul: local_irq_disable(); + if (ret == RESUME_GUEST) + kvm_vz_acquire_htimer(vcpu); + if (er == EMULATE_DONE && !(ret & RESUME_HOST)) kvm_mips_deliver_interrupts(vcpu, cause); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index cbc6850cff021f..44dad9ae5d0293 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -353,13 +353,38 @@ static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, * VZ guest timer handling. */ +/** + * kvm_vz_should_use_htimer() - Find whether to use the VZ hard guest timer. + * @vcpu: Virtual CPU. 
+ * + * Returns: true if the VZ GTOffset & real guest CP0_Count should be used + * instead of software emulation of guest timer. + * false otherwise. + */ +static bool kvm_vz_should_use_htimer(struct kvm_vcpu *vcpu) +{ + if (kvm_mips_count_disabled(vcpu)) + return false; + + /* Chosen frequency must match real frequency */ + if (mips_hpt_frequency != vcpu->arch.count_hz) + return false; + + /* We don't support a CP0_GTOffset with fewer bits than CP0_Count */ + if (current_cpu_data.gtoffset_mask != 0xffffffff) + return false; + + return true; +} + /** * _kvm_vz_restore_stimer() - Restore soft timer state. * @vcpu: Virtual CPU. * @compare: CP0_Compare register value, restored by caller. * @cause: CP0_Cause register to restore. * - * Restore VZ state relating to the soft timer. + * Restore VZ state relating to the soft timer. The hard timer can be enabled + * later. */ static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare, u32 cause) @@ -375,7 +400,47 @@ static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare, } /** - * kvm_vz_restore_timer() - Restore guest timer state. + * _kvm_vz_restore_htimer() - Restore hard timer state. + * @vcpu: Virtual CPU. + * @compare: CP0_Compare register value, restored by caller. + * @cause: CP0_Cause register to restore. + * + * Restore hard timer Guest.Count & Guest.Cause taking care to preserve the + * value of Guest.CP0_Cause.TI while restoring Guest.CP0_Cause. + */ +static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, + u32 compare, u32 cause) +{ + u32 start_count, after_count; + ktime_t freeze_time; + unsigned long flags; + + /* + * Freeze the soft-timer and sync the guest CP0_Count with it. We do + * this with interrupts disabled to avoid latency. + */ + local_irq_save(flags); + freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + write_c0_gtoffset(start_count - read_c0_count()); + local_irq_restore(flags); + + /* restore guest CP0_Cause, as TI may already be set */ + back_to_back_c0_hazard(); + write_gc0_cause(cause); + + /* + * The above sequence isn't atomic and would result in lost timer + * interrupts if we're not careful. Detect if a timer interrupt is due + * and assert it. + */ + back_to_back_c0_hazard(); + after_count = read_gc0_count(); + if (after_count - start_count > compare - start_count - 1) + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +/** + * kvm_vz_restore_timer() - Restore timer state. * @vcpu: Virtual CPU. * * Restore soft timer state from saved context. @@ -392,25 +457,137 @@ static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu) _kvm_vz_restore_stimer(vcpu, compare, cause); } +/** + * kvm_vz_acquire_htimer() - Switch to hard timer state. + * @vcpu: Virtual CPU. + * + * Restore hard timer state on top of existing soft timer state if possible. + * + * Since hard timer won't remain active over preemption, preemption should be + * disabled by the caller. + */ +void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) +{ + u32 gctl0; + + gctl0 = read_c0_guestctl0(); + if (!(gctl0 & MIPS_GCTL0_GT) && kvm_vz_should_use_htimer(vcpu)) { + /* enable guest access to hard timer */ + write_c0_guestctl0(gctl0 | MIPS_GCTL0_GT); + + _kvm_vz_restore_htimer(vcpu, read_gc0_compare(), + read_gc0_cause()); + } +} + +/** + * _kvm_vz_save_htimer() - Switch to software emulation of guest timer. + * @vcpu: Virtual CPU. + * @compare: Pointer to write compare value to. + * @cause: Pointer to write cause value to. + * + * Save VZ guest timer state and switch to software emulation of guest CP0 + * timer. 
The hard timer must already be in use, so preemption should be + * disabled. + */ +static void _kvm_vz_save_htimer(struct kvm_vcpu *vcpu, + u32 *out_compare, u32 *out_cause) +{ + u32 cause, compare, before_count, end_count; + ktime_t before_time; + + compare = read_gc0_compare(); + *out_compare = compare; + + before_time = ktime_get(); + + /* + * Record the CP0_Count *prior* to saving CP0_Cause, so we have a time + * at which no pending timer interrupt is missing. + */ + before_count = read_gc0_count(); + back_to_back_c0_hazard(); + cause = read_gc0_cause(); + *out_cause = cause; + + /* + * Record a final CP0_Count which we will transfer to the soft-timer. + * This is recorded *after* saving CP0_Cause, so we don't get any timer + * interrupts from just after the final CP0_Count point. + */ + back_to_back_c0_hazard(); + end_count = read_gc0_count(); + + /* + * The above sequence isn't atomic, so we could miss a timer interrupt + * between reading CP0_Cause and end_count. Detect and record any timer + * interrupt due between before_count and end_count. + */ + if (end_count - before_count > compare - before_count - 1) + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); + + /* + * Restore soft-timer, ignoring a small amount of negative drift due to + * delay between freeze_hrtimer and setting CP0_GTOffset. + */ + kvm_mips_restore_hrtimer(vcpu, before_time, end_count, -0x10000); +} + /** * kvm_vz_save_timer() - Save guest timer state. * @vcpu: Virtual CPU. * - * Save VZ guest timer state. + * Save VZ guest timer state and switch to soft guest timer if hard timer was in + * use. */ static void kvm_vz_save_timer(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - u32 compare, cause; + u32 gctl0, compare, cause; - compare = read_gc0_compare(); - cause = read_gc0_cause(); + gctl0 = read_c0_guestctl0(); + if (gctl0 & MIPS_GCTL0_GT) { + /* disable guest use of hard timer */ + write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT); + + /* save hard timer state */ + _kvm_vz_save_htimer(vcpu, &compare, &cause); + } else { + compare = read_gc0_compare(); + cause = read_gc0_cause(); + } /* save timer-related state to VCPU context */ kvm_write_sw_gc0_cause(cop0, cause); kvm_write_sw_gc0_compare(cop0, compare); } +/** + * kvm_vz_lose_htimer() - Ensure hard guest timer is not in use. + * @vcpu: Virtual CPU. + * + * Transfers the state of the hard guest timer to the soft guest timer, leaving + * guest state intact so it can continue to be used with the soft timer. + */ +void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) +{ + u32 gctl0, compare, cause; + + preempt_disable(); + gctl0 = read_c0_guestctl0(); + if (gctl0 & MIPS_GCTL0_GT) { + /* disable guest use of timer */ + write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT); + + /* switch to soft timer */ + _kvm_vz_save_htimer(vcpu, &compare, &cause); + + /* leave soft timer in usable state */ + _kvm_vz_restore_stimer(vcpu, compare, cause); + } + preempt_enable(); +} + /** * is_eva_access() - Find whether an instruction is an EVA memory accessor. * @inst: 32-bit instruction encoding. @@ -826,6 +1003,7 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, if (rd == MIPS_CP0_COUNT && sel == 0) { /* Count */ + kvm_vz_lose_htimer(vcpu); kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); } else if (rd == MIPS_CP0_COMPARE && sel == 0) { /* Compare */ @@ -1090,10 +1268,12 @@ static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc, /* DC bit enabling/disabling timer? 
*/ if (change & CAUSEF_DC) { - if (val & CAUSEF_DC) + if (val & CAUSEF_DC) { + kvm_vz_lose_htimer(vcpu); kvm_mips_count_disable_cause(vcpu); - else + } else { kvm_mips_count_enable_cause(vcpu); + } } /* Only certain bits are RW to the guest */ @@ -2863,6 +3043,7 @@ static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) int cpu = smp_processor_id(); int r; + kvm_vz_acquire_htimer(vcpu); /* Check if we have any exceptions/interrupts pending */ kvm_mips_deliver_interrupts(vcpu, read_gc0_cause()); From edec9d7bdc4eb3845ec7a3f9610f0d54a7152e90 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:40 +0000 Subject: [PATCH 046/227] KVM: MIPS/VZ: Trace guest mode changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a trace event for guest mode changes, and enable VZ's GuestCtl0.MC bit after the trace event is enabled to trap all guest mode changes. The MC bit causes Guest Hardware Field Change (GHFC) exceptions whenever a guest mode change occurs (such as an exception entry or return from exception), so we need to handle this exception now. The MC bit is only enabled when restoring register state, so enabling the trace event won't take immediate effect. Tracing guest mode changes can be particularly handy when trying to work out what a guest OS gets up to before something goes wrong, especially if the problem occurs as a result of some previous guest userland exception which would otherwise be invisible in the trace. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Steven Rostedt Cc: Ingo Molnar Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 13 +++++++++++++ arch/mips/kvm/trace.h | 37 +++++++++++++++++++++++++++++++++++++ arch/mips/kvm/vz.c | 21 +++++++++++++++++++-- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e8ddb128b8adc8..1fc6fef463db15 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -76,6 +76,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { {NULL} }; +bool kvm_trace_guest_mode_change; + +int kvm_guest_mode_change_trace_reg(void) +{ + kvm_trace_guest_mode_change = 1; + return 0; +} + +void kvm_guest_mode_change_trace_unreg(void) +{ + kvm_trace_guest_mode_change = 0; +} + /* * XXXKYMA: We are simulatoring a processor that has the WII bit set in * Config7, so we are "runnable" if interrupts are pending diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index affde8a2c584ba..a8c7fd7bf6d267 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -17,6 +17,13 @@ #define TRACE_INCLUDE_PATH . 
#define TRACE_INCLUDE_FILE trace +/* + * arch/mips/kvm/mips.c + */ +extern bool kvm_trace_guest_mode_change; +int kvm_guest_mode_change_trace_reg(void); +void kvm_guest_mode_change_trace_unreg(void); + /* * Tracepoints for VM enters */ @@ -303,6 +310,36 @@ TRACE_EVENT(kvm_guestid_change, __entry->guestid) ); +TRACE_EVENT_FN(kvm_guest_mode_change, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, epc) + __field(unsigned long, pc) + __field(unsigned long, badvaddr) + __field(unsigned int, status) + __field(unsigned int, cause) + ), + + TP_fast_assign( + __entry->epc = kvm_read_c0_guest_epc(vcpu->arch.cop0); + __entry->pc = vcpu->arch.pc; + __entry->badvaddr = kvm_read_c0_guest_badvaddr(vcpu->arch.cop0); + __entry->status = kvm_read_c0_guest_status(vcpu->arch.cop0); + __entry->cause = kvm_read_c0_guest_cause(vcpu->arch.cop0); + ), + + TP_printk("EPC: 0x%08lx PC: 0x%08lx Status: 0x%08x Cause: 0x%08x BadVAddr: 0x%08lx", + __entry->epc, + __entry->pc, + __entry->status, + __entry->cause, + __entry->badvaddr), + + kvm_guest_mode_change_trace_reg, + kvm_guest_mode_change_trace_unreg +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 44dad9ae5d0293..33bb8c6e1b0526 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -1322,6 +1322,18 @@ static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc, return er; } +static enum emulation_result kvm_trap_vz_handle_ghfc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + /* + * Presumably this is due to MC (guest mode change), so lets trace some + * relevant info. + */ + trace_kvm_guest_mode_change(vcpu); + + return EMULATE_DONE; +} + static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc, struct kvm_vcpu *vcpu) { @@ -1407,8 +1419,7 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) break; case MIPS_GCTL0_GEXC_GHFC: ++vcpu->stat.vz_ghfc_exits; - er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, - vcpu); + er = kvm_trap_vz_handle_ghfc(cause, opc, vcpu); break; case MIPS_GCTL0_GEXC_GPA: ++vcpu->stat.vz_gpa_exits; @@ -2459,6 +2470,12 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ kvm_vz_restore_timer(vcpu); + /* Set MC bit if we want to trace guest mode changes */ + if (kvm_trace_guest_mode_change) + set_c0_guestctl0(MIPS_GCTL0_MC); + else + clear_c0_guestctl0(MIPS_GCTL0_MC); + /* Don't bother restoring registers multiple times unless necessary */ if (!all) return 0; From 7d8a528d19815e57566d1e2277f9535e43f4cb68 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:44 +0000 Subject: [PATCH 047/227] MIPS: Add Octeon III register accessors & definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add accessors for some VZ related Cavium Octeon III specific COP0 registers, along with field definitions. These will mostly be used by KVM to set up interrupt routing and partition the TLB between root and guest. 
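As a rough illustration of the intended use (the real wiring lands later in this series; this sketch is not part of the patch), the VZ host can read CvmVMConfig to find the total MMU size and then shrink the root share to make room for the guest:

	u64 cvmvmconfig = read_c0_cvmvmconfig();
	unsigned int mmu_size, guest_mmu_size;

	/* Total TLB size, then give half of it to the guest */
	mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1)
		    >> CVMVMCONF_MMUSIZEM1_S) + 1;
	guest_mmu_size = mmu_size / 2;

	cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1;
	cvmvmconfig |= (mmu_size - guest_mmu_size) - 1;
	write_c0_cvmvmconfig(cvmvmconfig);
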
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/mipsregs.h | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index c6b8f96b80f963..ebe608d21d7e5d 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -974,6 +974,22 @@ /* Flush FTLB */ #define LOONGSON_DIAG_FTLB (_ULCAST_(1) << 13) +/* CvmCtl register field definitions */ +#define CVMCTL_IPPCI_SHIFT 7 +#define CVMCTL_IPPCI (_U64CAST_(0x7) << CVMCTL_IPPCI_SHIFT) +#define CVMCTL_IPTI_SHIFT 4 +#define CVMCTL_IPTI (_U64CAST_(0x7) << CVMCTL_IPTI_SHIFT) + +/* CvmMemCtl2 register field definitions */ +#define CVMMEMCTL2_INHIBITTS (_U64CAST_(1) << 17) + +/* CvmVMConfig register field definitions */ +#define CVMVMCONF_DGHT (_U64CAST_(1) << 60) +#define CVMVMCONF_MMUSIZEM1_S 12 +#define CVMVMCONF_MMUSIZEM1 (_U64CAST_(0xff) << CVMVMCONF_MMUSIZEM1_S) +#define CVMVMCONF_RMMUSIZEM1_S 0 +#define CVMVMCONF_RMMUSIZEM1 (_U64CAST_(0xff) << CVMVMCONF_RMMUSIZEM1_S) + /* * Coprocessor 1 (FPU) register names */ @@ -1733,6 +1749,13 @@ do { \ #define read_c0_cvmmemctl() __read_64bit_c0_register($11, 7) #define write_c0_cvmmemctl(val) __write_64bit_c0_register($11, 7, val) + +#define read_c0_cvmmemctl2() __read_64bit_c0_register($16, 6) +#define write_c0_cvmmemctl2(val) __write_64bit_c0_register($16, 6, val) + +#define read_c0_cvmvmconfig() __read_64bit_c0_register($16, 7) +#define write_c0_cvmvmconfig(val) __write_64bit_c0_register($16, 7, val) + /* * The cacheerr registers are not standardized. On OCTEON, they are * 64 bits wide. @@ -2106,6 +2129,19 @@ do { \ #define write_gc0_kscratch5(val) __write_ulong_gc0_register(31, 6, val) #define write_gc0_kscratch6(val) __write_ulong_gc0_register(31, 7, val) +/* Cavium OCTEON (cnMIPS) */ +#define read_gc0_cvmcount() __read_ulong_gc0_register(9, 6) +#define write_gc0_cvmcount(val) __write_ulong_gc0_register(9, 6, val) + +#define read_gc0_cvmctl() __read_64bit_gc0_register(9, 7) +#define write_gc0_cvmctl(val) __write_64bit_gc0_register(9, 7, val) + +#define read_gc0_cvmmemctl() __read_64bit_gc0_register(11, 7) +#define write_gc0_cvmmemctl(val) __write_64bit_gc0_register(11, 7, val) + +#define read_gc0_cvmmemctl2() __read_64bit_gc0_register(16, 6) +#define write_gc0_cvmmemctl2(val) __write_64bit_gc0_register(16, 6, val) + /* * Macros to access the floating point coprocessor control registers */ From 4fa9de5a645a9770679032a7eea0604f9a36eaf3 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:45 +0000 Subject: [PATCH 048/227] KVM: MIPS/Emulate: Adapt T&E CACHE emulation for Octeon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cache management is implemented separately for Cavium Octeon CPUs, so r4k_blast_[id]cache aren't available. Instead for Octeon perform a local icache flush using local_flush_icache_range(), and for other platforms which don't use c-r4k.c use __flush_cache_all() / flush_icache_all(). 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 30 +++++++++++++++++++++++++++--- arch/mips/mm/cache.c | 1 + 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 2070864c8e489a..4833ebad89d9b8 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1833,11 +1833,35 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, vcpu->arch.pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base], offset); - if (cache == Cache_D) + if (cache == Cache_D) { +#ifdef CONFIG_CPU_R4K_CACHE_TLB r4k_blast_dcache(); - else if (cache == Cache_I) +#else + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* locally flush icache */ + local_flush_icache_range(0, 0); + break; + default: + __flush_cache_all(); + break; + } +#endif + } else if (cache == Cache_I) { +#ifdef CONFIG_CPU_R4K_CACHE_TLB r4k_blast_icache(); - else { +#else + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* locally flush icache */ + local_flush_icache_range(0, 0); + break; + default: + flush_icache_all(); + break; + } +#endif + } else { kvm_err("%s: unsupported CACHE INDEX operation\n", __func__); return EMULATE_FAIL; diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 6db3413472023d..899e4627990281 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -24,6 +24,7 @@ /* Cache operations. */ void (*flush_cache_all)(void); void (*__flush_cache_all)(void); +EXPORT_SYMBOL_GPL(__flush_cache_all); void (*flush_cache_mm)(struct mm_struct *mm); void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); From 1c506c9c104cf01d01a9633ad2e76f15f938c54c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:46 +0000 Subject: [PATCH 049/227] KVM: MIPS/TLB: Handle virtually tagged icaches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When TLB entries are invalidated in the presence of a virtually tagged icache, such as that found on Octeon CPUs, flush the icache so that we don't get a reserved instruction exception even though the TLB mapping is removed. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/tlb.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index c215470fdcb0c9..fbab2f74772126 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -185,6 +185,13 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, local_irq_restore(flags); + /* + * We don't want to get reserved instruction exceptions for missing tlb + * entries. + */ + if (cpu_has_vtag_icache) + flush_icache_all(); + if (user && idx_user >= 0) kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n", __func__, (va & VPN2_MASK) | @@ -260,6 +267,13 @@ int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) htw_start(); local_irq_restore(flags); + /* + * We don't want to get reserved instruction exceptions for missing tlb + * entries. 
+ */ + if (cpu_has_vtag_icache) + flush_icache_all(); + if (idx > 0) kvm_debug("%s: Invalidated root entryhi %#lx @ idx %d\n", __func__, (va & VPN2_MASK) | From 867f4da75277aaef10daa4e0e9ad6f905fa33fb1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:47 +0000 Subject: [PATCH 050/227] KVM: MIPS/T&E: Report correct dcache line size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Octeon CPUs don't report the correct dcache line size in CP0_Config1.DL, so encode the correct value for the guest CP0_Config1.DL based on cpu_dcache_line_size(). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 75ba3c4b7cd57f..a563759fd142c6 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -644,6 +645,13 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* Read the cache characteristics from the host Config1 Register */ config1 = (read_c0_config1() & ~0x7f); + /* DCache line size not correctly reported in Config1 on Octeon CPUs */ + if (cpu_dcache_line_size()) { + config1 &= ~MIPS_CONF1_DL; + config1 |= ((ilog2(cpu_dcache_line_size()) - 1) << + MIPS_CONF1_DL_SHF) & MIPS_CONF1_DL; + } + /* Set up MMU size */ config1 &= ~(0x3f << 25); config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25); From 824533ad169f8bafcafba385a428e5b680928411 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:48 +0000 Subject: [PATCH 051/227] KVM: MIPS/VZ: VZ hardware setup for Octeon III MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set up hardware virtualisation on Octeon III cores, configuring guest interrupt routing and carving out half of the root TLB for guest use, restoring it back again afterwards. We need to be careful to inhibit TLB shutdown machine check exceptions while invalidating guest TLB entries, since TLB invalidation is not available so guest entries must be invalidated by setting them to unique unmapped addresses, which could conflict with mappings set by the guest or root if recently repartitioned. 
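A minimal sketch of that inhibit/restore sequence, condensed from the kvm_vz_local_flush_guesttlb_all() hunk below (only the Octeon III path is shown, and error handling is omitted):

	u64 cvmmemctl2 = read_c0_cvmmemctl2();

	/* Inhibit machine checks from multiple matching TLB entries */
	write_c0_cvmmemctl2(cvmmemctl2 | CVMMEMCTL2_INHIBITTS);

	/* ... overwrite guest TLB entries with unique unmapped addresses ... */

	write_c0_cvmmemctl2(cvmmemctl2);
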
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/tlb.c | 16 ++++++ arch/mips/kvm/vz.c | 120 +++++++++++++++++++++++++++++++++----------- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index fbab2f74772126..7c6336dd2638ce 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -447,6 +447,7 @@ void kvm_vz_local_flush_guesttlb_all(void) unsigned long old_entrylo[2]; unsigned long old_pagemask; int entry; + u64 cvmmemctl2 = 0; local_irq_save(flags); @@ -457,6 +458,15 @@ void kvm_vz_local_flush_guesttlb_all(void) old_entrylo[1] = read_gc0_entrylo1(); old_pagemask = read_gc0_pagemask(); + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Inhibit machine check due to multiple matching TLB entries */ + cvmmemctl2 = read_c0_cvmmemctl2(); + cvmmemctl2 |= CVMMEMCTL2_INHIBITTS; + write_c0_cvmmemctl2(cvmmemctl2); + break; + }; + /* Invalidate guest entries in guest TLB */ write_gc0_entrylo0(0); write_gc0_entrylo1(0); @@ -468,6 +478,12 @@ void kvm_vz_local_flush_guesttlb_all(void) mtc0_tlbw_hazard(); guest_tlb_write_indexed(); } + + if (cvmmemctl2) { + cvmmemctl2 &= ~CVMMEMCTL2_INHIBITTS; + write_c0_cvmmemctl2(cvmmemctl2); + }; + write_gc0_index(old_index); write_gc0_entryhi(old_entryhi); write_gc0_entrylo0(old_entrylo[0]); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 33bb8c6e1b0526..21f4495feb152c 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -2747,37 +2747,73 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size) static int kvm_vz_hardware_enable(void) { unsigned int mmu_size, guest_mmu_size, ftlb_size; + u64 guest_cvmctl, cvmvmconfig; + + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Set up guest timer/perfcount IRQ lines */ + guest_cvmctl = read_gc0_cvmctl(); + guest_cvmctl &= ~CVMCTL_IPTI; + guest_cvmctl |= 7ull << CVMCTL_IPTI_SHIFT; + guest_cvmctl &= ~CVMCTL_IPPCI; + guest_cvmctl |= 6ull << CVMCTL_IPPCI_SHIFT; + write_gc0_cvmctl(guest_cvmctl); + + cvmvmconfig = read_c0_cvmvmconfig(); + /* No I/O hole translation. */ + cvmvmconfig |= CVMVMCONF_DGHT; + /* Halve the root MMU size */ + mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1) + >> CVMVMCONF_MMUSIZEM1_S) + 1; + guest_mmu_size = mmu_size / 2; + mmu_size -= guest_mmu_size; + cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1; + cvmvmconfig |= mmu_size - 1; + write_c0_cvmvmconfig(cvmvmconfig); + + /* Update our records */ + current_cpu_data.tlbsize = mmu_size; + current_cpu_data.tlbsizevtlb = mmu_size; + current_cpu_data.guest.tlbsize = guest_mmu_size; + + /* Flush moved entries in new (guest) context */ + kvm_vz_local_flush_guesttlb_all(); + break; + default: + /* + * ImgTec cores tend to use a shared root/guest TLB. To avoid + * overlap of root wired and guest entries, the guest TLB may + * need resizing. + */ + mmu_size = current_cpu_data.tlbsizevtlb; + ftlb_size = current_cpu_data.tlbsize - mmu_size; - /* - * ImgTec cores tend to use a shared root/guest TLB. To avoid overlap of - * root wired and guest entries, the guest TLB may need resizing. 
- */ - mmu_size = current_cpu_data.tlbsizevtlb; - ftlb_size = current_cpu_data.tlbsize - mmu_size; - - /* Try switching to maximum guest VTLB size for flush */ - guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size); - current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; - kvm_vz_local_flush_guesttlb_all(); + /* Try switching to maximum guest VTLB size for flush */ + guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + kvm_vz_local_flush_guesttlb_all(); - /* - * Reduce to make space for root wired entries and at least 2 root - * non-wired entries. This does assume that long-term wired entries - * won't be added later. - */ - guest_mmu_size = mmu_size - num_wired_entries() - 2; - guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size); - current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + /* + * Reduce to make space for root wired entries and at least 2 + * root non-wired entries. This does assume that long-term wired + * entries won't be added later. + */ + guest_mmu_size = mmu_size - num_wired_entries() - 2; + guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; - /* - * Write the VTLB size, but if another CPU has already written, check it - * matches or we won't provide a consistent view to the guest. If this - * ever happens it suggests an asymmetric number of wired entries. - */ - if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) && - WARN(guest_mmu_size != kvm_vz_guest_vtlb_size, - "Available guest VTLB size mismatch")) - return -EINVAL; + /* + * Write the VTLB size, but if another CPU has already written, + * check it matches or we won't provide a consistent view to the + * guest. If this ever happens it suggests an asymmetric number + * of wired entries. + */ + if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) && + WARN(guest_mmu_size != kvm_vz_guest_vtlb_size, + "Available guest VTLB size mismatch")) + return -EINVAL; + break; + } /* * Enable virtualization features granting guest direct control of @@ -2814,8 +2850,36 @@ static int kvm_vz_hardware_enable(void) static void kvm_vz_hardware_disable(void) { + u64 cvmvmconfig; + unsigned int mmu_size; + + /* Flush any remaining guest TLB entries */ kvm_vz_local_flush_guesttlb_all(); + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* + * Allocate whole TLB for root. Existing guest TLB entries will + * change ownership to the root TLB. We should be safe though as + * they've already been flushed above while in guest TLB. 
+ */ + cvmvmconfig = read_c0_cvmvmconfig(); + mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1) + >> CVMVMCONF_MMUSIZEM1_S) + 1; + cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1; + cvmvmconfig |= mmu_size - 1; + write_c0_cvmvmconfig(cvmvmconfig); + + /* Update our records */ + current_cpu_data.tlbsize = mmu_size; + current_cpu_data.tlbsizevtlb = mmu_size; + current_cpu_data.guest.tlbsize = 0; + + /* Flush moved entries in new (root) context */ + local_flush_tlb_all(); + break; + } + if (cpu_has_guestid) { write_c0_guestctl1(0); kvm_vz_local_flush_roottlb_all_guests(); From 3ba731daf09a2dd9515713e2428ef859bb50957b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:49 +0000 Subject: [PATCH 052/227] KVM: MIPS/VZ: Emulate hit CACHE ops for Octeon III MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Octeon III doesn't implement the optional GuestCtl0.CG bit to allow guest mode to execute virtual address based CACHE instructions, so implement emulation of a few important ones specifically for Octeon III in response to a GPSI exception. Currently the main reason to perform these operations is for icache synchronisation, so they are implemented as a simple icache flush with local_flush_icache_range(). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/vz.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 21f4495feb152c..5c495277bf4401 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -1105,6 +1105,17 @@ static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst, case Index_Writeback_Inv_D: flush_dcache_line_indexed(va); return EMULATE_DONE; + case Hit_Invalidate_I: + case Hit_Invalidate_D: + case Hit_Writeback_Inv_D: + if (boot_cpu_type() == CPU_CAVIUM_OCTEON3) { + /* We can just flush entire icache */ + local_flush_icache_range(0, 0); + return EMULATE_DONE; + } + + /* So far, other platforms support guest hit cache ops */ + break; default: break; }; From 1f48f9be4df2fd82e3bea0c4d452299c9b3df792 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:50 +0000 Subject: [PATCH 053/227] KVM: MIPS/VZ: Handle Octeon III guest.PRid register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Octeon III implements a read-only guest CP0_PRid register, so add cases to the KVM register access API for Octeon to ensure the correct value is read and writes are ignored. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/mipsregs.h | 2 ++ arch/mips/kvm/vz.c | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index ebe608d21d7e5d..6875b69f59f763 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -2025,6 +2025,8 @@ do { \ #define read_gc0_epc() __read_ulong_gc0_register(14, 0) #define write_gc0_epc(val) __write_ulong_gc0_register(14, 0, val) +#define read_gc0_prid() __read_32bit_gc0_register(15, 0) + #define read_gc0_ebase() __read_32bit_gc0_register(15, 1) #define write_gc0_ebase(val) __write_32bit_gc0_register(15, 1, val) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 5c495277bf4401..71d8856ade64c8 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -1938,7 +1938,15 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, *v = (long)read_gc0_epc(); break; case KVM_REG_MIPS_CP0_PRID: - *v = (long)kvm_read_c0_guest_prid(cop0); + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Octeon III has a read-only guest.PRid */ + *v = read_gc0_prid(); + break; + default: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + }; break; case KVM_REG_MIPS_CP0_EBASE: *v = kvm_vz_read_gc0_ebase(); @@ -2170,7 +2178,14 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, write_gc0_epc(v); break; case KVM_REG_MIPS_CP0_PRID: - kvm_write_c0_guest_prid(cop0, v); + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Octeon III has a guest.PRid, but its read-only */ + break; + default: + kvm_write_c0_guest_prid(cop0, v); + break; + }; break; case KVM_REG_MIPS_CP0_EBASE: kvm_vz_write_gc0_ebase(v); From 0ae3abcda2dc5fe0130e3261da8a89489fff3e0e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:25:51 +0000 Subject: [PATCH 054/227] MIPS: Allow KVM to be enabled on Octeon CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Octeon III has VZ ASE support, so allow KVM to be enabled on Octeon CPUs as it should now be functional. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: David Daney Cc: Andreas Herrmann Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a008a9f03072de..0a4adbc326e6c3 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1687,6 +1687,7 @@ config CPU_CAVIUM_OCTEON select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select MIPS_L1_CACHE_SHIFT_7 + select HAVE_KVM help The Cavium Octeon processor is a highly integrated chip containing many ethernet hardware widgets for networking tasks. The processor From dc44abd6aad22411f7f9890e39fd4753dabf0d03 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 17:00:08 +0000 Subject: [PATCH 055/227] KVM: MIPS/Emulate: Properly implement TLBR for T&E MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Properly implement emulation of the TLBR instruction for Trap & Emulate. This instruction reads the TLB entry pointed at by the CP0_Index register into the other TLB registers, which may have the side effect of changing the current ASID. 
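Condensed, the emulation added below boils down to the following sketch (the full handler also clamps an out-of-range index and deals with the ASID change):

	struct kvm_mips_tlb *tlb;

	tlb = &vcpu->arch.guest_tlb[kvm_read_c0_guest_index(cop0)];
	kvm_write_c0_guest_pagemask(cop0, tlb->tlb_mask);
	kvm_write_c0_guest_entrylo0(cop0, tlb->tlb_lo[0]);
	kvm_write_c0_guest_entrylo1(cop0, tlb->tlb_lo[1]);
	kvm_mips_change_entryhi(vcpu, tlb->tlb_hi);
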
Therefore abstract the CP0_EntryHi and ASID changing code into a common function in the process. A comment indicated that Linux doesn't use TLBR, which is true during normal use, however dumping of the TLB does use it (for example with the relatively recent 'x' magic sysrq key), as does a wired TLB entries test case in my KVM tests. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 99 ++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 4833ebad89d9b8..34e78a3ee9d79e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -990,17 +990,62 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) return EMULATE_DONE; } -/* - * XXXKYMA: Linux doesn't seem to use TLBR, return EMULATE_FAIL for now so that - * we can catch this, if things ever change - */ +static void kvm_mips_change_entryhi(struct kvm_vcpu *vcpu, + unsigned long entryhi) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + int cpu, i; + u32 nasid = entryhi & KVM_ENTRYHI_ASID; + + if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { + trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) & + KVM_ENTRYHI_ASID, nasid); + + /* + * Flush entries from the GVA page tables. + * Guest user page table will get flushed lazily on re-entry to + * guest user if the guest ASID actually changes. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_KERN); + + /* + * Regenerate/invalidate kernel MMU context. + * The user MMU context will be regenerated lazily on re-entry + * to guest user if the guest ASID actually changes. 
+ */ + preempt_disable(); + cpu = smp_processor_id(); + get_new_mmu_context(kern_mm, cpu); + for_each_possible_cpu(i) + if (i != cpu) + cpu_context(i, kern_mm) = 0; + preempt_enable(); + } + kvm_write_c0_guest_entryhi(cop0, entryhi); +} + enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; + struct kvm_mips_tlb *tlb; unsigned long pc = vcpu->arch.pc; + int index; - kvm_err("[%#lx] COP0_TLBR [%d]\n", pc, kvm_read_c0_guest_index(cop0)); - return EMULATE_FAIL; + index = kvm_read_c0_guest_index(cop0); + if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) { + /* UNDEFINED */ + kvm_debug("[%#lx] TLBR Index %#x out of range\n", pc, index); + index &= KVM_MIPS_GUEST_TLB_SIZE - 1; + } + + tlb = &vcpu->arch.guest_tlb[index]; + kvm_write_c0_guest_pagemask(cop0, tlb->tlb_mask); + kvm_write_c0_guest_entrylo0(cop0, tlb->tlb_lo[0]); + kvm_write_c0_guest_entrylo1(cop0, tlb->tlb_lo[1]); + kvm_mips_change_entryhi(vcpu, tlb->tlb_hi); + + return EMULATE_DONE; } /** @@ -1222,11 +1267,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; - int cpu, i; /* * Update PC and hold onto current PC in case there is @@ -1328,44 +1371,8 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, kvm_change_c0_guest_ebase(cop0, 0x1ffff000, vcpu->arch.gprs[rt]); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { - u32 nasid = - vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; - if (((kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID) != nasid)) { - trace_kvm_asid_change(vcpu, - kvm_read_c0_guest_entryhi(cop0) - & KVM_ENTRYHI_ASID, - nasid); - - /* - * Flush entries from the GVA page - * tables. - * Guest user page table will get - * flushed lazily on re-entry to guest - * user if the guest ASID actually - * changes. - */ - kvm_mips_flush_gva_pt(kern_mm->pgd, - KMF_KERN); - - /* - * Regenerate/invalidate kernel MMU - * context. - * The user MMU context will be - * regenerated lazily on re-entry to - * guest user if the guest ASID actually - * changes. - */ - preempt_disable(); - cpu = smp_processor_id(); - get_new_mmu_context(kern_mm, cpu); - for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, kern_mm) = 0; - preempt_enable(); - } - kvm_write_c0_guest_entryhi(cop0, - vcpu->arch.gprs[rt]); + kvm_mips_change_entryhi(vcpu, + vcpu->arch.gprs[rt]); } /* Are we writing to COUNT */ else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { From a0b4e6a0325e325d91901342dd436d917da0ddd6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:00 +0100 Subject: [PATCH 056/227] tools/kvm_stat: hide cursor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running kvm_stat in interactive mode, the cursor appears at the lower left corner, which looks a bit distracting. This patch hides the cursor by turning it invisible. 
Signed-off-by: Stefan Raspl Reviewed-By: Sascha Silbe Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 581278c5848877..8cc83a309e65ee 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -812,6 +812,13 @@ class Tui(object): except: pass + # Hide cursor in extra statement as some monochrome terminals + # might support hiding but not colors. + try: + curses.curs_set(0) + except curses.error: + pass + curses.use_default_colors() return self From 9fc0adfc427a5e3f95f8db8dafabe1eaa3720f4a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:01 +0100 Subject: [PATCH 057/227] tools/kvm_stat: catch curses exceptions only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous version was catching all exceptions, including SIGINT. We only want to catch the curses exceptions here. Signed-off-by: Stefan Raspl Reviewed-by: Janosch Frank Reviewed-by: Sascha Silbe Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 8cc83a309e65ee..ef47ad749f8a88 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -809,7 +809,7 @@ class Tui(object): # return from C start_color() is ignorable. try: curses.start_color() - except: + except curses.error: pass # Hide cursor in extra statement as some monochrome terminals From dadf1e7839243474b691ca4258bfd2a59e628a5e Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:02 +0100 Subject: [PATCH 058/227] tools/kvm_stat: handle SIGINT in log and batch modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SIGINT causes ugly unhandled exceptions in log and batch mode, which we prevent by catching the exceptions accordingly. 
Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index ef47ad749f8a88..14536c059fd772 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -969,12 +969,15 @@ class Tui(object): def batch(stats): """Prints statistics in a key, value format.""" - s = stats.get() - time.sleep(1) - s = stats.get() - for key in sorted(s.keys()): - values = s[key] - print '%-42s%10d%10d' % (key, values[0], values[1]) + try: + s = stats.get() + time.sleep(1) + s = stats.get() + for key in sorted(s.keys()): + values = s[key] + print '%-42s%10d%10d' % (key, values[0], values[1]) + except KeyboardInterrupt: + pass def log(stats): """Prints statistics as reiterating key block, multiple value blocks.""" @@ -991,11 +994,14 @@ def log(stats): line = 0 banner_repeat = 20 while True: - time.sleep(1) - if line % banner_repeat == 0: - banner() - statline() - line += 1 + try: + time.sleep(1) + if line % banner_repeat == 0: + banner() + statline() + line += 1 + except KeyboardInterrupt: + break def get_options(): """Returns processed program arguments.""" From e0ba38765c1d1d670246d6f6c518594aa8e62587 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:03 +0100 Subject: [PATCH 059/227] tools/kvm_stat: fix misc glitches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses - eliminate extra import - missing variable initialization - type redefinition from int to float - passing of int type argument instead of string - a couple of PEP8-reported indentation/formatting glitches - remove unused variable drilldown in class Tui Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 14536c059fd772..231186a773aaf5 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -31,7 +31,6 @@ import resource import struct import re from collections import defaultdict -from time import sleep VMX_EXIT_REASONS = { 'EXCEPTION_NMI': 0, @@ -657,6 +656,7 @@ class DebugfsProvider(object): self._fields = self.get_available_fields() self._pid = 0 self.do_read = True + self.paths = [] def get_available_fields(self): """"Returns a list of available fields. 
@@ -794,7 +794,6 @@ class Tui(object): def __init__(self, stats): self.stats = stats self.screen = None - self.drilldown = False self.update_drilldown() def __enter__(self): @@ -950,11 +949,10 @@ class Tui(object): while True: self.refresh(sleeptime) curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3 + sleeptime = 3.0 try: char = self.screen.getkey() if char == 'x': - self.drilldown = not self.drilldown self.update_drilldown() if char == 'q': break @@ -1064,12 +1062,12 @@ Requirements: help='fields to display (regex)', ) optparser.add_option('-p', '--pid', - action='store', - default=0, - type=int, - dest='pid', - help='restrict statistics to pid', - ) + action='store', + default=0, + type='int', + dest='pid', + help='restrict statistics to pid', + ) (options, _) = optparser.parse_args(sys.argv) return options @@ -1099,8 +1097,8 @@ def check_access(options): "Also ensure, that the kvm modules are loaded.\n") sys.exit(1) - if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints - or not options.debugfs): + if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or + not options.debugfs): sys.stderr.write("Please enable CONFIG_TRACING in your kernel " "when using the option -t (default).\n" "If it is enabled, make {0} readable by the " @@ -1111,7 +1109,7 @@ def check_access(options): sys.stderr.write("Falling back to debugfs statistics!\n") options.debugfs = True - sleep(5) + time.sleep(5) return options From a183606937489ab5ada2215aa8211374a6b26bd3 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:04 +0100 Subject: [PATCH 060/227] tools/kvm_stat: fix trace setup glitch on field updates in TracepointProvider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updating the fields of the TracepointProvider does not propagate changes to the tracepoints. This shows when a pid filter is enabled, whereby subsequent extensions of the fields of the Tracepoint provider (e.g. by toggling drilldown) will not modify the tracepoints as required. To reproduce, select a specific process via interactive command 'p', and enable drilldown via 'x' - none of the fields with the braces will appear although they should. The fix will always leave all available fields in the TracepointProvider enabled. Signed-off-by: Stefan Raspl Based-on-text-by: Janosch Frank Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 231186a773aaf5..6207843b9199a5 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -550,6 +550,7 @@ class TracepointProvider(object): def setup_traces(self): """Creates all event and group objects needed to be able to retrieve data.""" + fields = self.get_available_fields() if self._pid > 0: # Fetch list of all threads of the monitored pid, as qemu # starts a thread for each vcpu. @@ -560,7 +561,7 @@ class TracepointProvider(object): # The constant is needed as a buffer for python libs, std # streams and other files that the script opens. 
- newlim = len(groupids) * len(self._fields) + 50 + newlim = len(groupids) * len(fields) + 50 try: softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -576,7 +577,7 @@ class TracepointProvider(object): for groupid in groupids: group = Group() - for name in self._fields: + for name in fields: tracepoint = name tracefilter = None match = re.match(r'(.*)\((.*)\)', name) From 692c7f6deb553dde2531102cd10ac17ab61438e4 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:05 +0100 Subject: [PATCH 061/227] tools/kvm_stat: full PEP8 compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provides all missing empty lines as required for full PEP compliance. Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 6207843b9199a5..5c4f24879dc4d1 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -224,6 +224,7 @@ IOCTL_NUMBERS = { 'RESET': 0x00002403, } + class Arch(object): """Encapsulates global architecture specific data. @@ -254,12 +255,14 @@ class Arch(object): return ArchX86(SVM_EXIT_REASONS) return + class ArchX86(Arch): def __init__(self, exit_reasons): self.sc_perf_evt_open = 298 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = exit_reasons + class ArchPPC(Arch): def __init__(self): self.sc_perf_evt_open = 319 @@ -274,12 +277,14 @@ class ArchPPC(Arch): self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 self.exit_reasons = {} + class ArchA64(Arch): def __init__(self): self.sc_perf_evt_open = 241 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = AARCH64_EXIT_REASONS + class ArchS390(Arch): def __init__(self): self.sc_perf_evt_open = 331 @@ -341,6 +346,7 @@ def get_filters(): libc = ctypes.CDLL('libc.so.6', use_errno=True) syscall = libc.syscall + class perf_event_attr(ctypes.Structure): """Struct that holds the necessary data to set up a trace event. @@ -369,6 +375,7 @@ class perf_event_attr(ctypes.Structure): self.size = ctypes.sizeof(self) self.read_format = PERF_FORMAT_GROUP + def perf_event_open(attr, pid, cpu, group_fd, flags): """Wrapper for the sys_perf_evt_open() syscall. @@ -394,6 +401,7 @@ PERF_FORMAT_GROUP = 1 << 3 PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' + class Group(object): """Represents a perf event group.""" @@ -426,6 +434,7 @@ class Group(object): struct.unpack(read_format, os.read(self.events[0].fd, length)))) + class Event(object): """Represents a performance event and manages its life cycle.""" def __init__(self, name, group, trace_cpu, trace_pid, trace_point, @@ -509,6 +518,7 @@ class Event(object): """Resets the count of the trace event in the kernel.""" fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) + class TracepointProvider(object): """Data provider for the stats class. @@ -650,6 +660,7 @@ class TracepointProvider(object): ret[name] += val return ret + class DebugfsProvider(object): """Provides data from the files that KVM creates in the kvm debugfs folder.""" @@ -719,6 +730,7 @@ class DebugfsProvider(object): except IOError: return 0 + class Stats(object): """Manages the data providers and the data they provide. 
@@ -790,6 +802,7 @@ class Stats(object): LABEL_WIDTH = 40 NUMBER_WIDTH = 10 + class Tui(object): """Instruments curses to draw a nice text ui.""" def __init__(self, stats): @@ -859,6 +872,7 @@ class Tui(object): len('Current'), 'Current') row = 3 stats = self.stats.get() + def sortkey(x): if stats[x][1]: return (-stats[x][1], -stats[x][0]) @@ -966,6 +980,7 @@ class Tui(object): except curses.error: continue + def batch(stats): """Prints statistics in a key, value format.""" try: @@ -978,13 +993,16 @@ def batch(stats): except KeyboardInterrupt: pass + def log(stats): """Prints statistics as reiterating key block, multiple value blocks.""" keys = sorted(stats.get().iterkeys()) + def banner(): for k in keys: print '%s' % k, print + def statline(): s = stats.get() for k in keys: @@ -1002,6 +1020,7 @@ def log(stats): except KeyboardInterrupt: break + def get_options(): """Returns processed program arguments.""" description_text = """ @@ -1072,6 +1091,7 @@ Requirements: (options, _) = optparser.parse_args(sys.argv) return options + def get_providers(options): """Returns a list of data providers depending on the passed options.""" providers = [] @@ -1085,6 +1105,7 @@ def get_providers(options): return providers + def check_access(options): """Exits if the current user can't access all needed directories.""" if not os.path.exists('/sys/kernel/debug'): @@ -1114,6 +1135,7 @@ def check_access(options): return options + def main(): options = get_options() options = check_access(options) From 184b2d23b057b35fba7fd4049962a897ef0e3f9d Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:06 +0100 Subject: [PATCH 062/227] tools/kvm_stat: reduce perceived idle time on filter updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whenever a user adds a filter, we * redraw the header immediately for a snappy response * print a message indicating to the user that we're busy while the noticeable delay induced by updating all of the stats objects takes place * update the statistics ASAP (i.e. after 0.25s instead of 3s) to be consistent with behavior on startup To do so, we split the Tui's refresh() method to allow for drawing header and stats separately, and trigger a header refresh whenever we are about to do something that takes a while - like updating filters. 
Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 48 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 5c4f24879dc4d1..3e60d93870c027 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -801,6 +801,8 @@ class Stats(object): LABEL_WIDTH = 40 NUMBER_WIDTH = 10 +DELAY_INITIAL = 0.25 +DELAY_REGULAR = 3.0 class Tui(object): @@ -856,13 +858,14 @@ class Tui(object): """Propagates pid selection to stats object.""" self.stats.pid_filter = pid - def refresh(self, sleeptime): - """Refreshes on-screen data.""" + def refresh_header(self, pid=None): + """Refreshes the header.""" + if pid is None: + pid = self.stats.pid_filter self.screen.erase() - if self.stats.pid_filter > 0: + if pid > 0: self.screen.addstr(0, 0, 'kvm statistics - pid {0}' - .format(self.stats.pid_filter), - curses.A_BOLD) + .format(pid), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) self.screen.addstr(2, 1, 'Event') @@ -870,7 +873,13 @@ class Tui(object): len('Total'), 'Total') self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - len('Current'), 'Current') + self.screen.addstr(4, 1, 'Collecting data...') + self.screen.refresh() + + def refresh_body(self, sleeptime): row = 3 + self.screen.move(row, 0) + self.screen.clrtobot() stats = self.stats.get() def sortkey(x): @@ -914,10 +923,12 @@ class Tui(object): regex = self.screen.getstr() curses.noecho() if len(regex) == 0: + self.refresh_header() return try: re.compile(regex) self.stats.fields_filter = regex + self.refresh_header() return except re.error: continue @@ -944,37 +955,38 @@ class Tui(object): try: pid = int(pid) - - if pid == 0: - self.update_pid(pid) - break - else: - if not os.path.isdir(os.path.join('/proc/', str(pid))): - continue - else: - self.update_pid(pid) - break + if pid != 0 and not os.path.isdir(os.path.join('/proc/', + str(pid))): + continue + self.refresh_header(pid) + self.update_pid(pid) + break except ValueError: continue def show_stats(self): """Refreshes the screen and processes user input.""" - sleeptime = 0.25 + sleeptime = DELAY_INITIAL + self.refresh_header() while True: - self.refresh(sleeptime) + self.refresh_body(sleeptime) curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3.0 + sleeptime = DELAY_REGULAR try: char = self.screen.getkey() if char == 'x': + self.refresh_header() self.update_drilldown() + sleeptime = DELAY_INITIAL if char == 'q': break if char == 'f': self.show_filter_selection() + sleeptime = DELAY_INITIAL if char == 'p': self.show_vm_selection() + sleeptime = DELAY_INITIAL except KeyboardInterrupt: break except curses.error: From 1eaa2f9022d55a8d7249c42def8dc4b0d682e142 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:07 +0100 Subject: [PATCH 063/227] tools/kvm_stat: document list of interactive commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apart from the source code, there does not seem to be a place that documents the interactive capabilities of kvm_stat yet. 
Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 7 +++++++ tools/kvm/kvm_stat/kvm_stat.txt | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 3e60d93870c027..aca85086be1e68 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1052,6 +1052,13 @@ Requirements: CAP_SYS_ADMIN and perf events are used. - CAP_SYS_RESOURCE if the hard limit is not high enough to allow the large number of files that are possibly opened. + +Interactive Commands: + f filter by regular expression + p filter by PID + q quit + x toggle reporting of stats for individual child trace events +Press any other key to refresh statistics immediately. """ class PlainHelpFormatter(optparse.IndentedHelpFormatter): diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index b92a153d7115c0..077bcc7e20dc83 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -18,11 +18,27 @@ state transitions such as guest mode entry and exit. This tool is useful for observing guest behavior from the host perspective. Often conclusions about performance or buggy behavior can be drawn from the output. +While running in regular mode, use any of the keys listed in section +'Interactive Commands' below. +Use batch and logging modes for scripting purposes. The set of KVM kernel module trace events may be specific to the kernel version or architecture. It is best to check the KVM kernel module source code for the meaning of events. +INTERACTIVE COMMANDS +-------------------- +[horizontal] +*f*:: filter by regular expression + +*p*:: filter by PID + +*q*:: quit + +*x*:: toggle reporting of stats for child trace events + +Press any other key to refresh statistics immediately. + OPTIONS ------- -1:: From a24e85f6a69f09a9d09a86110a6bb168c60610ef Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:08 +0100 Subject: [PATCH 064/227] tools/kvm_stat: display guest name when using pid filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running kvm_stat with option '-p' to filter per process, display the QEMU guest name next to the pid, if available. Signed-off-by: Stefan Raspl Reviewed-By: Janosch Frank Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 41 +++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index aca85086be1e68..95ffa9a4b0446c 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -320,6 +320,37 @@ def parse_int_list(list_string): return integers +def get_gname_from_pid(pid): + """Returns the guest name for a QEMU process pid. + + Extracts the guest name from the QEMU comma line by processing the '-name' + option. Will also handle names specified out of sequence. + + """ + name = '' + try: + line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0') + parms = line[line.index('-name') + 1].split(',') + while '' in parms: + # commas are escaped (i.e. ',,'), hence e.g. 
'foo,bar' results in + # ['foo', '', 'bar'], which we revert here + idx = parms.index('') + parms[idx - 1] += ',' + parms[idx + 1] + del parms[idx:idx+2] + # the '-name' switch allows for two ways to specify the guest name, + # where the plain name overrides the name specified via 'guest=' + for arg in parms: + if '=' not in arg: + name = arg + break + if arg[:6] == 'guest=': + name = arg[6:] + except (ValueError, IOError, IndexError): + pass + + return name + + def get_online_cpus(): """Returns a list of cpu id integers.""" with open('/sys/devices/system/cpu/online') as cpu_list: @@ -803,6 +834,7 @@ LABEL_WIDTH = 40 NUMBER_WIDTH = 10 DELAY_INITIAL = 0.25 DELAY_REGULAR = 3.0 +MAX_GUEST_NAME_LEN = 48 class Tui(object): @@ -863,9 +895,14 @@ class Tui(object): if pid is None: pid = self.stats.pid_filter self.screen.erase() + gname = get_gname_from_pid(pid) + if gname: + gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' + if len(gname) > MAX_GUEST_NAME_LEN + else gname)) if pid > 0: - self.screen.addstr(0, 0, 'kvm statistics - pid {0}' - .format(pid), curses.A_BOLD) + self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}' + .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) self.screen.addstr(2, 1, 'Event') From be03ea3b77387db36617d71d60ee182a866fb9cd Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:09 +0100 Subject: [PATCH 065/227] tools/kvm_stat: remove pid filter on empty input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve consistency in the interactive dialogue for pid filtering by removing any filters on empty input (in addition to entering 0). Signed-off-by: Stefan Raspl Reviewed-by: Janosch Frank Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 95ffa9a4b0446c..9e9eb983f6fef8 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -991,10 +991,13 @@ class Tui(object): curses.noecho() try: - pid = int(pid) - if pid != 0 and not os.path.isdir(os.path.join('/proc/', - str(pid))): - continue + if len(pid) > 0: + pid = int(pid) + if pid != 0 and not os.path.isdir(os.path.join('/proc/', + str(pid))): + continue + else: + pid = 0 self.refresh_header(pid) self.update_pid(pid) break From 0152c20f0400498774ae56067f8076cef312abc7 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:10 +0100 Subject: [PATCH 066/227] tools/kvm_stat: print error messages on faulty pid filter input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Print helpful messages in case users enter invalid input or invalid pids in the interactive pid filter dialogue. Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 9e9eb983f6fef8..ced0cb908ddb10 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -976,6 +976,7 @@ class Tui(object): Asks for a pid until a valid pid or 0 has been entered. 
""" + msg = '' while True: self.screen.erase() self.screen.addstr(0, 0, @@ -984,6 +985,7 @@ class Tui(object): self.screen.addstr(1, 0, 'This might limit the shown data to the trace ' 'statistics.') + self.screen.addstr(5, 0, msg) curses.echo() self.screen.addstr(3, 0, "Pid [0 or pid]: ") @@ -995,6 +997,7 @@ class Tui(object): pid = int(pid) if pid != 0 and not os.path.isdir(os.path.join('/proc/', str(pid))): + msg = '"' + str(pid) + '": Not a running process' continue else: pid = 0 @@ -1003,6 +1006,7 @@ class Tui(object): break except ValueError: + msg = '"' + str(pid) + '": Not a valid pid' continue def show_stats(self): From 72187dfa8e2686b748ad7485d0ca59ba993ba526 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:11 +0100 Subject: [PATCH 067/227] tools/kvm_stat: display regex when set to non-default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a user defines a regex filter through the interactive command, display the active regex in the header's second line. Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index ced0cb908ddb10..af7071778a0c0a 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -835,6 +835,7 @@ NUMBER_WIDTH = 10 DELAY_INITIAL = 0.25 DELAY_REGULAR = 3.0 MAX_GUEST_NAME_LEN = 48 +MAX_REGEX_LEN = 44 class Tui(object): @@ -905,6 +906,11 @@ class Tui(object): .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + if self.stats.fields_filter and self.stats.fields_filter != '^[^\(]*$': + regex = self.stats.fields_filter + if len(regex) > MAX_REGEX_LEN: + regex = regex[:MAX_REGEX_LEN] + '...' + self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex)) self.screen.addstr(2, 1, 'Event') self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - len('Total'), 'Total') From 645c1728a9d33d78028d93a2ed770f51df0a92c6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:12 +0100 Subject: [PATCH 068/227] tools/kvm_stat: remove regex filter on empty input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Behavior on empty/0 input for regex and pid filtering was inconsistent, as the former would keep the current filter, while the latter would (naturally) remove any pid filtering. Make things consistent by falling back to the default filter on empty input for the regex filter dialogue. Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index af7071778a0c0a..f2a868b696a8cd 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -966,6 +966,7 @@ class Tui(object): regex = self.screen.getstr() curses.noecho() if len(regex) == 0: + self.stats.fields_filter = r'^[^\(]*$' self.refresh_header() return try: From f9ff1087354e5e063b96a291360a8de84bea0bed Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:13 +0100 Subject: [PATCH 069/227] tools/kvm_stat: add option '--guest' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new option '-g'/'--guest' to select a particular process by providing the QEMU guest name. 
Notes: - The logic to figure out the pid corresponding to the guest name might look scary, but works pretty reliably in practice; in the unlikely event that it returns add'l flukes, it will bail out and hint at using '-p' instead, no harm done. - Mixing '-g' and '-p' is possible, and the final instance specified on the command line is the significant one. This is consistent with current behavior for '-p' which, if specified multiple times, also regards the final instance as the significant one. Signed-off-by: Stefan Raspl Reviewed-by: Janosch Frank Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 101 +++++++++++++++++++++++++++++++- tools/kvm/kvm_stat/kvm_stat.txt | 6 ++ 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index f2a868b696a8cd..f263312c9a2999 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -30,6 +30,7 @@ import fcntl import resource import struct import re +import subprocess from collections import defaultdict VMX_EXIT_REASONS = { @@ -320,6 +321,30 @@ def parse_int_list(list_string): return integers +def get_pid_from_gname(gname): + """Fuzzy function to convert guest name to QEMU process pid. + + Returns a list of potential pids, can be empty if no match found. + Throws an exception on processing errors. + + """ + pids = [] + try: + child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'], + stdout=subprocess.PIPE) + except: + raise Exception + for line in child.stdout: + line = line.lstrip().split(' ', 1) + # perform a sanity check before calling the more expensive + # function to possibly extract the guest name + if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]): + pids.append(int(line[0])) + child.stdout.close() + + return pids + + def get_gname_from_pid(pid): """Returns the guest name for a QEMU process pid. @@ -977,7 +1002,7 @@ class Tui(object): except re.error: continue - def show_vm_selection(self): + def show_vm_selection_by_pid(self): """Draws PID selection mask. Asks for a pid until a valid pid or 0 has been entered. @@ -1016,6 +1041,50 @@ class Tui(object): msg = '"' + str(pid) + '": Not a valid pid' continue + def show_vm_selection_by_guest_name(self): + """Draws guest selection mask. + + Asks for a guest name until a valid guest name or '' is entered. 
+ + """ + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, + 'Show statistics for specific guest.', + curses.A_BOLD) + self.screen.addstr(1, 0, + 'This might limit the shown data to the trace ' + 'statistics.') + self.screen.addstr(5, 0, msg) + curses.echo() + self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") + gname = self.screen.getstr() + curses.noecho() + + if not gname: + self.refresh_header(0) + self.update_pid(0) + break + else: + pids = [] + try: + pids = get_pid_from_gname(gname) + except: + msg = '"' + gname + '": Internal error while searching, ' \ + 'use pid filter instead' + continue + if len(pids) == 0: + msg = '"' + gname + '": Not an active guest' + continue + if len(pids) > 1: + msg = '"' + gname + '": Multiple matches found, use pid ' \ + 'filter instead' + continue + self.refresh_header(pids[0]) + self.update_pid(pids[0]) + break + def show_stats(self): """Refreshes the screen and processes user input.""" sleeptime = DELAY_INITIAL @@ -1035,8 +1104,11 @@ class Tui(object): if char == 'f': self.show_filter_selection() sleeptime = DELAY_INITIAL + if char == 'g': + self.show_vm_selection_by_guest_name() + sleeptime = DELAY_INITIAL if char == 'p': - self.show_vm_selection() + self.show_vm_selection_by_pid() sleeptime = DELAY_INITIAL except KeyboardInterrupt: break @@ -1106,6 +1178,7 @@ Requirements: Interactive Commands: f filter by regular expression + g filter by guest name p filter by PID q quit x toggle reporting of stats for individual child trace events @@ -1119,6 +1192,22 @@ Press any other key to refresh statistics immediately. else: return "" + def cb_guest_to_pid(option, opt, val, parser): + try: + pids = get_pid_from_gname(val) + except: + raise optparse.OptionValueError('Error while searching for guest ' + '"{}", use "-p" to specify a pid ' + 'instead'.format(val)) + if len(pids) == 0: + raise optparse.OptionValueError('No guest by the name "{}" ' + 'found'.format(val)) + if len(pids) > 1: + raise optparse.OptionValueError('Multiple processes found (pids: ' + '{}) - use "-p" to specify a pid ' + 'instead'.format(" ".join(pids))) + parser.values.pid = pids[0] + optparser = optparse.OptionParser(description=description_text, formatter=PlainHelpFormatter()) optparser.add_option('-1', '--once', '--batch', @@ -1158,6 +1247,14 @@ Press any other key to refresh statistics immediately. 
dest='pid', help='restrict statistics to pid', ) + optparser.add_option('-g', '--guest', + action='callback', + type='string', + dest='pid', + metavar='GUEST', + help='restrict statistics to guest by name', + callback=cb_guest_to_pid, + ) (options, _) = optparser.parse_args(sys.argv) return options diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index 077bcc7e20dc83..35587c3c26106f 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -31,6 +31,8 @@ INTERACTIVE COMMANDS [horizontal] *f*:: filter by regular expression +*g*:: filter by guest name + *p*:: filter by PID *q*:: quit @@ -62,6 +64,10 @@ OPTIONS --pid=:: limit statistics to one virtual machine (pid) +-g:: +--guest=:: + limit statistics to one virtual machine (guest name) + -f:: --fields=:: fields to display (regex) From 4443084fa0cf85f91d357c8917b90504b784d925 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:14 +0100 Subject: [PATCH 070/227] tools/kvm_stat: add interactive command 'c' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a real simple way to erase any active filter. Signed-off-by: Stefan Raspl Reviewed-by: Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 16 ++++++++++++---- tools/kvm/kvm_stat/kvm_stat.txt | 2 ++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index f263312c9a2999..676a92a4a12ee9 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -861,6 +861,7 @@ DELAY_INITIAL = 0.25 DELAY_REGULAR = 3.0 MAX_GUEST_NAME_LEN = 48 MAX_REGEX_LEN = 44 +DEFAULT_REGEX = r'^[^\(]*$' class Tui(object): @@ -907,9 +908,9 @@ class Tui(object): def update_drilldown(self): """Sets or removes a filter that only allows fields without braces.""" if not self.stats.fields_filter: - self.stats.fields_filter = r'^[^\(]*$' + self.stats.fields_filter = DEFAULT_REGEX - elif self.stats.fields_filter == r'^[^\(]*$': + elif self.stats.fields_filter == DEFAULT_REGEX: self.stats.fields_filter = None def update_pid(self, pid): @@ -931,7 +932,8 @@ class Tui(object): .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) - if self.stats.fields_filter and self.stats.fields_filter != '^[^\(]*$': + if self.stats.fields_filter and self.stats.fields_filter \ + != DEFAULT_REGEX: regex = self.stats.fields_filter if len(regex) > MAX_REGEX_LEN: regex = regex[:MAX_REGEX_LEN] + '...' @@ -991,7 +993,7 @@ class Tui(object): regex = self.screen.getstr() curses.noecho() if len(regex) == 0: - self.stats.fields_filter = r'^[^\(]*$' + self.stats.fields_filter = DEFAULT_REGEX self.refresh_header() return try: @@ -1101,6 +1103,11 @@ class Tui(object): sleeptime = DELAY_INITIAL if char == 'q': break + if char == 'c': + self.stats.fields_filter = DEFAULT_REGEX + self.refresh_header(0) + self.update_pid(0) + sleeptime = DELAY_INITIAL if char == 'f': self.show_filter_selection() sleeptime = DELAY_INITIAL @@ -1177,6 +1184,7 @@ Requirements: the large number of files that are possibly opened. Interactive Commands: + c clear filter f filter by regular expression g filter by guest name p filter by PID diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index 35587c3c26106f..c3ab6a2c8c3765 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -29,6 +29,8 @@ meaning of events. 
INTERACTIVE COMMANDS -------------------- [horizontal] +*c*:: clear filter + *f*:: filter by regular expression *g*:: filter by guest name From 9f114a03c6854f49065dd036c17e1b4bb947f842 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:15 +0100 Subject: [PATCH 071/227] tools/kvm_stat: add interactive command 'r' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide an interactive command to reset the tracepoint statistics. Requires some extra work for debugfs, as the counters cannot be reset. On the up side, this offers us the opportunity to have debugfs values reset on startup and whenever a filter is modified, becoming consistent with the tracepoint provider. As a bonus, 'kvmstat -dt' will now provide useful output, instead of mixing values in totally different orders of magnitude. Furthermore, we avoid unnecessary resets when any of the filters is "changed" interactively to the previous value. Signed-off-by: Stefan Raspl Acked-by: Janosch Frank Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 65 ++++++++++++++++++++++++++------- tools/kvm/kvm_stat/kvm_stat.txt | 2 + 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 676a92a4a12ee9..f8c48f8d07a7db 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -716,15 +716,23 @@ class TracepointProvider(object): ret[name] += val return ret + def reset(self): + """Reset all field counters""" + for group in self.group_leaders: + for event in group.events: + event.reset() + class DebugfsProvider(object): """Provides data from the files that KVM creates in the kvm debugfs folder.""" def __init__(self): self._fields = self.get_available_fields() + self._baseline = {} self._pid = 0 self.do_read = True self.paths = [] + self.reset() def get_available_fields(self): """"Returns a list of available fields. @@ -741,6 +749,7 @@ class DebugfsProvider(object): @fields.setter def fields(self, fields): self._fields = fields + self.reset() @property def pid(self): @@ -758,10 +767,11 @@ class DebugfsProvider(object): self.paths = filter(lambda x: "{}-".format(pid) in x, vms) else: - self.paths = [''] + self.paths = [] self.do_read = True + self.reset() - def read(self): + def read(self, reset=0): """Returns a dict with format:'file name / field -> current value'.""" results = {} @@ -769,10 +779,22 @@ class DebugfsProvider(object): if not self.do_read: return results - for path in self.paths: + paths = self.paths + if self._pid == 0: + paths = [] + for entry in os.walk(PATH_DEBUGFS_KVM): + for dir in entry[1]: + paths.append(dir) + for path in paths: for field in self._fields: - results[field] = results.get(field, 0) \ - + self.read_field(field, path) + value = self.read_field(field, path) + key = path + field + if reset: + self._baseline[key] = value + if self._baseline.get(key, -1) == -1: + self._baseline[key] = value + results[field] = (results.get(field, 0) + value - + self._baseline.get(key, 0)) return results @@ -786,6 +808,11 @@ class DebugfsProvider(object): except IOError: return 0 + def reset(self): + """Reset field counters""" + self._baseline = {} + self.read(1) + class Stats(object): """Manages the data providers and the data they provide. 
@@ -822,14 +849,20 @@ class Stats(object): for provider in self.providers: provider.pid = self._pid_filter + def reset(self): + self.values = {} + for provider in self.providers: + provider.reset() + @property def fields_filter(self): return self._fields_filter @fields_filter.setter def fields_filter(self, fields_filter): - self._fields_filter = fields_filter - self.update_provider_filters() + if fields_filter != self._fields_filter: + self._fields_filter = fields_filter + self.update_provider_filters() @property def pid_filter(self): @@ -837,9 +870,10 @@ class Stats(object): @pid_filter.setter def pid_filter(self, pid): - self._pid_filter = pid - self.values = {} - self.update_provider_pid() + if pid != self._pid_filter: + self._pid_filter = pid + self.values = {} + self.update_provider_pid() def get(self): """Returns a dict with field -> (value, delta to last value) of all @@ -847,11 +881,9 @@ class Stats(object): for provider in self.providers: new = provider.read() for key in provider.fields: - oldval = self.values.get(key, (0, 0)) + oldval = self.values.get(key, (0, 0))[0] newval = new.get(key, 0) - newdelta = None - if oldval is not None: - newdelta = newval - oldval[0] + newdelta = newval - oldval self.values[key] = (newval, newdelta) return self.values @@ -1117,6 +1149,10 @@ class Tui(object): if char == 'p': self.show_vm_selection_by_pid() sleeptime = DELAY_INITIAL + if char == 'r': + self.refresh_header() + self.stats.reset() + sleeptime = DELAY_INITIAL except KeyboardInterrupt: break except curses.error: @@ -1190,6 +1226,7 @@ Interactive Commands: p filter by PID q quit x toggle reporting of stats for individual child trace events + r reset stats Press any other key to refresh statistics immediately. """ diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index c3ab6a2c8c3765..109431bdc63c99 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -39,6 +39,8 @@ INTERACTIVE COMMANDS *q*:: quit +*r*:: reset stats + *x*:: toggle reporting of stats for child trace events Press any other key to refresh statistics immediately. From e55fe3ccccc1efb8f20c99728c8863424ae9ee4a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 10 Mar 2017 13:40:16 +0100 Subject: [PATCH 072/227] tools/kvm_stat: add '%Total' column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add column '%Total' next to 'Total' for easier comparison of numbers between hosts. Signed-off-by: Stefan Raspl Marc Hartmayer Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index f8c48f8d07a7db..8f74ed8e723729 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -973,7 +973,9 @@ class Tui(object): self.screen.addstr(2, 1, 'Event') self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - len('Total'), 'Total') - self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 - + len('%Total'), '%Total') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 - len('Current'), 'Current') self.screen.addstr(4, 1, 'Collecting data...') self.screen.refresh() @@ -989,6 +991,9 @@ class Tui(object): return (-stats[x][1], -stats[x][0]) else: return (0, -stats[x][0]) + total = 0. 
+ for val in stats.values(): + total += val[0] for key in sorted(stats.keys(), key=sortkey): if row >= self.screen.getmaxyx()[0]: @@ -1001,6 +1006,8 @@ class Tui(object): col += LABEL_WIDTH self.screen.addstr(row, col, '%10d' % (values[0],)) col += NUMBER_WIDTH + self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,)) + col += 7 if values[1] is not None: self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) row += 1 From 08fab50da669e5ee5a542592895fcb63be3cd7b1 Mon Sep 17 00:00:00 2001 From: Fei Li Date: Thu, 19 Jan 2017 17:02:26 +0100 Subject: [PATCH 073/227] KVM: s390: interface for suppressible I/O adapters In order to properly implement adapter-interruption suppression, we need a way for userspace to specify which adapters are subject to suppression. Let's convert the existing (and unused) 'pad' field into a 'flags' field and define a flag value for suppressible adapters. Besides, add documentation for the interface. Signed-off-by: Fei Li Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/devices/s390_flic.txt | 12 +++++++++--- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/include/uapi/asm/kvm.h | 4 +++- arch/s390/kvm/interrupt.c | 2 ++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index 6b0e115301c875..e8ee3b6edb05a8 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -64,12 +64,18 @@ struct kvm_s390_io_adapter { __u8 isc; __u8 maskable; __u8 swap; - __u8 pad; + __u8 flags; }; id contains the unique id for the adapter, isc the I/O interruption subclass - to use, maskable whether this adapter may be masked (interrupts turned off) - and swap whether the indicators need to be byte swapped. + to use, maskable whether this adapter may be masked (interrupts turned off), + swap whether the indicators need to be byte swapped, and flags contains + further characteristics of the adapter. + Currently defined values for 'flags' are: + - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS + (adapter-interrupt-suppression) facility. This flag only has an effect if + the AIS capability is enabled. + Unknown flag values are ignored. 
KVM_DEV_FLIC_ADAPTER_MODIFY diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1af090d93bf581..499c72c2280d39 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -621,6 +621,7 @@ struct s390_io_adapter { bool maskable; bool masked; bool swap; + bool suppressible; struct rw_semaphore maps_lock; struct list_head maps; atomic_t nr_maps; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 5bd23cfd9ae5d4..5fa144d1df0a74 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -41,9 +41,11 @@ struct kvm_s390_io_adapter { __u8 isc; __u8 maskable; __u8 swap; - __u8 pad; + __u8 flags; }; +#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01 + #define KVM_S390_IO_ADAPTER_MASK 1 #define KVM_S390_IO_ADAPTER_MAP 2 #define KVM_S390_IO_ADAPTER_UNMAP 3 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 311eef0df85565..dba51ad625708a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1997,6 +1997,8 @@ static int register_io_adapter(struct kvm_device *dev, adapter->maskable = adapter_info.maskable; adapter->masked = false; adapter->swap = adapter_info.swap; + adapter->suppressible = (adapter_info.flags) & + KVM_S390_ADAPTER_SUPPRESSIBLE; dev->kvm->arch.adapters[adapter->id] = adapter; return 0; From 519783935451764b397f2a712de5ea778ff77fdf Mon Sep 17 00:00:00 2001 From: Fei Li Date: Fri, 17 Feb 2017 17:06:26 +0800 Subject: [PATCH 074/227] KVM: s390: introduce ais mode modify function Provide an interface for userspace to modify AIS (adapter-interruption-suppression) mode state, and add documentation for the interface. Allowed target modes are ALL-Interruptions mode and SINGLE-Interruption mode. We introduce the 'simm' and 'nimm' fields in kvm_s390_float_interrupt to store interruption modes for each ISC. Each bit in 'simm' and 'nimm' targets to one ISC, and collaboratively indicate three modes: ALL-Interruptions, SINGLE-Interruption and NO-Interruptions. This interface can initiate most transitions between the states; transition from SINGLE-Interruption to NO-Interruptions via adapter interrupt injection will be introduced in a following patch. The meaningful combinations are as follows: interruption mode | simm bit | nimm bit ------------------|----------|---------- ALL | 0 | 0 SINGLE | 1 | 0 NO | 1 | 1 Besides, add tracepoint to track AIS mode transitions. 
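For illustration only (not part of this patch), a minimal userspace sketch of driving the new group could look like the following. It assumes a FLIC device fd ('flic_fd') obtained via KVM_CREATE_DEVICE and that the AIS capability has already been enabled; the mode constants are kernel-internal in this patch, so the sketch defines them locally:

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* values mirror the (non-uapi) definitions added by this patch */
  #define KVM_S390_AIS_MODE_ALL    0
  #define KVM_S390_AIS_MODE_SINGLE 1

  static int set_ais_mode(int flic_fd, unsigned char isc, unsigned short mode)
  {
          struct kvm_s390_ais_req req = {
                  .isc  = isc,   /* target I/O interruption subclass */
                  .mode = mode,  /* ALL- or SINGLE-Interruption mode */
          };
          struct kvm_device_attr attr = {
                  .group = KVM_DEV_FLIC_AISM,
                  .addr  = (__u64)(unsigned long)&req,
          };

          /* rejected with ENOTSUPP if the AIS capability is not enabled */
          return ioctl(flic_fd, KVM_SET_DEVICE_ATTR, &attr);
  }
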
Co-Authored-By: Yi Min Zhao Signed-off-by: Yi Min Zhao Signed-off-by: Fei Li Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- .../virtual/kvm/devices/s390_flic.txt | 20 +++++++++ arch/s390/include/asm/kvm_host.h | 10 +++++ arch/s390/include/uapi/asm/kvm.h | 6 +++ arch/s390/kvm/interrupt.c | 43 +++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 4 ++ arch/s390/kvm/trace-s390.h | 31 +++++++++++++ 6 files changed, 114 insertions(+) diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index e8ee3b6edb05a8..dfd42fd4abd590 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -14,6 +14,7 @@ FLIC provides support to - purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) - enable/disable for the guest transparent async page faults - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) +- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) Groups: KVM_DEV_FLIC_ENQUEUE @@ -107,6 +108,25 @@ struct kvm_s390_io_adapter_req { release a userspace page for the translated address specified in addr from the list of mappings + KVM_DEV_FLIC_AISM + modify the adapter-interruption-suppression mode for a given isc if the + AIS capability is enabled. Takes a kvm_s390_ais_req describing: + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + + isc contains the target I/O interruption subclass, mode the target + adapter-interruption-suppression mode. The following modes are + currently supported: + - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection + is always allowed; + - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq + injection is only allowed once and the following adapter interrupts + will be suppressed until the mode is set again to ALL-Interruptions + or SINGLE-Interruption mode. + Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on FLIC with an unknown group or attribute gives the error code EINVAL (instead of ENXIO, as specified in the API documentation). It is not possible to conclude diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 499c72c2280d39..552c319483c63d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -521,6 +521,12 @@ struct kvm_s390_local_interrupt { #define FIRQ_CNTR_PFAULT 3 #define FIRQ_MAX_COUNT 4 +/* mask the AIS mode for a given ISC */ +#define AIS_MODE_MASK(isc) (0x80 >> isc) + +#define KVM_S390_AIS_MODE_ALL 0 +#define KVM_S390_AIS_MODE_SINGLE 1 + struct kvm_s390_float_interrupt { unsigned long pending_irqs; spinlock_t lock; @@ -530,6 +536,10 @@ struct kvm_s390_float_interrupt { struct kvm_s390_ext_info srv_signal; int next_rr_cpu; unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + struct mutex ais_lock; + u8 simm; + u8 nimm; + int ais_enabled; }; struct kvm_hw_wp_info_arch { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 5fa144d1df0a74..50d2a927c990d8 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -26,6 +26,7 @@ #define KVM_DEV_FLIC_ADAPTER_REGISTER 6 #define KVM_DEV_FLIC_ADAPTER_MODIFY 7 #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 +#define KVM_DEV_FLIC_AISM 9 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. 
@@ -46,6 +47,11 @@ struct kvm_s390_io_adapter { #define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01 +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + #define KVM_S390_IO_ADAPTER_MASK 1 #define KVM_S390_IO_ADAPTER_MAP 2 #define KVM_S390_IO_ADAPTER_UNMAP 3 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index dba51ad625708a..96b689e48c084c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2152,6 +2152,45 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_req req; + int ret = 0; + + if (!fi->ais_enabled) + return -ENOTSUPP; + + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) + return -EFAULT; + + if (req.isc > MAX_ISC) + return -EINVAL; + + trace_kvm_s390_modify_ais_mode(req.isc, + (fi->simm & AIS_MODE_MASK(req.isc)) ? + (fi->nimm & AIS_MODE_MASK(req.isc)) ? + 2 : KVM_S390_AIS_MODE_SINGLE : + KVM_S390_AIS_MODE_ALL, req.mode); + + mutex_lock(&fi->ais_lock); + switch (req.mode) { + case KVM_S390_AIS_MODE_ALL: + fi->simm &= ~AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + case KVM_S390_AIS_MODE_SINGLE: + fi->simm |= AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&fi->ais_lock); + + return ret; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2188,6 +2227,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_CLEAR_IO_IRQ: r = clear_io_irq(dev->kvm, attr); break; + case KVM_DEV_FLIC_AISM: + r = modify_ais_mode(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2207,6 +2249,7 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_ADAPTER_REGISTER: case KVM_DEV_FLIC_ADAPTER_MODIFY: case KVM_DEV_FLIC_CLEAR_IO_IRQ: + case KVM_DEV_FLIC_AISM: return 0; } return -ENXIO; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f83f18b77f3d32..977cc1660a8386 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1515,6 +1515,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_crypto_init(kvm); + mutex_init(&kvm->arch.float_int.ais_lock); + kvm->arch.float_int.simm = 0; + kvm->arch.float_int.nimm = 0; + kvm->arch.float_int.ais_enabled = 0; spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]); diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 396485bca191ce..b32994d1546a67 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -280,6 +280,37 @@ TRACE_EVENT(kvm_s390_enable_disable_ibs, __entry->state ? "enabling" : "disabling", __entry->id) ); +/* + * Trace point for modifying ais mode for a given isc. + */ +TRACE_EVENT(kvm_s390_modify_ais_mode, + TP_PROTO(__u8 isc, __u16 from, __u16 to), + TP_ARGS(isc, from, to), + + TP_STRUCT__entry( + __field(__u8, isc) + __field(__u16, from) + __field(__u16, to) + ), + + TP_fast_assign( + __entry->isc = isc; + __entry->from = from; + __entry->to = to; + ), + + TP_printk("for isc %x, modifying interruption mode from %s to %s", + __entry->isc, + (__entry->from == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->from == KVM_S390_AIS_MODE_SINGLE) ? 
+ "Single-Interruption Mode" : "No-Interruptions Mode", + (__entry->to == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->to == KVM_S390_AIS_MODE_SINGLE) ? + "Single-Interruption Mode" : "No-Interruptions Mode") + ); + #endif /* _TRACE_KVMS390_H */ From a8920950131b1394f9e99ff57a5cf5ceeb0cc25c Mon Sep 17 00:00:00 2001 From: Yi Min Zhao Date: Mon, 20 Feb 2017 10:15:01 +0800 Subject: [PATCH 075/227] KVM: s390: introduce adapter interrupt inject function Inject adapter interrupts on a specified adapter which allows to retrieve the adapter flags, e.g. if the adapter is subject to AIS facility or not. And add documentation for this interface. For adapters subject to AIS, handle the airq injection suppression for a given ISC according to the interruption mode: - before injection, if NO-Interruptions Mode, just return 0 and suppress, otherwise, allow the injection. - after injection, if SINGLE-Interruption Mode, change it to NO-Interruptions Mode to suppress the following interrupts. Besides, add tracepoint for suppressed airq and AIS mode transitions. Signed-off-by: Yi Min Zhao Signed-off-by: Fei Li Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- .../virtual/kvm/devices/s390_flic.txt | 9 ++++ arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/interrupt.c | 53 ++++++++++++++++--- arch/s390/kvm/trace-s390.h | 21 ++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index dfd42fd4abd590..c2518cea8ab45f 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -15,6 +15,7 @@ FLIC provides support to - enable/disable for the guest transparent async page faults - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) - modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) +- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) Groups: KVM_DEV_FLIC_ENQUEUE @@ -127,6 +128,14 @@ struct kvm_s390_ais_req { will be suppressed until the mode is set again to ALL-Interruptions or SINGLE-Interruption mode. + KVM_DEV_FLIC_AIRQ_INJECT + Inject adapter interrupts on a specified adapter. + attr->attr contains the unique id for the adapter, which allows for + adapter-specific checks and actions. + For adapters subject to AIS, handle the airq injection suppression for + an isc according to the adapter-interruption-suppression mode on condition + that the AIS capability is enabled. + Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on FLIC with an unknown group or attribute gives the error code EINVAL (instead of ENXIO, as specified in the API documentation). It is not possible to conclude diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 50d2a927c990d8..2c9ad251fa3381 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -27,6 +27,7 @@ #define KVM_DEV_FLIC_ADAPTER_MODIFY 7 #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 #define KVM_DEV_FLIC_AISM 9 +#define KVM_DEV_FLIC_AIRQ_INJECT 10 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. 
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 96b689e48c084c..482673e3436d65 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2191,6 +2191,48 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static int kvm_s390_inject_airq(struct kvm *kvm, + struct s390_io_adapter *adapter) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_interrupt s390int = { + .type = KVM_S390_INT_IO(1, 0, 0, 0), + .parm = 0, + .parm64 = (adapter->isc << 27) | 0x80000000, + }; + int ret = 0; + + if (!fi->ais_enabled || !adapter->suppressible) + return kvm_s390_inject_vm(kvm, &s390int); + + mutex_lock(&fi->ais_lock); + if (fi->nimm & AIS_MODE_MASK(adapter->isc)) { + trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc); + goto out; + } + + ret = kvm_s390_inject_vm(kvm, &s390int); + if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) { + fi->nimm |= AIS_MODE_MASK(adapter->isc); + trace_kvm_s390_modify_ais_mode(adapter->isc, + KVM_S390_AIS_MODE_SINGLE, 2); + } +out: + mutex_unlock(&fi->ais_lock); + return ret; +} + +static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) +{ + unsigned int id = attr->attr; + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + + if (!adapter) + return -EINVAL; + + return kvm_s390_inject_airq(kvm, adapter); +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2230,6 +2272,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_AISM: r = modify_ais_mode(dev->kvm, attr); break; + case KVM_DEV_FLIC_AIRQ_INJECT: + r = flic_inject_airq(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2250,6 +2295,7 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_ADAPTER_MODIFY: case KVM_DEV_FLIC_CLEAR_IO_IRQ: case KVM_DEV_FLIC_AISM: + case KVM_DEV_FLIC_AIRQ_INJECT: return 0; } return -ENXIO; @@ -2360,12 +2406,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, ret = adapter_indicators_set(kvm, adapter, &e->adapter); up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { - struct kvm_s390_interrupt s390int = { - .type = KVM_S390_INT_IO(1, 0, 0, 0), - .parm = 0, - .parm64 = (adapter->isc << 27) | 0x80000000, - }; - ret = kvm_s390_inject_vm(kvm, &s390int); + ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) ret = 1; } diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index b32994d1546a67..78b7e847984a6a 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -311,6 +311,27 @@ TRACE_EVENT(kvm_s390_modify_ais_mode, "Single-Interruption Mode" : "No-Interruptions Mode") ); +/* + * Trace point for suppressed adapter I/O interrupt. + */ +TRACE_EVENT(kvm_s390_airq_suppressed, + TP_PROTO(__u32 id, __u8 isc), + TP_ARGS(id, isc), + + TP_STRUCT__entry( + __field(__u32, id) + __field(__u8, isc) + ), + + TP_fast_assign( + __entry->id = id; + __entry->isc = isc; + ), + + TP_printk("adapter I/O interrupt suppressed (id:%x isc:%x)", + __entry->id, __entry->isc) + ); + #endif /* _TRACE_KVMS390_H */ From 47a4693e1d3eb09e523c223753fb5a97721f49b8 Mon Sep 17 00:00:00 2001 From: Yi Min Zhao Date: Fri, 10 Mar 2017 09:29:38 +0100 Subject: [PATCH 076/227] KVM: s390: introduce AIS capability Introduce a cap to enable AIS facility bit, and add documentation for this capability. 
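As a rough illustration (not part of this patch), userspace would flip the capability on the VM fd before creating any VCPUs, roughly like this ('vm_fd' is assumed to come from KVM_CREATE_VM):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int enable_ais(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_S390_AIS,
          };

          /* must run before the first VCPU is created, otherwise -EBUSY */
          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }
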
Signed-off-by: Yi Min Zhao Signed-off-by: Fei Li Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 8 ++++++++ arch/s390/kvm/kvm-s390.c | 15 +++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 24 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 72525085847908..598278cd0dc5d3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4110,6 +4110,14 @@ Returns: 0 on success; -EINVAL if the machine does not support Allows use of guarded storage for the KVM guest. +7.10 KVM_CAP_S390_AIS + +Architectures: s390 +Parameters: none + +Allow use of adapter-interruption suppression. +Returns: 0 on success; -EBUSY if a VCPU has already been created. + 8. Other capabilities. ---------------------- diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 977cc1660a8386..11b7d66389916b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -380,6 +380,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_AIS: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -544,6 +545,20 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_AIS: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else { + set_kvm_facility(kvm->arch.model.fac_mask, 72); + set_kvm_facility(kvm->arch.model.fac_list, 72); + kvm->arch.float_int.ais_enabled = 1; + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: AIS %s", + r ? "(not available)" : "(success)"); + break; case KVM_CAP_S390_GS: r = -EINVAL; mutex_lock(&kvm->lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c9d522765f8f77..33dd2a4e36dc3f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -884,6 +884,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 #define KVM_CAP_S390_GS 137 +#define KVM_CAP_S390_AIS 138 #ifdef KVM_CAP_IRQ_ROUTING From f0b98c02c18a3c76ccf359478f68c9bae81b5f78 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 15 Mar 2017 07:56:11 -0700 Subject: [PATCH 077/227] kvm: vmx: Don't use INVVPID when EPT is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the Intel SDM, volume 3, section 28.3.2: Creating and Using Cached Translation Information, "No linear mappings are used while EPT is in use." INVEPT will invalidate both the guest-physical mappings and the combined mappings in the TLBs and paging-structure caches, so an INVVPID is superfluous. 
Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ee00dbbbd5188..eb776b613d7825 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4017,11 +4017,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } else { + vpid_sync_context(vpid); } } From 1279a6b124e4c93fac161dd3e481a24a423551c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 20 Mar 2017 10:00:08 +0100 Subject: [PATCH 078/227] KVM: nVMX: single function for switching between vmcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's combine it in a single function vmx_switch_vmcs(). Signed-off-by: David Hildenbrand Reviewed-by: Paolo Bonzini Reviewed-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eb776b613d7825..b19055a3106fdd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9145,16 +9145,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - if (vmx->loaded_vmcs == &vmx->vmcs01) + if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -9172,7 +9172,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); BUG_ON(r); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); } @@ -10424,7 +10424,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct loaded_vmcs *vmcs02; - int cpu; u32 msr_entry_idx; u32 exit_qual; @@ -10437,18 +10436,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - + vmx_switch_vmcs(vcpu, vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, exit_qual); return 1; @@ -10461,7 +10454,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmcs12->vm_entry_msr_load_count); if (msr_entry_idx) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); return 1; @@ -11029,7 +11022,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (unlikely(vmx->fail)) vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - vmx_load_vmcs01(vcpu); + 
vmx_switch_vmcs(vcpu, &vmx->vmcs01); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { From 74f169090b6f36b867c9df0454366dd9af6f62d1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 26 Mar 2017 23:51:24 +0200 Subject: [PATCH 079/227] kvm/svm: Setup MCG_CAP on AMD properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MCG_CAP[63:9] bits are reserved on AMD. However, on an AMD guest, this MSR returns 0x100010a. More specifically, bit 24 is set, which is simply wrong. That bit is MCG_SER_P and is present only on Intel. Thus, clean up the reserved bits in order not to confuse guests. Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fba70646c3279..1b203abf76e1e6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5256,6 +5256,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +static void svm_setup_mce(struct kvm_vcpu *vcpu) +{ + /* [63:9] are reserved. */ + vcpu->arch.mcg_cap &= 0x1ff; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5367,6 +5373,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, + .setup_mce = svm_setup_mce, }; static int __init svm_init(void) From 2c82878b0cb38fd516fd612c67852a6bbf282003 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Mar 2017 14:37:28 +0200 Subject: [PATCH 080/227] KVM: VMX: require virtual NMI support Virtual NMIs are only missing in Prescott and Yonah chips. Both are obsolete for virtualization usage---Yonah is 32-bit only even---so drop vNMI emulation. 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 143 ++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 105 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b19055a3106fdd..40b80e8959e1a5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -615,10 +615,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; u32 exit_reason; /* Posted interrupt descriptor */ @@ -1290,11 +1286,6 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -3623,9 +3614,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -5298,8 +5289,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; - vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5419,8 +5408,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5461,19 +5449,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) { - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. 
- */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; } @@ -5490,8 +5465,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5501,20 +5474,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } + vmx->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5522,9 +5488,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6269,7 +6232,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -7820,7 +7782,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8492,26 +8453,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu))))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); @@ -8787,37 +8728,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. 
- */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -8934,10 +8871,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr4; - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) From ad6260da1e23cf937806e42c8490af3ff4530474 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Mar 2017 14:30:40 +0200 Subject: [PATCH 081/227] KVM: x86: drop legacy device assignment Legacy device assignment has been deprecated since 4.2 (released 1.5 years ago). VFIO is better and everyone should have switched to it. If they haven't, this should convince them. 
:) Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 204 ------ arch/x86/kvm/Kconfig | 12 - arch/x86/kvm/Makefile | 2 - arch/x86/kvm/assigned-dev.c | 1058 ----------------------------- arch/x86/kvm/assigned-dev.h | 32 - arch/x86/kvm/iommu.c | 356 ---------- arch/x86/kvm/x86.c | 14 +- include/linux/kvm_host.h | 16 - virt/kvm/kvm_main.c | 17 - 9 files changed, 1 insertion(+), 1710 deletions(-) delete mode 100644 arch/x86/kvm/assigned-dev.c delete mode 100644 arch/x86/kvm/assigned-dev.h delete mode 100644 arch/x86/kvm/iommu.c diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 753e88e5eb2ae7..1a184843bf9c5a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1326,130 +1326,6 @@ The flags bitmap is defined as: /* the host supports the ePAPR idle hcall #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) -4.48 KVM_ASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. -Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -/* The following two depend on KVM_CAP_PCI_2_3 */ -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts -via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other -assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the -guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. - -The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure -isolation of the device. Usages not specifying this flag are deprecated. - -Only PCI header type 0 devices with PCI BAR resources are supported by -device assignment. The user requesting this ioctl must have read/write -access to the PCI sysfs resource files associated with the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - -4.50 KVM_ASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. 
- -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.51 KVM_DEASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - - 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING @@ -1536,52 +1412,6 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; -4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. The number is -reset again by terminating the MSI-X assignment of the device via -KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier -point will fail. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - 4.55 KVM_SET_TSC_KHZ @@ -1733,40 +1563,6 @@ should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. -4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated) - -Capability: KVM_CAP_PCI_2_3 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Allows userspace to mask PCI INTx interrupts from the assigned device. The -kernel will not deliver INTx interrupts to the guest between setting and -clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of -and emulation of PCI 2.3 INTx disable command register behavior. - -This may be used for both PCI 2.3 devices supporting INTx disable natively and -older devices lacking this support. Userspace is responsible for emulating the -read value of the INTx disable bit in the guest visible PCI command register. 
-When modifying the INTx disable state, userspace should precede updating the -physical device command register by calling this ioctl to inform the kernel of -the new intended INTx mask state. - -Note that the kernel uses the device INTx disable bit to internally manage the -device interrupt state for PCI 2.3 devices. Reads of this register may -therefore not match the expected value. Writes should always use the guest -intended INTx disable value rather than attempting to read-copy-update the -current physical device state. Races between user and kernel updates to the -INTx disable bit are handled lazily in the kernel. It's possible the device -may generate unintended interrupts, but they will not be injected into the -guest. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is -evaluated. - - 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a868..760433b2574a50 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff2071047146..09d4b17be02266 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c6913c..00000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. 
- */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. 
- */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if 
(pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - 
__u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. 
PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if 
(!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if 
(match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2ef..00000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - 
-static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4cf..00000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. Kay - * Author: Weidong Han - * Author: Ben-Ami Yassour - */ - -#include -#include -#include -#include -#include -#include -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while 
(__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ccbd45ecd41a3f..1853cda7f6d58b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -2675,10 +2674,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: @@ -2713,11 +2708,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -4230,7 +4220,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -8068,7 +8058,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,7 +8141,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if 
(kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a28..f1339a7756b3da 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -877,22 +877,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88257b311cb579..ff3bf5d26e0b19 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1019,8 +1019,6 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, as_id, slots); - /* slot was deleted or moved, clear iommu mapping */ - kvm_iommu_unmap_pages(kvm, &old); /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -1055,21 +1053,6 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, &old, &new); kvfree(old_memslots); - - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - return r; - } - return 0; out_slots: From 7db742654dd59fde90af79d77372c5439399a9e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Mar 2017 10:49:19 +0100 Subject: [PATCH 082/227] KVM: nVMX: we support 1GB EPT pages Large pages at the PDPE level can be emulated by the MMU, so the bit can be set unconditionally in the EPT capabilities MSR. The same is true of 2MB EPT pages, though all Intel processors with EPT in practice support those. 
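What makes the bits unconditional is their placement relative to the host-capability mask; condensed from the hunk below (illustration only, ept_caps stands in for vmx->nested.nested_vmx_ept_caps):

	/* host-dependent bits: offered to L1 only if the CPU supports them */
	ept_caps  = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
	ept_caps &= vmx_capability.ept;

	/* always offered: KVM handles these regardless of host support
	 * (large pages are emulated by the shadow EPT MMU) */
	ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT |
		    VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT;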
Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 40b80e8959e1a5..0e61b9226bf23a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2759,14 +2759,14 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; } else vmx->nested.nested_vmx_ept_caps = 0; From 004772313040ed5b78560de1fe5d51ea678977c7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2017 11:55:28 +0200 Subject: [PATCH 083/227] KVM: VMX: remove bogus check for invalid EPT violation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle_ept_violation is checking for "guest-linear-address invalid" + "not a paging-structure walk". However, _all_ EPT violations without a valid guest linear address are paging structure walks, because those EPT violations happen when loading the guest PDPTEs. Therefore, the check can never be true, and even if it were, KVM doesn't care about the guest linear address; it only uses the guest *physical* address VMCS field. So, remove the check altogether. Signed-off-by: Paolo Bonzini Reviewed-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0e61b9226bf23a..1c372600a962ce 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6208,23 +6208,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) unsigned long exit_qualification; gpa_t gpa; u32 error_code; - int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity == 0x2) { - printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. From 86407bcb5c8320add17b57dc6af1586ff8e14bc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2017 11:55:29 +0200 Subject: [PATCH 084/227] kvm: x86: MMU support for EPT accessed/dirty bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the MMU paging code for EPT accessed and dirty bits, which can be enabled optionally at runtime. Code that updates the accessed and dirty bits will need a pointer to the struct kvm_mmu. 
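A compressed sketch of the pattern this introduces (hypothetical helper,
not the actual template code): the accessed/dirty decision moves from a
compile-time mask to a query on the MMU instance that is now passed in:

	static void mark_gpte_accessed(struct kvm_mmu *mmu, u64 *gpte)
	{
		if (!PT_HAVE_ACCESSED_DIRTY(mmu))
			return;		/* e.g. EPT without A/D bits */
		*gpte |= PT_GUEST_ACCESSED_MASK;
	}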
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/paging_tmpl.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315ab..3e20f7b33892c2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -43,6 +43,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -64,6 +65,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define CMPXCHG cmpxchg #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 @@ -78,6 +80,7 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_GUEST_DIRTY_MASK 0 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define PT_HAVE_ACCESSED_DIRTY(mmu) false #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else @@ -111,12 +114,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, + unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -171,7 +175,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -217,7 +221,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ -287,6 +291,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; + bool have_ad; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -299,6 +304,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); + have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { @@ -312,7 +318,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_GUEST_ACCESSED_MASK; + accessed_dirty = have_ad ? 
PT_GUEST_ACCESSED_MASK : 0; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -394,7 +400,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(&pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -485,7 +491,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -979,7 +985,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -1025,3 +1031,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT +#undef PT_HAVE_ACCESSED_DIRTY From ae1e2d1082ae6969ff8c626ef80804d950bf256b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2017 11:55:30 +0200 Subject: [PATCH 085/227] kvm: nVMX: support EPT accessed/dirty bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now use bit 6 of EPTP to optionally enable A/D bits for EPTP. Another thing to change is that, when EPT accessed and dirty bits are not in use, VMX treats accesses to guest paging structures as data reads. When they are in use (bit 6 of EPTP is set), they are treated as writes and the corresponding EPT dirty bit is set. The MMU didn't know this detail, so this patch adds it. We also have to fix up the exit qualification. It may be wrong because KVM sets bit 6 but the guest might not. L1 emulates EPT A/D bits using write permissions, so in principle it may be possible for EPT A/D bits to be used by L1 even though not available in hardware. The problem is that guest page-table walks will be treated as reads rather than writes, so they would not cause an EPT violation. 
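For reference, the knob L1 controls is bit 6 of the EPT pointer
(VMX_EPT_AD_ENABLE_BIT in KVM). In rough outline, the decision made when
synthesizing the exit qualification for L1 is:

	u64 eptp = nested_ept_get_cr3(vcpu);	/* vmcs12->ept_pointer */
	bool l1_has_ad = eptp & VMX_EPT_AD_ENABLE_BIT;

	/*
	 * With A/D enabled, guest paging-structure walks count as writes;
	 * without it they are reads, so the ACC_WRITE bit has to be
	 * cleared from the exit qualification reported to L1.
	 */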
Signed-off-by: Paolo Bonzini [Fixed typo in walk_addr_generic() comment and changed bit clear + conditional-set pattern in handle_ept_violation() to conditional-clear] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/mmu.c | 4 +++- arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 33 ++++++++++++++++----------------- arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++---- 6 files changed, 54 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74ef58c8ff5330..7dbb8d62268369 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -343,9 +343,10 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - int root_level; - int shadow_root_level; union kvm_mmu_page_role base_role; + u8 root_level; + u8 shadow_root_level; + u8 ept_ad; bool direct_map; /* diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cc54b70265674e..dffe8d68fb2707 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -516,12 +516,14 @@ struct vmx_msr_entry { #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* * VM-instruction error numbers diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac7810513d0e95..558676538fca3c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4340,7 +4340,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -4349,6 +4350,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; + context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e491..d8ccb32f7308ab 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -74,7 +74,8 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3e20f7b33892c2..314d2071b3376e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,13 +23,6 @@ * so the code in this file is compiled twice, once per pte size. 
*/ -/* - * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro - * uses for EPT without A/D paging type. - */ -extern u64 __pure __using_nonexistent_pte_bit(void) - __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); - #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -39,8 +32,6 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true @@ -61,8 +52,6 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true @@ -76,17 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK 0 - #define PT_GUEST_DIRTY_MASK 0 - #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() - #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() - #define PT_HAVE_ACCESSED_DIRTY(mmu) false + #define PT_GUEST_DIRTY_SHIFT 9 + #define PT_GUEST_ACCESSED_SHIFT 8 + #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif +#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) +#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) + #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) @@ -290,6 +280,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + unsigned nested_access; gpa_t pte_gpa; bool have_ad; int offset; @@ -319,6 +310,14 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; + + /* + * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging + * by the MOV to CR instruction are treated as reads and do not cause the + * processor to set the dirty flag in any EPT paging-structure entry. + */ + nested_access = (have_ad ? 
PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; + pt_access = pte_access = ACC_ALL; ++walker->level; @@ -338,7 +337,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK, + nested_access, &walker->fault); /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1c372600a962ce..5556cce99749c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2767,6 +2767,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } else vmx->nested.nested_vmx_ept_caps = 0; @@ -6211,6 +6213,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + if (is_guest_mode(vcpu) + && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { + /* + * Fix up exit_qualification according to whether guest + * page table accesses are reads or writes. + */ + u64 eptp = nested_ept_get_cr3(vcpu); + if (eptp & VMX_EPT_AD_ENABLE_BIT) + exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; + } + /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -9416,17 +9429,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { + u64 eptp; + WARN_ON(mmu_is_nested(vcpu)); + eptp = nested_ept_get_cr3(vcpu); + if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + return 1; + + kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT); + VMX_EPT_EXECUTE_ONLY_BIT, + eptp & VMX_EPT_AD_ENABLE_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -10188,8 +10210,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if (nested_cpu_has_ept(vmcs12)) { - kvm_mmu_unload(vcpu); - nested_ept_init_mmu_context(vcpu); + if (nested_ept_init_mmu_context(vcpu)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmx_flush_tlb_ept_only(vcpu); From 1f5199927034007e42a1ee508f62b9b85cc4714a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2017 11:55:31 +0200 Subject: [PATCH 086/227] KVM: VMX: add missing exit reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to simplify adding exit reasons in the future, the array of exit reason names is now also sorted by exit reason code. 
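These strings are consumed by the kvm_exit tracepoint, roughly as in
arch/x86/kvm/trace.h:

	__print_symbolic(exit_reason, VMX_EXIT_REASONS)

so an exit reason missing from the table shows up only as a raw number
in the trace output.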
Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/uapi/asm/vmx.h | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 14458658e988bb..690a2dcf407860 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -76,7 +76,11 @@ #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -90,6 +94,7 @@ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVLPG, "INVLPG" }, \ { EXIT_REASON_RDPMC, "RDPMC" }, \ { EXIT_REASON_RDTSC, "RDTSC" }, \ @@ -108,6 +113,8 @@ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ @@ -115,20 +122,24 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ - { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_RDTSCP, "RDTSCP" }, \ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_XSETBV, "XSETBV" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ - { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ - { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ - { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ - { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_RDRAND, "RDRAND" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_VMFUNC, "VMFUNC" }, \ + { EXIT_REASON_ENCLS, "ENCLS" }, \ + { EXIT_REASON_RDSEED, "RDSEED" }, \ + { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } From a5f46457047bceaaf32ed7c1d47e3e15bd197e27 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Mar 2017 11:55:32 +0200 Subject: [PATCH 087/227] KVM: nVMX: support RDRAND and RDSEED exiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Paolo Bonzini Reviewed-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index dffe8d68fb2707..35cd06f636abc9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,8 +70,10 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 
0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5556cce99749c9..cfdb0d9389d1f6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2745,6 +2745,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | @@ -8082,6 +8083,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: From 3042255899540d4985a616017af9e2b80a84a8cf Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 31 Mar 2017 13:53:22 +0200 Subject: [PATCH 088/227] kvm: make KVM_CAP_COALESCED_MMIO architecture agnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove code from architecture files that can be moved to virt/kvm, since there is already common code for coalesced MMIO. Signed-off-by: Paolo Bonzini Reviewed-by: David Hildenbrand [Removed a pointless 'break' after 'return'.] 
Signed-off-by: Radim Krčmář --- arch/arm/kvm/arm.c | 3 --- arch/mips/kvm/mips.c | 3 --- arch/powerpc/kvm/powerpc.c | 5 ----- arch/x86/kvm/x86.c | 3 --- virt/kvm/kvm_main.c | 4 ++++ 5 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96dba7cd8be7b4..e3c8105ada65a8 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -209,9 +209,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_ARM_SET_DEVICE_ADDR: r = 1; break; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1fc6fef463db15..d4b2ad18eef202 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1070,9 +1070,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_NR_VCPUS: r = num_online_cpus(); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 95c91a9de351c4..0e42aa8a279f32 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -524,11 +524,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) /* We support this only for PR */ r = !hv_enabled; break; -#ifdef CONFIG_KVM_MMIO - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; -#endif #ifdef CONFIG_KVM_MPIC case KVM_CAP_IRQ_MPIC: r = 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1853cda7f6d58b..bb3a1531b2490b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2690,9 +2690,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff3bf5d26e0b19..b5dcde10c53b9c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2918,6 +2918,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + case KVM_CAP_COALESCED_MMIO: + return KVM_COALESCED_MMIO_PAGE_OFFSET; +#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; From 4b4357e02523ec63ad853f927f5d93a25101a1d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 31 Mar 2017 13:53:23 +0200 Subject: [PATCH 089/227] kvm: make KVM_COALESCED_MMIO_PAGE_OFFSET public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its value has never changed; we might as well make it part of the ABI instead of using the return value of KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO). Because PPC does not always make MMIO available, the code has to be made dependent on CONFIG_KVM_MMIO rather than KVM_COALESCED_MMIO_PAGE_OFFSET. 
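For orientation, a sketch of the userspace side this constant serves
(variable names are made up, error handling omitted; vcpu_mmap_base
stands for the region mmap()ed from the vcpu fd with the size returned
by KVM_GET_VCPU_MMAP_SIZE): the coalesced MMIO ring is a page inside
that mapping, at this fixed page offset, which KVM_CHECK_EXTENSION also
reports:

	struct kvm_coalesced_mmio_ring *ring = NULL;
	int off;

	off = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
	if (off > 0)	/* off == KVM_COALESCED_MMIO_PAGE_OFFSET */
		ring = (void *)((char *)vcpu_mmap_base + off * page_size);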
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm/include/uapi/asm/kvm.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/include/uapi/asm/kvm.h | 2 ++ arch/mips/include/asm/kvm_host.h | 1 - arch/mips/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 3 --- arch/powerpc/include/uapi/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/include/uapi/asm/kvm.h | 3 +++ include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 10 +++++----- 12 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 31ee468ce667de..de67ce64750173 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 32 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd12d..254a38cace2a2d 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7705e7bb07b13..522e4f60976ee1 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -31,7 +31,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 512 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e0c..aa5ab69c1312dc 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5c518c148f9d5a..2998479fd4e83f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #ifdef CONFIG_KVM_MIPS_VZ diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 3107095d7f0a46..0318c6b442ab4d 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -21,6 +21,8 @@ #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f41562705..01d05c76f1c7dd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -45,9 +45,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -#endif #define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 4edbe4bb0e8b0c..07fbeb927834f3 100644 --- 
a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -29,6 +29,9 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_GUEST_DEBUG +/* Not always available, but if it is, this is the correct offset. */ +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7dbb8d62268369..d962fa998a6fc5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c5940226d..c2824d02ba3762 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include #include +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f1339a7756b3da..7e74ae4d99bbef 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -403,7 +403,7 @@ struct kvm { struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b5dcde10c53b9c..f489167839c48b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2349,7 +2349,7 @@ static int kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif @@ -2918,7 +2918,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; #endif @@ -2971,7 +2971,7 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; @@ -3163,7 +3163,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); if (r < 0) { kvm_put_kvm(kvm); @@ -3216,7 +3216,7 @@ static long kvm_dev_ioctl(struct file *filp, #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; From 28bf28887976d8881a3a59491896c718fade7355 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 23 Mar 2017 11:46:03 +0100 Subject: [PATCH 090/227] KVM: x86: fix user triggerable warning in kvm_apic_accept_events() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we already entered/are about to enter SMM, 
don't allow switching to INIT/SIPI_RECEIVED, otherwise the next call to kvm_apic_accept_events() will report a warning. Same applies if we are already in MP state INIT_RECEIVED and SMM is requested to be turned on. Refuse to set the VCPU events in this case. Fixes: cd7764fe9f73 ("KVM: x86: latch INITs while in system management mode") Cc: stable@vger.kernel.org # 4.2+ Reported-by: Dmitry Vyukov Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb3a1531b2490b..722fe854985ec8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3114,6 +3114,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; @@ -7342,6 +7348,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); From 28d06353881939703c34d82a1465136af176c620 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 5 Apr 2017 09:14:40 -0700 Subject: [PATCH 091/227] kvm: nVMX: Disallow userspace-injected exceptions in guest mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The userspace exception injection API and code path are entirely unprepared for exceptions that might cause a VM-exit from L2 to L1, so the best course of action may be to simply disallow this for now. 1. The API provides no mechanism for userspace to specify the new DR6 bits for a #DB exception or the new CR2 value for a #PF exception. Presumably, userspace is expected to modify these registers directly with KVM_SET_SREGS before the next KVM_RUN ioctl. However, in the event that L1 intercepts the exception, these registers should not be changed. Instead, the new values should be provided in the exit_qualification field of vmcs12 (Intel SDM vol 3, section 27.1). 2. In the case of a userspace-injected #DB, inject_pending_event() clears DR7.GD before calling vmx_queue_exception(). However, in the event that L1 intercepts the exception, this is too early, because DR7.GD should not be modified by a #DB that causes a VM-exit directly (Intel SDM vol 3, section 27.1). 3. If the injected exception is a #PF, nested_vmx_check_exception() doesn't properly check whether or not L1 is interested in the associated error code (using the #PF error code mask and match fields from vmcs12). It may either return 0 when it should call nested_vmx_vmexit() or vice versa. 4. nested_vmx_check_exception() assumes that it is dealing with a hardware-generated exception intercept from L2, with some of the relevant details (the VM-exit interruption-information and the exit qualification) live in vmcs02. 
For userspace-injected exceptions, this is not the case. 5. prepare_vmcs12() assumes that when its exit_intr_info argument specifies valid information with a valid error code that it can VMREAD the VM-exit interruption error code from vmcs02. For userspace-injected exceptions, this is not the case. Signed-off-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 722fe854985ec8..6bc47e2712c87d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3111,7 +3111,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || + is_guest_mode(vcpu))) return -EINVAL; /* INITs are latched while in SMM */ From 056aad67f836e20568be9b7ca07703a8dea26e0c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 20 Mar 2017 18:26:42 +0000 Subject: [PATCH 092/227] kvm: arm/arm64: Rework gpa callback handlers In order to perform an operation on a gpa range, we currently iterate over each page in a user memory slot for the given range. This is inefficient while dealing with a big range (e.g, a VMA), especially while unmaping a range. At present, with stage2 unmap on a range with a hugepage backed region, we clear the PMD when we unmap the first page in the loop. The remaining iterations simply traverse the page table down to the PMD level only to see that nothing is in there. This patch reworks the code to invoke the callback handlers on the biggest range possible within the memory slot to to reduce the number of times the handler is called. Cc: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 962616fd4ddd63..69554bd32c2934 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1512,7 +1512,8 @@ static int handle_hva_to_gpa(struct kvm *kvm, unsigned long start, unsigned long end, int (*handler)(struct kvm *kvm, - gpa_t gpa, void *data), + gpa_t gpa, u64 size, + void *data), void *data) { struct kvm_memslots *slots; @@ -1524,7 +1525,7 @@ static int handle_hva_to_gpa(struct kvm *kvm, /* we only care about the pages that the guest sees */ kvm_for_each_memslot(memslot, slots) { unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + gfn_t gpa; hva_start = max(start, memslot->userspace_addr); hva_end = min(end, memslot->userspace_addr + @@ -1532,25 +1533,16 @@ static int handle_hva_to_gpa(struct kvm *kvm, if (hva_start >= hva_end) continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
- */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (; gfn < gfn_end; ++gfn) { - gpa_t gpa = gfn << PAGE_SHIFT; - ret |= handler(kvm, gpa, data); - } + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); } return ret; } -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - unmap_stage2_range(kvm, gpa, PAGE_SIZE); + unmap_stage2_range(kvm, gpa, size); return 0; } @@ -1577,10 +1569,11 @@ int kvm_unmap_hva_range(struct kvm *kvm, return 0; } -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pte_t *pte = (pte_t *)data; + WARN_ON(size != PAGE_SIZE); /* * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE * flag clear because MMU notifiers will have unmapped a huge PMD before @@ -1606,11 +1599,12 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pmd_t *pmd; pte_t *pte; + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); pmd = stage2_get_pmd(kvm, NULL, gpa); if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; @@ -1625,11 +1619,12 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) return stage2_ptep_test_and_clear_young(pte); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pmd_t *pmd; pte_t *pte; + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); pmd = stage2_get_pmd(kvm, NULL, gpa); if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; From 328e566479449194979d64685ae6d74c989599bb Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 24 Mar 2016 11:21:04 +0100 Subject: [PATCH 093/227] KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put We don't have to save/restore the VMCR on every entry to/from the guest, since on GICv2 we can access the control interface from EL1 and on VHE systems with GICv3 we can access the control interface from KVM running in EL2. GICv3 systems without VHE becomes the rare case, which has to save/restore the register on each round trip. Note that userspace accesses may see out-of-date values if the VCPU is running while accessing the VGIC state via the KVM device API, but this is already the case and it is up to userspace to quiesce the CPUs before reading the CPU registers from the GIC for an up-to-date view. 
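In rough outline, the resulting pairing is (GICv2 shown; GICv3 on
non-VHE systems does the same through kvm_call_hyp):

	kvm_arch_vcpu_load()
	  -> kvm_vgic_load()
	       -> vgic_v2_load()	/* write shadow vgic_vmcr to GICH_VMCR */

	kvm_arch_vcpu_put()
	  -> kvm_vgic_put()
	       -> vgic_v2_put()		/* read GICH_VMCR back into the shadow */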
Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_asm.h | 3 +++ arch/arm/kvm/arm.c | 11 ++++++----- arch/arm64/include/asm/kvm_asm.h | 2 ++ include/kvm/arm_vgic.h | 3 +++ virt/kvm/arm/hyp/vgic-v2-sr.c | 3 --- virt/kvm/arm/hyp/vgic-v3-sr.c | 14 ++++++++++---- virt/kvm/arm/vgic/vgic-init.c | 12 ++++++++++++ virt/kvm/arm/vgic/vgic-v2.c | 24 ++++++++++++++++++++++-- virt/kvm/arm/vgic/vgic-v3.c | 22 ++++++++++++++++++++-- virt/kvm/arm/vgic/vgic.c | 22 ++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 6 ++++++ 11 files changed, 106 insertions(+), 16 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 8ef05381984b1b..dd16044b34b613 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -75,7 +75,10 @@ extern void __init_stage2_translation(void); extern void __kvm_hyp_reset(unsigned long); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96dba7cd8be7b4..46fd3757869363 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -351,15 +351,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); kvm_arm_set_running_vcpu(vcpu); + + kvm_vgic_load(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - /* - * The arch-generic KVM code expects the cpu field of a vcpu to be -1 - * if the vcpu is no longer assigned to a cpu. This is used for the - * optimized make_all_cpus_request path. - */ + kvm_vgic_put(vcpu); + vcpu->cpu = -1; kvm_arm_set_running_vcpu(NULL); @@ -633,7 +632,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * non-preemptible context. 
*/ preempt_disable(); + kvm_pmu_flush_hwstate(vcpu); + kvm_timer_flush_hwstate(vcpu); kvm_vgic_flush_hwstate(vcpu); diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ec3553eb934909..49f99cd02613cf 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,6 +59,8 @@ extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b72dd2ad5f440c..f7a2e31eb4c1bb 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -306,6 +306,9 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); +void kvm_vgic_load(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index c8aeb7b91ec89b..d3d3b9b0c2c3e9 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -114,8 +114,6 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) if (!base) return; - cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR); - if (vcpu->arch.vgic_cpu.live_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); @@ -165,7 +163,6 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) } } - writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR); vcpu->arch.vgic_cpu.live_lrs = live_lrs; } diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 3947095cc0a1ce..e51ee7edf9533a 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -159,8 +159,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) if (!cpu_if->vgic_sre) dsb(st); - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - if (vcpu->arch.vgic_cpu.live_lrs) { int i; u32 max_lr_idx, nr_pri_bits; @@ -261,8 +259,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) live_lrs |= (1 << i); } - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - if (live_lrs) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); @@ -326,3 +322,13 @@ u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) { return read_gicreg(ICH_VTR_EL2); } + +u64 __hyp_text __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 276139a24e6fd0..e8e973b72ca539 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -262,6 +262,18 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); dist->initialized = true; + + /* + * If we're initializing GICv2 on-demand when first running the VCPU + * then we need to load the VGIC state onto the CPU. We can detect + * this easily by checking if we are in between vcpu_load and vcpu_put + * when we just initialized the VGIC. 
+ */ + preempt_disable(); + vcpu = kvm_arm_get_running_vcpu(); + if (vcpu) + kvm_vgic_load(vcpu); + preempt_enable(); out: return ret; } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b834ecdf322503..2f241e026c8f38 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -184,6 +184,7 @@ void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; u32 vmcr; vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; @@ -194,12 +195,15 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; @@ -375,3 +379,19 @@ int vgic_v2_probe(const struct gic_kvm_info *info) return ret; } + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR); +} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index be0f4c3e0142e0..99213d744e4fe6 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -173,6 +173,7 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 vmcr; /* @@ -188,12 +189,15 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; /* * Ignore the FIQen bit, because GIC emulation always implies @@ -386,3 +390,17 @@ int vgic_v3_probe(const struct gic_kvm_info *info) return 0; } + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 654dfd40e449cb..2ac0def57424f8 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -656,6 +656,28 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); } +void kvm_vgic_load(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if 
(kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_load(vcpu); + else + vgic_v3_load(vcpu); +} + +void kvm_vgic_put(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_put(vcpu); + else + vgic_v3_put(vcpu); +} + int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index db28f7cadab28b..9afb4557c7e88c 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -130,6 +130,9 @@ int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); +void vgic_v2_load(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu); + static inline void vgic_get_irq_kref(struct vgic_irq *irq) { if (irq->intid < VGIC_MIN_LPI) @@ -150,6 +153,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +void vgic_v3_load(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu); + int vgic_register_its_iodevs(struct kvm *kvm); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); From f6769581e90ba2535b3e587fe15b74f6cbc4aaab Mon Sep 17 00:00:00 2001 From: Shih-Wei Li Date: Wed, 19 Oct 2016 18:12:34 +0000 Subject: [PATCH 094/227] KVM: arm/arm64: vgic: Avoid flushing vgic state when there's no pending IRQ We do not need to flush vgic states in each world switch unless there is pending IRQ queued to the vgic's ap list. We can thus reduce the overhead by not grabbing the spinlock and not making the extra function call to vgic_flush_lr_state. Note: list_empty is a single atomic read (uses READ_ONCE) and can therefore check if a list is empty or not without the need to take the spinlock protecting the list. Reviewed-by: Marc Zyngier Signed-off-by: Shih-Wei Li Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 2ac0def57424f8..104329139f24b3 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -637,12 +637,17 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + if (unlikely(!vgic_initialized(vcpu->kvm))) return; vgic_process_maintenance_interrupt(vcpu); vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); + + /* Make sure we can fast-path in flush_hwstate */ + vgic_cpu->used_lrs = 0; } /* Flush our emulation state into the GIC hardware before entering the guest. */ @@ -651,6 +656,18 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (unlikely(!vgic_initialized(vcpu->kvm))) return; + /* + * If there are no virtual interrupts active or pending for this + * VCPU, then there is no work to do and we can bail out without + * taking any lock. There is a potential race with someone injecting + * interrupts to the VCPU, but it is a benign race as the VCPU will + * either observe the new interrupt before or after doing this check, + * and introducing additional synchronization mechanism doesn't change + * this. 
+ */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + return; + spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); vgic_flush_lr_state(vcpu); spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); From 00dafa0fcfe9fb1d863f08dc45d6f05ac9505d46 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 23 Dec 2016 00:04:59 +0100 Subject: [PATCH 095/227] KVM: arm/arm64: vgic: Get rid of live_lrs There is no need to calculate and maintain live_lrs when we always populate the lowest numbered LRs first on every entry and clear all LRs on every exit. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 2 -- virt/kvm/arm/hyp/vgic-v2-sr.c | 39 +++++++++----------------------- virt/kvm/arm/hyp/vgic-v3-sr.c | 42 ++++++++++------------------------- 3 files changed, 22 insertions(+), 61 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f7a2e31eb4c1bb..ea940dbb5dba7e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -264,8 +264,6 @@ struct vgic_cpu { */ struct list_head ap_list_head; - u64 live_lrs; - /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index d3d3b9b0c2c3e9..34b37ce0d4be4f 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -26,27 +26,23 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u32 eisr0, eisr1; int i; bool expect_mi; expect_mi = !!(cpu_if->vgic_hcr & GICH_HCR_UIE); - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs && !expect_mi; i++) expect_mi |= (!(cpu_if->vgic_lr[i] & GICH_LR_HW) && (cpu_if->vgic_lr[i] & GICH_LR_EOI)); - } if (expect_mi) { cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR); if (cpu_if->vgic_misr & GICH_MISR_EOI) { eisr0 = readl_relaxed(base + GICH_EISR0); - if (unlikely(nr_lr > 32)) + if (unlikely(used_lrs > 32)) eisr1 = readl_relaxed(base + GICH_EISR1); else eisr1 = 0; @@ -87,13 +83,10 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1UL << i)) cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; else @@ -110,11 +103,12 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); save_maint_int_state(vcpu, base); @@ -122,8 +116,6 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) save_lrs(vcpu, base); writel_relaxed(0, base + GICH_HCR); - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = ~0UL; @@ -139,31 +131,20 @@ void __hyp_text __vgic_v2_restore_state(struct 
kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; - u64 live_lrs = 0; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - - for (i = 0; i < nr_lr; i++) - if (cpu_if->vgic_lr[i] & GICH_LR_STATE) - live_lrs |= 1UL << i; - - if (live_lrs) { + if (used_lrs) { writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); writel_relaxed(cpu_if->vgic_apr, base + GICH_APR); - for (i = 0; i < nr_lr; i++) { - if (!(live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } } - - vcpu->arch.vgic_cpu.live_lrs = live_lrs; } #ifdef CONFIG_ARM64 diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index e51ee7edf9533a..b3c36b64df3403 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -118,18 +118,16 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr) } } -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) +static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; int i; bool expect_mi; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && (cpu_if->vgic_lr[i] & ICH_LR_EOI)); } @@ -150,6 +148,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; /* @@ -159,23 +158,19 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) if (!cpu_if->vgic_sre) dsb(st); - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { int i; - u32 max_lr_idx, nr_pri_bits; + u32 nr_pri_bits; cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); write_gicreg(0, ICH_HCR_EL2); val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; + save_maint_int_state(vcpu); + for (i = 0; i <= used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else @@ -203,8 +198,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) default: cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); } - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { cpu_if->vgic_misr = 0; cpu_if->vgic_eisr = 0; @@ -232,9 +225,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; + u32 nr_pri_bits; int i; /* @@ -251,15 +244,9 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) } val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - if (live_lrs) { + if (used_lrs) { 
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); switch (nr_pri_bits) { @@ -282,12 +269,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); } - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - + for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } } /* @@ -299,7 +282,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) isb(); dsb(sy); } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; /* * Prevent the guest from touching the GIC system registers if From 90cac1f52ad1db73b6ed99143ce7ad473bd90a95 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 21 Mar 2017 21:16:12 +0100 Subject: [PATCH 096/227] KVM: arm/arm64: vgic: Only set underflow when actually out of LRs We currently assume that all the interrupts in our AP list will be queued to LRs, but that's not necessarily the case, because some of them could have been migrated away to different VCPUs and only the VCPU thread itself can remove interrupts from its AP list. Therefore, slightly change the logic to only setting the underflow interrupt when we actually run out of LRs. As it turns out, this allows us to further simplify the handling in vgic_sync_hwstate in later patches. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 104329139f24b3..442f7df2a46a17 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -601,10 +601,8 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) { - vgic_set_underflow(vcpu); + if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) vgic_sort_ap_list(vcpu); - } list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); @@ -623,8 +621,12 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) next: spin_unlock(&irq->irq_lock); - if (count == kvm_vgic_global_state.nr_lr) + if (count == kvm_vgic_global_state.nr_lr) { + if (!list_is_last(&irq->ap_list, + &vgic_cpu->ap_list_head)) + vgic_set_underflow(vcpu); break; + } } vcpu->arch.vgic_cpu.used_lrs = count; From af0614991ab64a55f86cda257cedff1be4e435fa Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 29 Dec 2016 15:44:27 +0100 Subject: [PATCH 097/227] KVM: arm/arm64: vgic: Get rid of unnecessary process_maintenance operation Since we always read back the LRs that we wrote to the guest and the MISR and EISR registers simply provide a summary of the configuration of the bits in the LRs, there is really no need to read back those status registers and process them. We might as well just signal the notifyfd when folding the LR state and save some cycles in the process. We now clear the underflow bit in the fold_lr_state functions as we only need to clear this bit if we had used all the LRs, so this is as good a place as any to do that work. 
Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-v2.c | 59 ++++++++----------------------------- virt/kvm/arm/vgic/vgic-v3.c | 51 ++++++++------------------------ virt/kvm/arm/vgic/vgic.c | 9 ------ virt/kvm/arm/vgic/vgic.h | 2 -- 4 files changed, 25 insertions(+), 96 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 2f241e026c8f38..b58b086d8d0768 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -22,59 +22,17 @@ #include "vgic.h" -/* - * Call this function to convert a u64 value to an unsigned long * bitmask - * in a way that works on both 32-bit and 64-bit LE and BE platforms. - * - * Warning: Calling this function may modify *val. - */ -static unsigned long *u64_to_bitmask(u64 *val) -{ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 - *val = (*val >> 32) | (*val << 32); -#endif - return (unsigned long *)val; -} - -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - if (cpuif->vgic_misr & GICH_MISR_EOI) { - u64 eisr = cpuif->vgic_eisr; - unsigned long *eisr_bmap = u64_to_bitmask(&eisr); - int lr; - - for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE); - - /* Only SPIs require notification */ - if (vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - } - - /* check and disable underflow maintenance IRQ */ - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - /* - * In the next iterations of the vcpu loop, if we sync the - * vgic state after flushing it, but before entering the guest - * (this happens for pending signals and vmid rollovers), then - * make sure we don't pick up any old maintenance interrupts - * here. 
- */ - cpuif->vgic_eisr = 0; + cpuif->vgic_hcr |= GICH_HCR_UIE; } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +static bool lr_signals_eoi_mi(u32 lr_val) { - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - cpuif->vgic_hcr |= GICH_HCR_UIE; + return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && + !(lr_val & GICH_LR_HW); } /* @@ -89,11 +47,18 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; int lr; + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; u32 intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); spin_lock(&irq->irq_lock); diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 99213d744e4fe6..4f2dce68660011 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -21,50 +21,17 @@ #include "vgic.h" -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - - if (cpuif->vgic_misr & ICH_MISR_EOI) { - unsigned long eisr_bmap = cpuif->vgic_eisr; - int lr; - - for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid; - u64 val = cpuif->vgic_lr[lr]; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) - intid = val & ICH_LR_VIRTUAL_ID_MASK; - else - intid = val & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE); - - /* Only SPIs require notification */ - if (vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - - /* - * In the next iterations of the vcpu loop, if we sync - * the vgic state after flushing it, but before - * entering the guest (this happens for pending - * signals and vmid rollovers), then make sure we - * don't pick up any old maintenance interrupts here. - */ - cpuif->vgic_eisr = 0; - } - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr |= ICH_HCR_UIE; } -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +static bool lr_signals_eoi_mi(u64 lr_val) { - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - - cpuif->vgic_hcr |= ICH_HCR_UIE; + return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && + !(lr_val & ICH_LR_HW); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) @@ -73,6 +40,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; + cpuif->vgic_hcr &= ~ICH_HCR_UIE; + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid; @@ -82,6 +51,12 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) intid = val & ICH_LR_VIRTUAL_ID_MASK; else intid = val & GICH_LR_VIRTUALID; + + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); if (!irq) /* An LPI could have been unmapped. 
*/ continue; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 442f7df2a46a17..b64b143e59f99c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -527,14 +527,6 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) spin_unlock(&vgic_cpu->ap_list_lock); } -static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_process_maintenance(vcpu); - else - vgic_v3_process_maintenance(vcpu); -} - static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) @@ -644,7 +636,6 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (unlikely(!vgic_initialized(vcpu->kvm))) return; - vgic_process_maintenance_interrupt(vcpu); vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 9afb4557c7e88c..44445dac083547 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -112,7 +112,6 @@ void vgic_kick_vcpus(struct kvm *kvm); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); @@ -141,7 +140,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq) kref_get(&irq->refcount); } -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); From b6095b084d875ef40fd294a3ce53cffc028f6884 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 29 Dec 2016 15:48:57 +0100 Subject: [PATCH 098/227] KVM: arm/arm64: vgic: Get rid of unnecessary save_maint_int_state Now when we don't look at the MISR and EISR values anymore, we can get rid of the logic to save them in the GIC save/restore code. 
Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/hyp/vgic-v2-sr.c | 40 ----------------------------------- virt/kvm/arm/hyp/vgic-v3-sr.c | 29 ------------------------- 2 files changed, 69 deletions(-) diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 34b37ce0d4be4f..a4c3bb00572587 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -22,45 +22,6 @@ #include #include -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, - void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - u32 eisr0, eisr1; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & GICH_HCR_UIE); - - for (i = 0; i < used_lrs && !expect_mi; i++) - expect_mi |= (!(cpu_if->vgic_lr[i] & GICH_LR_HW) && - (cpu_if->vgic_lr[i] & GICH_LR_EOI)); - - if (expect_mi) { - cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR); - - if (cpu_if->vgic_misr & GICH_MISR_EOI) { - eisr0 = readl_relaxed(base + GICH_EISR0); - if (unlikely(used_lrs > 32)) - eisr1 = readl_relaxed(base + GICH_EISR1); - else - eisr1 = 0; - } else { - eisr0 = eisr1 = 0; - } - } else { - cpu_if->vgic_misr = 0; - eisr0 = eisr1 = 0; - } - -#ifdef CONFIG_CPU_BIG_ENDIAN - cpu_if->vgic_eisr = ((u64)eisr0 << 32) | eisr1; -#else - cpu_if->vgic_eisr = ((u64)eisr1 << 32) | eisr0; -#endif -} - static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; @@ -111,7 +72,6 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) if (used_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); - save_maint_int_state(vcpu, base); save_elrsr(vcpu, base); save_lrs(vcpu, base); diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index b3c36b64df3403..41bbbb054a6fa3 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -118,33 +118,6 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr) } } -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - int i; - bool expect_mi; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - - expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - - for (i = 0; i < used_lrs; i++) { - expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && - (cpu_if->vgic_lr[i] & ICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); - - if (cpu_if->vgic_misr & ICH_MISR_EOI) - cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); - else - cpu_if->vgic_eisr = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - } -} - void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; @@ -168,8 +141,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) val = read_gicreg(ICH_VTR_EL2); nr_pri_bits = vtr_to_nr_pri_bits(val); - save_maint_int_state(vcpu); - for (i = 0; i <= used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; From 096f31c4360f6bab130e3f68513719ec6890128c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 29 Dec 2016 15:57:31 +0100 Subject: [PATCH 099/227] KVM: arm/arm64: vgic: Get rid of MISR and EISR fields We don't use these fields anymore so let's nuke them completely. 
Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 4 ---- virt/kvm/arm/hyp/vgic-v2-sr.c | 2 -- virt/kvm/arm/hyp/vgic-v3-sr.c | 2 -- 3 files changed, 8 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ea940dbb5dba7e..26ed4fb896bb48 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -225,8 +225,6 @@ struct vgic_dist { struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ u64 vgic_elrsr; /* Saved only */ u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; @@ -236,8 +234,6 @@ struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ u32 vgic_elrsr; /* Saved only */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index a4c3bb00572587..a3f18d3623661a 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -77,9 +77,7 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) writel_relaxed(0, base + GICH_HCR); } else { - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = ~0UL; - cpu_if->vgic_misr = 0; cpu_if->vgic_apr = 0; } } diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 41bbbb054a6fa3..3d0b1ddb692982 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -170,8 +170,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); } } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = 0xffff; cpu_if->vgic_ap0r[0] = 0; cpu_if->vgic_ap0r[1] = 0; From 966e0149196fe02c6d239f00162e9f92a5bbf3d5 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 18 Mar 2017 13:40:37 +0100 Subject: [PATCH 100/227] KVM: arm/arm64: vgic: Implement early VGIC init functionality Implement early initialization for both the distributor and the CPU interfaces. The basic idea is that even though the VGIC is not functional or not requested from user space, the critical path of the run loop can still call VGIC functions that just won't do anything, without them having to check additional initialization flags to ensure they don't look at uninitialized data structures. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-init.c | 96 ++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index e8e973b72ca539..87de048fe14726 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -24,7 +24,12 @@ /* * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. + * initialization, both for the distributor and the CPU interfaces. The basic + * idea is that even though the VGIC is not functional or not requested from + * user space, the critical path of the run loop can still call VGIC functions + * that just won't do anything, without them having to check additional + * initialization flags to ensure they don't look at uninitialized data + * structures. * * Distributor: * @@ -39,23 +44,67 @@ * * CPU Interface: * - * - kvm_vgic_cpu_early_init(): initialization of static data that + * - kvm_vgic_vcpu_early_init(): initialization of static data that * doesn't depend on any sizing information or emulation type. 
No * allocation is allowed there. */ /* EARLY INIT */ -/* - * Those 2 functions should not be needed anymore but they - * still are called from arm.c +/** + * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures + * @kvm: The VM whose VGIC districutor should be initialized + * + * Only do initialization of static structures that don't require any + * allocation or sizing information from userspace. vgic_init() called + * kvm_vgic_dist_init() which takes care of the rest. */ void kvm_vgic_early_init(struct kvm *kvm) { + struct vgic_dist *dist = &kvm->arch.vgic; + + INIT_LIST_HEAD(&dist->lpi_list_head); + spin_lock_init(&dist->lpi_list_lock); } +/** + * kvm_vgic_vcpu_early_init() - Initialize static VGIC VCPU data structures + * @vcpu: The VCPU whose VGIC data structures whould be initialized + * + * Only do initialization, but do not actually enable the VGIC CPU interface + * yet. + */ void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int i; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + spin_lock_init(&vgic_cpu->ap_list_lock); + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + irq->targets = 1U << vcpu->vcpu_id; + kref_init(&irq->refcount); + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + } } /* CREATION */ @@ -148,9 +197,6 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; - INIT_LIST_HEAD(&dist->lpi_list_head); - spin_lock_init(&dist->lpi_list_lock); - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); if (!dist->spis) return -ENOMEM; @@ -181,41 +227,11 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) } /** - * kvm_vgic_vcpu_init: initialize the vcpu data structures and - * enable the VCPU interface - * @vcpu: the VCPU which's VGIC should be initialized + * kvm_vgic_vcpu_init() - Enable the VCPU interface + * @vcpu: the VCPU which's VGIC should be enabled */ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int i; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - spin_lock_init(&vgic_cpu->ap_list_lock); - - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. 
- */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - irq->targets = 1U << vcpu->vcpu_id; - kref_init(&irq->refcount); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - } if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_enable(vcpu); else From 0b09b6e51931ef5b4e0d7adee0a666c7f4b1867b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 18 Mar 2017 13:41:54 +0100 Subject: [PATCH 101/227] KVM: arm/arm64: vgic: Don't check vgic_initialized in sync/flush Now when we do an early init of the static parts of the VGIC data structures, we can do things like checking if the AP lists are empty directly without having to explicitly check if the vgic is initialized and reduce a bit of work in our critical path. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index b64b143e59f99c..04a405ad562280 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -633,9 +633,6 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); @@ -646,9 +643,6 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) /* Flush our emulation state into the GIC hardware before entering the guest. */ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) - return; - /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without From 8ac76ef4b5139a1d10e459ae43b6c14f49391977 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 18 Mar 2017 13:48:42 +0100 Subject: [PATCH 102/227] KVM: arm/arm64: vgic: Improve sync_hwstate performance There is no need to call any functions to fold LRs when we don't use any LRs and we don't need to mess with overflow flags, take spinlocks, or prune the AP list if the AP list is empty. Note: list_empty is a single atomic read (uses READ_ONCE) and can therefore check if a list is empty or not without the need to take the spinlock protecting the list. 
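For reference, the fast-path check reduces to a single READ_ONCE of the list head; a sketch of the generic helper (as in include/linux/list.h of this era, trimmed to the relevant part):

        static inline int list_empty(const struct list_head *head)
        {
                return READ_ONCE(head->next) == head;
        }

So an empty/non-empty check is safe without holding ap_list_lock; at worst a concurrently injected interrupt is observed on the next exit, as noted above.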
Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-v2.c | 7 +++++-- virt/kvm/arm/vgic/vgic-v3.c | 7 +++++-- virt/kvm/arm/vgic/vgic.c | 10 ++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b58b086d8d0768..025b57d5787eda 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -44,12 +44,13 @@ static bool lr_signals_eoi_mi(u32 lr_val) */ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) { - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; int lr; cpuif->vgic_hcr &= ~GICH_HCR_UIE; - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; u32 intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; @@ -91,6 +92,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } + + vgic_cpu->used_lrs = 0; } /* diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 4f2dce68660011..bc7010db9f4d36 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -36,13 +36,14 @@ static bool lr_signals_eoi_mi(u64 lr_val) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; cpuif->vgic_hcr &= ~ICH_HCR_UIE; - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid; struct vgic_irq *irq; @@ -92,6 +93,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } + + vgic_cpu->used_lrs = 0; } /* Requires the irq to be locked already */ diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 04a405ad562280..3d0979c307212c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -633,11 +633,13 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - vgic_fold_lr_state(vcpu); - vgic_prune_ap_list(vcpu); + /* An empty ap_list_head implies used_lrs == 0 */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + return; - /* Make sure we can fast-path in flush_hwstate */ - vgic_cpu->used_lrs = 0; + if (vgic_cpu->used_lrs) + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); } /* Flush our emulation state into the GIC hardware before entering the guest. */ From 6c0070366deadd16503853708a0e32a612b7f628 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:37 +0100 Subject: [PATCH 103/227] arm64: KVM: PMU: Refactor pmu_*_el0_disabled There is a lot of duplication in the pmu_*_el0_disabled helpers, and as we're going to modify them shortly, let's move all the common stuff in a single function. No functional change. 
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0e26f8c2b56f82..036efc971e8948 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -460,35 +460,32 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) vcpu_sys_reg(vcpu, PMCR_EL0) = val; } -static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) +static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) { u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); + bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); - return !((reg & ARMV8_PMU_USERENR_EN) || vcpu_mode_priv(vcpu)); + return !enabled; } -static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) +static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); +} - return !((reg & (ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); +static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN); } static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); - - return !((reg & (ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN); } static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); - - return !((reg & (ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN); } static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, From 24d5950f6befae3f4b7eaab890ad9bc0988824b6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:38 +0100 Subject: [PATCH 104/227] arm64: KVM: PMU: Inject UNDEF exception on illegal register access Both pmu_*_el0_disabled() and pmu_counter_idx_valid() perform checks on the validity of an access, but only return a boolean indicating if the access is valid or not. Let's allow these functions to also inject an UNDEF exception if the access was illegal. 
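For context, the accessors that call these helpers are untouched by this patch; once a helper has flagged (and now also reported) the illegal access, the caller simply bails out. The snippet below is an illustrative sketch of that pattern, not part of this diff:

        static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                                const struct sys_reg_desc *r)
        {
                if (pmu_access_el0_disabled(vcpu))
                        return false;   /* helper has already injected the UNDEF */

                /* ... emulate the PMCR_EL0 access ... */
                return true;
        }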
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 036efc971e8948..750c129fa3b863 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -465,6 +465,9 @@ static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); + if (!enabled) + kvm_inject_undefined(vcpu); + return !enabled; } @@ -564,8 +567,10 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) pmcr = vcpu_sys_reg(vcpu, PMCR_EL0); val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; - if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) + if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { + kvm_inject_undefined(vcpu); return false; + } return true; } From 9008c235cb8162a4110c9153cace8530bc716210 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:39 +0100 Subject: [PATCH 105/227] arm64: KVM: PMU: Inject UNDEF on non-privileged accesses access_pminten() and access_pmuserenr() can only be accessed when the CPU is in a privileged mode. If it is not, let's inject an UNDEF exception. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 750c129fa3b863..d343c0febdd60c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -709,8 +709,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); - if (!vcpu_mode_priv(vcpu)) + if (!vcpu_mode_priv(vcpu)) { + kvm_inject_undefined(vcpu); return false; + } if (p->is_write) { u64 val = p->regval & mask; @@ -780,8 +782,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return trap_raz_wi(vcpu, p, r); if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) + if (!vcpu_mode_priv(vcpu)) { + kvm_inject_undefined(vcpu); return false; + } vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; From 7b5b4df1a79954e0b208630fc63b16ec0231a516 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:40 +0100 Subject: [PATCH 106/227] arm64: KVM: Make unexpected reads from WO registers inject an undef Reads from write-only system registers are generally confined to EL1 and not propagated to EL2 (that's what the architecture mandates). In order to be sure that we have a sane behaviour even in the unlikely event that we have a broken system, we still handle it in KVM. In that case, let's inject an undef into the guest. Let's also remove write_to_read_only which isn't used anywhere. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 9 +++++++++ arch/arm64/kvm/sys_regs.h | 18 ------------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d343c0febdd60c..20f90c01af33ba 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -55,6 +55,15 @@ * 64bit interface.
*/ +static bool read_from_write_only(struct kvm_vcpu *vcpu, + const struct sys_reg_params *params) +{ + WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9c6ffd0f0196ed..638f724e45afff 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -83,24 +83,6 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg write to read-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg read to write-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - /* Reset functions */ static inline void reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) From e04432301607a6750e31d7db4cc4dfdd742ee7f8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:41 +0100 Subject: [PATCH 107/227] arm64: KVM: PMU: Inject UNDEF on read access to PMSWINC_EL0 PMSWINC_EL0 is a WO register, so let's UNDEF when reading from it (in the highly hypothetical case where this doesn't UNDEF at EL1). Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 20f90c01af33ba..3fef01d751642d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -772,16 +772,15 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); + if (!p->is_write) + return read_from_write_only(vcpu, p); + if (pmu_write_swinc_el0_disabled(vcpu)) return false; - if (p->is_write) { - mask = kvm_pmu_valid_counter_mask(vcpu); - kvm_pmu_software_increment(vcpu, p->regval & mask); - return true; - } - - return false; + mask = kvm_pmu_valid_counter_mask(vcpu); + kvm_pmu_software_increment(vcpu, p->regval & mask); + return true; } static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, From e70b952263445ef3f0e4934d7277145f0fecd709 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:42 +0100 Subject: [PATCH 108/227] arm64: KVM: Treat sysreg accessors returning false as successful Instead of considering that a sysreg accessor has failed when returning false, let's consider that it is *always* successful (after all, we won't stand for an incomplete emulation). The return value now simply indicates whether we should skip the instruction (because it has now been emulated), or if we should leave the PC alone if the emulation has injected an exception. 
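Put differently, the contract for an accessor after this change is as follows (illustrative sketch only; the names access_example and access_allowed are hypothetical):

        static bool access_example(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                                   const struct sys_reg_desc *r)
        {
                if (!access_allowed(vcpu)) {
                        kvm_inject_undefined(vcpu);
                        return false;   /* exception injected, leave the PC alone */
                }

                /* ... emulate the access ... */
                return true;            /* emulated, skip the trapped instruction */
        }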
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 49 ++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3fef01d751642d..2f4418ed4a70b2 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1571,6 +1571,22 @@ int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +static void perform_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + /* + * Not having an accessor means that we have configured a trap + * that we don't know how to handle. This certainly qualifies + * as a gross bug that should be fixed right away. + */ + BUG_ON(!r->access); + + /* Skip instruction if instructed so */ + if (likely(r->access(vcpu, params, r))) + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); +} + /* * emulate_cp -- tries to match a sys_reg access in a handling table, and * call the corresponding trap handler. @@ -1594,20 +1610,8 @@ static int emulate_cp(struct kvm_vcpu *vcpu, r = find_reg(params, table, num); if (r) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. - */ - BUG_ON(!r->access); - - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - /* Handled */ - return 0; - } + perform_access(vcpu, params, r); + return 0; } /* Not handled */ @@ -1777,26 +1781,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); if (likely(r)) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. - */ - BUG_ON(!r->access); - - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; - } - /* If access function fails, it should complain. */ + perform_access(vcpu, params, r); } else { kvm_err("Unsupported guest sys_reg access at: %lx\n", *vcpu_pc(vcpu)); print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); } - kvm_inject_undefined(vcpu); return 1; } From b6b7a8069d5f44bcc248f5d067cdb22debadfa56 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:43 +0100 Subject: [PATCH 109/227] arm64: KVM: Do not corrupt registers on failed 64bit CP read If we fail to emulate a mrrc instruction, we: 1) deliver an exception, 2) spit a nastygram on the console, 3) write back some garbage to Rt/Rt2 While 1) and 2) are perfectly acceptable, 3) is out of the scope of the architecture... Let's mimick the code in kvm_handle_cp_32 and be more cautious. 
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 2f4418ed4a70b2..582d68ee4b01dd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1678,20 +1678,25 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, params.regval |= vcpu_get_reg(vcpu, Rt2) << 32; } - if (!emulate_cp(vcpu, &params, target_specific, nr_specific)) - goto out; - if (!emulate_cp(vcpu, &params, global, nr_global)) - goto out; - - unhandled_cp_access(vcpu, &params); + /* + * Try to emulate the coprocessor access using the target + * specific table first, and using the global table afterwards. + * If either of the tables contains a handler, handle the + * potential register operation in the case of a read and return + * with success. + */ + if (!emulate_cp(vcpu, &params, target_specific, nr_specific) || + !emulate_cp(vcpu, &params, global, nr_global)) { + /* Split up the value between registers for the read side */ + if (!params.is_write) { + vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); + vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); + } -out: - /* Split up the value between registers for the read side */ - if (!params.is_write) { - vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); - vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); + return 1; } + unhandled_cp_access(vcpu, &params); return 1; } From b1d4cb69838f71d62dc0ecf21566a1425835f015 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:44 +0100 Subject: [PATCH 110/227] arm: KVM: Make unexpected register accesses inject an undef Reads from write-only system registers are generally confined to EL1 and not propagated to EL2 (that's what the architecture mandates). In order to be sure that we have a sane behaviour even in the unlikely event that we have a broken system, we still handle it in KVM. Same goes for writes to RO registers. In that case, let's inject an undef into the guest.
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 18 ++++++++++++++++++ arch/arm/kvm/coproc.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 3e5e4194ef8696..519aac12b3651f 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -40,6 +40,24 @@ * Co-processor emulation *****************************************************************************/ +static bool write_to_read_only(struct kvm_vcpu *vcpu, + const struct coproc_params *params) +{ + WARN_ONCE(1, "CP15 write to read-only register\n"); + print_cp_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + +static bool read_from_write_only(struct kvm_vcpu *vcpu, + const struct coproc_params *params) +{ + WARN_ONCE(1, "CP15 read to write-only register\n"); + print_cp_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index eef1759c2b65b9..3a41b7d1eb86ef 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -81,24 +81,6 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - kvm_debug("CP15 write to read-only register at: %08lx\n", - *vcpu_pc(vcpu)); - print_cp_instr(params); - return false; -} - -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - kvm_debug("CP15 read to write-only register at: %08lx\n", - *vcpu_pc(vcpu)); - print_cp_instr(params); - return false; -} - /* Reset functions */ static inline void reset_unknown(struct kvm_vcpu *vcpu, const struct coproc_reg *r) From 9d0d4d34d9b527a1514d32836708be31f54c3c0b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 27 Mar 2017 17:03:45 +0100 Subject: [PATCH 111/227] arm: KVM: Treat CP15 accessors returning false as successful Instead of considering that a CP15 accessor has failed when returning false, let's consider that it is *always* successful (after all, we won't stand for an incomplete emulation). The return value now simply indicates whether we should skip the instruction (because it has now been emulated), or if we should leave the PC alone if the emulation has injected an exception. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 519aac12b3651f..2c14b69511e981 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -520,15 +520,15 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, if (likely(r->access(vcpu, params, r))) { /* Skip instruction, since it was emulated */ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; } - /* If access function fails, it should complain. */ } else { + /* If access function fails, it should complain. 
*/ kvm_err("Unsupported guest CP15 access at: %08lx\n", *vcpu_pc(vcpu)); print_cp_instr(params); + kvm_inject_undefined(vcpu); } - kvm_inject_undefined(vcpu); + return 1; } From 50d912cc3e5055c14e13a8e7de5c078b01a2e2b7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:34 +0100 Subject: [PATCH 112/227] arm64: hyp-stub: Stop pointlessly clobbering lr When entering the kernel hyp stub, we check whether or not we've made it here through an HVC instruction, clobbering lr (aka x30) in the process. This is completely pointless, as HVC is the only way to get here (all traps to EL2 are disabled, no interrupt override is applied). So let's remove this bit of code whose only point is to corrupt a valuable register. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kernel/hyp-stub.S | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index d3b5f75e652efc..e4215ad06930ad 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -55,12 +55,6 @@ ENDPROC(__hyp_stub_vectors) .align 11 el1_sync: - mrs x30, esr_el2 - lsr x30, x30, #ESR_ELx_EC_SHIFT - - cmp x30, #ESR_ELx_EC_HVC64 - b.ne 9f // Not an HVC trap - cmp x0, #HVC_GET_VECTORS b.ne 1f mrs x0, vbar_el2 From 6c9ae25dfcaea32512fc27308c64e78159c46aeb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:35 +0100 Subject: [PATCH 113/227] arm64: KVM: Move lr save/restore to do_el2_call At the moment, we only save/restore lr if on VHE, as we rely only the EL1 code to have preserved it in the non-VHE case. As we're about to get rid of the latter, let's move the save/restore code to the do_el2_call macro, unifying both code paths. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp.S | 3 --- arch/arm64/kvm/hyp/hyp-entry.S | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 2726635dceba83..f6f20b58d65a58 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -38,13 +38,10 @@ * A function pointer with a value less than 0xfff has a special meaning, * and is used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. - * HVC behaves as a 'bl' call and will clobber lr. */ ENTRY(__kvm_call_hyp) alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - str lr, [sp, #-16]! hvc #0 - ldr lr, [sp], #16 ret alternative_else_nop_endif b __vhe_hyp_call diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 5e9052f087f289..d8ef788646c605 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -32,17 +32,17 @@ * Shuffle the parameters before calling the function * pointed to in x0. Assumes parameters in x[1,2,3]. */ + str lr, [sp, #-16]! mov lr, x0 mov x0, x1 mov x1, x2 mov x2, x3 blr lr + ldr lr, [sp], #16 .endm ENTRY(__vhe_hyp_call) - str lr, [sp, #-16]! do_el2_call - ldr lr, [sp], #16 /* * We used to rely on having an exception return to get * an implicit isb. In the E2H case, we don't have it anymore. From 9bae3ae513015eea9534cc2c2b6f7e405abad71a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:36 +0100 Subject: [PATCH 114/227] arm64: hyp-stub: Don't save lr in the EL1 code The EL2 code is not corrupting lr anymore, so don't bother preserving it in the EL1 trampoline code. 
Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kernel/hyp-stub.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index e4215ad06930ad..193dfb25ce2f8b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -116,18 +116,14 @@ ENDPROC(\label) */ ENTRY(__hyp_get_vectors) - str lr, [sp, #-16]! mov x0, #HVC_GET_VECTORS hvc #0 - ldr lr, [sp], #16 ret ENDPROC(__hyp_get_vectors) ENTRY(__hyp_set_vectors) - str lr, [sp, #-16]! mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 - ldr lr, [sp], #16 ret ENDPROC(__hyp_set_vectors) From 4993fdcf399f59ed56a16ecfedf8a61066198816 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:37 +0100 Subject: [PATCH 115/227] arm64: hyp-stub: Define a return value for failed stub calls Define a standard return value to be returned when a hyp stub call fails, and make KVM use it for ARM_EXCEPTION_HYP_GONE (instead of using a KVM-specific value). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/include/asm/virt.h | 3 +++ arch/arm64/kernel/hyp-stub.S | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 49f99cd02613cf..b7e4ef5fcc410f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -28,7 +28,7 @@ #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 /* The hyp-stub will return this for any kvm_call_hyp() call */ -#define ARM_EXCEPTION_HYP_GONE 3 +#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 439f6b5d31f675..1466d14c664d9f 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -39,6 +39,9 @@ */ #define HVC_SOFT_RESTART 2 +/* Error returned when an invalid stub number is passed into x0 */ +#define HVC_STUB_ERR 0xbadca11 + #define BOOT_CPU_MODE_EL1 (0xe11) #define BOOT_CPU_MODE_EL2 (0xe12) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 193dfb25ce2f8b..f53e8b84cd3a62 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -74,7 +74,7 @@ el1_sync: br x4 // no return /* Someone called kvm_call_hyp() against the hyp-stub... */ -3: mov x0, #ARM_EXCEPTION_HYP_GONE +3: ldr x0, =HVC_STUB_ERR 9: eret ENDPROC(el1_sync) From 810c86ee4ee70928952fe17bc593d569052d8198 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:38 +0100 Subject: [PATCH 116/227] arm64: hyp-stub: Update documentation in asm/virt.h Comments in asm/virt.h are slightly out of date, so let's align them with the new behaviour of the code. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/virt.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 1466d14c664d9f..1569c3a0d7943b 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -19,9 +19,14 @@ #define __ASM__VIRT_H /* - * The arm64 hcall implementation uses x0 to specify the hcall type. A value - * less than 0xfff indicates a special hcall, such as get/set vector. - * Any other value is used as a pointer to the function to call. 
+ * The arm64 hcall implementation uses x0 to specify the hcall + * number. A value less than HVC_STUB_HCALL_NR indicates a special + * hcall, such as set vector. Any other value is handled in a + * hypervisor specific way. + * + * The hypercall is allowed to clobber any of the caller-saved + * registers (x0-x18), so it is advisable to use it through the + * indirection of a function call (as implemented in hyp-stub.S). */ /* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */ From fd0e0c617089e7ba8333c55bd2e318dfc128bf33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:39 +0100 Subject: [PATCH 117/227] arm64: hyp-stub: Implement HVC_RESET_VECTORS stub hypercall Let's define a new stub hypercall that resets the HYP configuration to its default: hyp-stub vectors, and MMU disabled. Of course, for the hyp-stub itself, this is a trivial no-op. Hypervisors will have a bit more work to do. Acked-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/virt.h | 9 +++++++++ arch/arm64/kernel/hyp-stub.S | 11 ++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 1569c3a0d7943b..435514c52b0fea 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -44,6 +44,14 @@ */ #define HVC_SOFT_RESTART 2 +/* + * HVC_RESET_VECTORS - Restore the vectors to the original HYP stubs + */ +#define HVC_RESET_VECTORS 3 + +/* Max number of HYP stub hypercalls */ +#define HVC_STUB_HCALL_NR 4 + /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 @@ -70,6 +78,7 @@ extern u32 __boot_cpu_mode[2]; void __hyp_set_vectors(phys_addr_t phys_vector_base); phys_addr_t __hyp_get_vectors(void); +void __hyp_reset_vectors(void); /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index f53e8b84cd3a62..8226fd90f73c63 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -73,8 +73,11 @@ el1_sync: mov x1, x3 br x4 // no return +3: cmp x0, #HVC_RESET_VECTORS + beq 9f // Nothing to reset! + /* Someone called kvm_call_hyp() against the hyp-stub... */ -3: ldr x0, =HVC_STUB_ERR + ldr x0, =HVC_STUB_ERR 9: eret ENDPROC(el1_sync) @@ -127,3 +130,9 @@ ENTRY(__hyp_set_vectors) hvc #0 ret ENDPROC(__hyp_set_vectors) + +ENTRY(__hyp_reset_vectors) + mov x0, #HVC_RESET_VECTORS + hvc #0 + ret +ENDPROC(__hyp_reset_vectors) From fb1b4e01d2ad9faec82c626f9bd7699daf8d09f5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:40 +0100 Subject: [PATCH 118/227] arm64: KVM: Implement HVC_RESET_VECTORS stub hypercall in the init code In order to restore HYP mode to its original condition, KVM currently implements __kvm_hyp_reset(). As we're moving towards a hyp-stub defined API, it becomes necessary to implement HVC_RESET_VECTORS. This patch adds the HVC_RESET_VECTORS hypercall to the KVM init code, which so far lacked any form of hypercall support. 
Acked-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp-init.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6b29d3d9e1f285..5e39ad53bcae22 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -22,6 +22,7 @@ #include #include #include +#include .text .pushsection .hyp.idmap.text, "ax" @@ -58,6 +59,9 @@ __invalid: * x2: HYP vectors */ __do_hyp_init: + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.lo __kvm_handle_stub_hvc msr ttbr0_el2, x0 @@ -119,6 +123,9 @@ __do_hyp_init: eret ENDPROC(__kvm_hyp_init) +ENTRY(__kvm_handle_stub_hvc) + cmp x0, #HVC_RESET_VECTORS + b.ne 1f /* * Reset kvm back to the hyp stub. */ @@ -133,9 +140,15 @@ ENTRY(__kvm_hyp_reset) /* Install stub vectors */ adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 + b exit +1: /* Bad stub call */ + ldr x0, =HVC_STUB_ERR + +exit: eret ENDPROC(__kvm_hyp_reset) +ENDPROC(__kvm_handle_stub_hvc) .ltorg From 82529d9bfd6b7b0e8177be94ff768161d5064264 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:41 +0100 Subject: [PATCH 119/227] arm64: KVM: Implement HVC_GET_VECTORS in the init code Now that we have an infrastructure to handle hypercalls in the KVM init code, let's implement HVC_GET_VECTORS there. Acked-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp-init.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 5e39ad53bcae22..fded932ede1b2f 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -124,7 +124,12 @@ __do_hyp_init: ENDPROC(__kvm_hyp_init) ENTRY(__kvm_handle_stub_hvc) - cmp x0, #HVC_RESET_VECTORS + cmp x0, #HVC_GET_VECTORS + b.ne 1f + mrs x0, vbar_el2 + b exit + +1: cmp x0, #HVC_RESET_VECTORS b.ne 1f /* * Reset kvm back to the hyp stub. From 5fbe9a599a30940c10ede35f90eb7014a6c2ed70 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:42 +0100 Subject: [PATCH 120/227] arm64: KVM: Allow the main HYP code to use the init hyp stub implementation We now have a full hyp-stub implementation in the KVM init code, but the main KVM code only supports HVC_GET_VECTORS, which is not enough. Instead of reinventing the wheel, let's reuse the init implementation by branching to the idmap page when called with a hyp-stub hypercall. Acked-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/hyp-entry.S | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index d8ef788646c605..4f34c5996f8635 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -87,10 +87,24 @@ alternative_endif /* Here, we're pretty sure the host called HVC. */ ldp x0, x1, [sp], #16 - cmp x0, #HVC_GET_VECTORS - b.ne 1f - mrs x0, vbar_el2 - b 2f + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs 1f + + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. Since we use kimage_voffset, do not use the + * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead + * (by loading it from the constant pool). + * + * Preserve x0-x4, which may contain stub parameters. 
+ */ + ldr x5, =__kvm_handle_stub_hvc + ldr_l x6, kimage_voffset + + /* x5 = __pa(x5) */ + sub x5, x5, x6 + br x5 1: /* @@ -99,7 +113,7 @@ alternative_endif kern_hyp_va x0 do_el2_call -2: eret + eret el1_trap: /* From 4adb1341c7ef68af54732ef11f69faedffa6acb7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:43 +0100 Subject: [PATCH 121/227] arm64: KVM: Convert __cpu_reset_hyp_mode to using __hyp_reset_vectors We are now able to use the hyp stub to reset HYP mode. Time to kiss __kvm_hyp_reset goodbye, and use __hyp_reset_vectors. Acked-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_asm.h | 1 - arch/arm64/include/asm/kvm_host.h | 3 +-- arch/arm64/kvm/hyp-init.S | 2 -- arch/arm64/kvm/hyp/hyp-entry.S | 15 --------------- 4 files changed, 1 insertion(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b7e4ef5fcc410f..26a64d0f9ab919 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -47,7 +47,6 @@ struct kvm_vcpu; extern char __kvm_hyp_init[]; extern char __kvm_hyp_init_end[]; -extern char __kvm_hyp_reset[]; extern char __kvm_hyp_vector[]; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7705e7bb07b13..0355dd1099562c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -362,11 +362,10 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); } -void __kvm_hyp_teardown(void); static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, phys_addr_t phys_idmap_start) { - kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); + __hyp_reset_vectors(); } static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index fded932ede1b2f..b7a8f12293cca0 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -134,7 +134,6 @@ ENTRY(__kvm_handle_stub_hvc) /* * Reset kvm back to the hyp stub. */ -ENTRY(__kvm_hyp_reset) /* We're now in idmap, disable MMU */ mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS @@ -152,7 +151,6 @@ ENTRY(__kvm_hyp_reset) exit: eret -ENDPROC(__kvm_hyp_reset) ENDPROC(__kvm_handle_stub_hvc) .ltorg diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 4f34c5996f8635..5170ce1021da05 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -53,21 +53,6 @@ ENTRY(__vhe_hyp_call) ret ENDPROC(__vhe_hyp_call) -/* - * Compute the idmap address of __kvm_hyp_reset based on the idmap - * start passed as a parameter, and jump there. - * - * x0: HYP phys_idmap_start - */ -ENTRY(__kvm_hyp_teardown) - mov x4, x0 - adr_l x3, __kvm_hyp_reset - - /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ - bfi x4, x3, #0, #PAGE_SHIFT - br x4 -ENDPROC(__kvm_hyp_teardown) - el1_sync: // Guest trapped into EL2 stp x0, x1, [sp, #-16]! From 506c372ac41dc0c7c1f50f718a8a53cb0748ea96 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:44 +0100 Subject: [PATCH 122/227] arm64: KVM: Implement HVC_SOFT_RESTART in the init code Another missing stub hypercall is HVC_SOFT_RESTART. It turns out that it is pretty easy to implement in terms of HVC_RESET_VECTORS (since it needs to turn the MMU off). 
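For illustration only (this sketch is not part of the change, and the wrapper name is made up; it assumes the HVC_* definitions from asm/virt.h): a caller at EL1 that wants to be restarted at EL2 issues the hypercall with x0 = HVC_SOFT_RESTART, x1 = the address to jump to, and x2-x4 becoming the first three arguments of the new payload, roughly like this:

    #include <asm/virt.h>

    static void hyp_soft_restart(unsigned long entry, unsigned long a0,
                                 unsigned long a1, unsigned long a2)
    {
            register unsigned long x0 asm("x0") = HVC_SOFT_RESTART;
            register unsigned long x1 asm("x1") = entry;
            register unsigned long x2 asm("x2") = a0;
            register unsigned long x3 asm("x3") = a1;
            register unsigned long x4 asm("x4") = a2;

            /* Does not return: the stub jumps to 'entry' while at EL2. */
            asm volatile("hvc #0"
                         : : "r" (x0), "r" (x1), "r" (x2),
                             "r" (x3), "r" (x4)
                         : "memory");
    }
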
Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp-init.S | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index b7a8f12293cca0..0ad34fd6d2fc86 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -129,21 +129,36 @@ ENTRY(__kvm_handle_stub_hvc) mrs x0, vbar_el2 b exit +1: cmp x0, #HVC_SOFT_RESTART + b.ne 1f + + /* This is where we're about to jump, staying at EL2 */ + msr elr_el2, x1 + mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT | PSR_MODE_EL2h) + msr spsr_el2, x0 + + /* Shuffle the arguments, and don't come back */ + mov x0, x2 + mov x1, x3 + mov x2, x4 + b reset + 1: cmp x0, #HVC_RESET_VECTORS b.ne 1f +reset: /* - * Reset kvm back to the hyp stub. + * Reset kvm back to the hyp stub. Do not clobber x0-x4 in + * case we coming via HVC_SOFT_RESTART. */ - /* We're now in idmap, disable MMU */ - mrs x0, sctlr_el2 - ldr x1, =SCTLR_ELx_FLAGS - bic x0, x0, x1 // Clear SCTL_M and etc - msr sctlr_el2, x0 + mrs x5, sctlr_el2 + ldr x6, =SCTLR_ELx_FLAGS + bic x5, x5, x6 // Clear SCTL_M and etc + msr sctlr_el2, x5 isb /* Install stub vectors */ - adr_l x0, __hyp_stub_vectors - msr vbar_el2, x0 + adr_l x5, __hyp_stub_vectors + msr vbar_el2, x5 b exit 1: /* Bad stub call */ From 1342337bc80a5bfb9aa83574da9fb2e22cc64121 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 3 Apr 2017 19:37:45 +0100 Subject: [PATCH 123/227] ARM: hyp-stub: improve ABI Improve the hyp-stub ABI to allow it to do more than just get/set the vectors. We follow the example in ARM64, where r0 is used as an opcode with the other registers as an argument. Tested-by: Keerthy Acked-by: Catalin Marinas Signed-off-by: Russell King Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kernel/hyp-stub.S | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 15d073ae5da2a2..f3e9ba5fb642b1 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -22,6 +22,9 @@ #include #include +#define HVC_GET_VECTORS 0 +#define HVC_SET_VECTORS 1 + #ifndef ZIMAGE /* * For the kernel proper, we need to find out the CPU boot mode long after @@ -202,9 +205,19 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE ENDPROC(__hyp_stub_install_secondary) __hyp_stub_do_trap: - cmp r0, #-1 - mrceq p15, 4, r0, c12, c0, 0 @ get HVBAR - mcrne p15, 4, r0, c12, c0, 0 @ set HVBAR + teq r0, #HVC_GET_VECTORS + bne 1f + mrc p15, 4, r0, c12, c0, 0 @ get HVBAR + b __hyp_stub_exit + +1: teq r0, #HVC_SET_VECTORS + bne 1f + mcr p15, 4, r1, c12, c0, 0 @ set HVBAR + b __hyp_stub_exit + +1: mov r0, #-1 + +__hyp_stub_exit: __ERET ENDPROC(__hyp_stub_do_trap) @@ -231,10 +244,14 @@ ENDPROC(__hyp_stub_do_trap) * initialisation entry point. 
*/ ENTRY(__hyp_get_vectors) - mov r0, #-1 + mov r0, #HVC_GET_VECTORS + __HVC(0) + ret lr ENDPROC(__hyp_get_vectors) - @ fall through + ENTRY(__hyp_set_vectors) + mov r1, r0 + mov r0, #HVC_SET_VECTORS __HVC(0) ret lr ENDPROC(__hyp_set_vectors) From 9da5ac236de6ab2189c999eb9ddddeef1431ab68 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 3 Apr 2017 19:37:46 +0100 Subject: [PATCH 124/227] ARM: soft-reboot into same mode that we entered the kernel When we soft-reboot (eg, kexec) from one kernel into the next, we need to ensure that we enter the new kernel in the same processor mode as when we were entered, so that (eg) the new kernel can install its own hypervisor - the old kernel's hypervisor will have been overwritten. In order to do this, we need to pass a flag to cpu_reset() so it knows what to do, and we need to modify the kernel's own hypervisor stub to allow it to handle a soft-reboot. As we are always guaranteed to install our own hypervisor if we're entered in HYP32 mode, and KVM will have moved itself out of the way on kexec/normal reboot, we can assume that our hypervisor is in place when we want to kexec, so changing our hypervisor API should not be a problem. Tested-by: Keerthy Acked-by: Catalin Marinas Signed-off-by: Russell King Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/proc-fns.h | 4 ++-- arch/arm/kernel/hyp-stub.S | 13 +++++++++++++ arch/arm/kernel/reboot.c | 7 +++++-- arch/arm/mm/proc-v7.S | 12 ++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h index 8877ad5ffe103f..f2e1af45bd6fae 100644 --- a/arch/arm/include/asm/proc-fns.h +++ b/arch/arm/include/asm/proc-fns.h @@ -43,7 +43,7 @@ extern struct processor { /* * Special stuff for a reset */ - void (*reset)(unsigned long addr) __attribute__((noreturn)); + void (*reset)(unsigned long addr, bool hvc) __attribute__((noreturn)); /* * Idle the processor */ @@ -88,7 +88,7 @@ extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte); #else extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte, unsigned int ext); #endif -extern void cpu_reset(unsigned long addr) __attribute__((noreturn)); +extern void cpu_reset(unsigned long addr, bool hvc) __attribute__((noreturn)); /* These three are private to arch/arm/kernel/suspend.c */ extern void cpu_do_suspend(void *); diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index f3e9ba5fb642b1..82915231c6f850 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -24,6 +24,7 @@ #define HVC_GET_VECTORS 0 #define HVC_SET_VECTORS 1 +#define HVC_SOFT_RESTART 2 #ifndef ZIMAGE /* @@ -215,6 +216,10 @@ __hyp_stub_do_trap: mcr p15, 4, r1, c12, c0, 0 @ set HVBAR b __hyp_stub_exit +1: teq r0, #HVC_SOFT_RESTART + bne 1f + bx r3 + 1: mov r0, #-1 __hyp_stub_exit: @@ -256,6 +261,14 @@ ENTRY(__hyp_set_vectors) ret lr ENDPROC(__hyp_set_vectors) +ENTRY(__hyp_soft_restart) + mov r3, r0 + mov r0, #HVC_SOFT_RESTART + __HVC(0) + mov r0, r3 + ret lr +ENDPROC(__hyp_soft_restart) + #ifndef ZIMAGE .align 2 .L__boot_cpu_mode_offset: diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c index 3fa867a2aae672..3b2aa9a9fe268d 100644 --- a/arch/arm/kernel/reboot.c +++ b/arch/arm/kernel/reboot.c @@ -12,10 +12,11 @@ #include #include +#include #include "reboot.h" -typedef void (*phys_reset_t)(unsigned long); +typedef void (*phys_reset_t)(unsigned long, bool); /* * Function pointers to optional machine specific functions @@ -51,7 +52,9 @@ static void 
__soft_restart(void *addr) /* Switch to the identity mapping. */ phys_reset = (phys_reset_t)virt_to_idmap(cpu_reset); - phys_reset((unsigned long)addr); + + /* original stub should be restored by kvm */ + phys_reset((unsigned long)addr, is_hyp_mode_available()); /* Should never get here. */ BUG(); diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index d00d52c9de3e02..1846ca4255d0f1 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -53,11 +53,15 @@ ENDPROC(cpu_v7_proc_fin) .align 5 .pushsection .idmap.text, "ax" ENTRY(cpu_v7_reset) - mrc p15, 0, r1, c1, c0, 0 @ ctrl register - bic r1, r1, #0x1 @ ...............m - THUMB( bic r1, r1, #1 << 30 ) @ SCTLR.TE (Thumb exceptions) - mcr p15, 0, r1, c1, c0, 0 @ disable MMU + mrc p15, 0, r2, c1, c0, 0 @ ctrl register + bic r2, r2, #0x1 @ ...............m + THUMB( bic r2, r2, #1 << 30 ) @ SCTLR.TE (Thumb exceptions) + mcr p15, 0, r2, c1, c0, 0 @ disable MMU isb +#ifdef CONFIG_ARM_VIRT_EXT + teq r1, #0 + bne __hyp_soft_restart +#endif bx r0 ENDPROC(cpu_v7_reset) .popsection From 467f97b72b275ca24a390c2aec383c5f3d91b97a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:47 +0100 Subject: [PATCH 125/227] ARM: KVM: Convert KVM to use HVC_GET_VECTORS The conversion of the HYP stub ABI to something similar to arm64 left the KVM code broken, as it doesn't know about the new stub numbering. Let's move the various #defines to virt.h, and let KVM use HVC_GET_VECTORS. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 8 ++++++++ arch/arm/kernel/hyp-stub.S | 4 ---- arch/arm/kvm/hyp/hyp-entry.S | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index 6dae1956c74d76..4ea16fcaf2edc1 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -94,6 +94,14 @@ extern char __hyp_text_start[]; extern char __hyp_text_end[]; #endif +#else + +/* Only assembly code should need those */ + +#define HVC_GET_VECTORS 0 +#define HVC_SET_VECTORS 1 +#define HVC_SOFT_RESTART 2 + #endif /* __ASSEMBLY__ */ #endif /* ! VIRT_H */ diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 82915231c6f850..8301db963d8336 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -22,10 +22,6 @@ #include #include -#define HVC_GET_VECTORS 0 -#define HVC_SET_VECTORS 1 -#define HVC_SOFT_RESTART 2 - #ifndef ZIMAGE /* * For the kernel proper, we need to find out the CPU boot mode long after diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index 96beb53934c976..1f8db7d21fc55b 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -127,7 +127,7 @@ hyp_hvc: pop {r0, r1, r2} /* Check for __hyp_get_vectors */ - cmp r0, #-1 + cmp r0, #HVC_GET_VECTORS mrceq p15, 4, r0, c12, c0, 0 @ get HVBAR beq 1f From 6b85677c38510e09d482a9bd57000a099003a898 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:48 +0100 Subject: [PATCH 126/227] ARM: Update cpu_v7_reset documentation cpu_v7_reset() now takes a second parameter indicating whether we should reboot in HYP or not. Update the documentation to reflect this. 
Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/mm/proc-v7.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 1846ca4255d0f1..01d64c0b2563b7 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -39,13 +39,14 @@ ENTRY(cpu_v7_proc_fin) ENDPROC(cpu_v7_proc_fin) /* - * cpu_v7_reset(loc) + * cpu_v7_reset(loc, hyp) * * Perform a soft reset of the system. Put the CPU into the * same state as it would be if it had been reset, and branch * to what would be the reset vector. * * - loc - location to jump to for soft reset + * - hyp - indicate if restart occurs in HYP mode * * This code must be executed using a flat identity mapping with * caches disabled. From 6b52f7bdb8883b9c6a47a5558e6b08834c952fb3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:49 +0100 Subject: [PATCH 127/227] ARM: hyp-stub: Use r1 for the soft-restart address It is not really obvious why the restart address should be in r3 when communicated to the hyp-stub. r1 should be perfectly adequate, and consistent with the rest of the code. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kernel/hyp-stub.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 8301db963d8336..15eaa14322a70d 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -214,7 +214,7 @@ __hyp_stub_do_trap: 1: teq r0, #HVC_SOFT_RESTART bne 1f - bx r3 + bx r1 1: mov r0, #-1 @@ -258,10 +258,9 @@ ENTRY(__hyp_set_vectors) ENDPROC(__hyp_set_vectors) ENTRY(__hyp_soft_restart) - mov r3, r0 + mov r1, r0 mov r0, #HVC_SOFT_RESTART __HVC(0) - mov r0, r3 ret lr ENDPROC(__hyp_soft_restart) From cf763e4ede9deca45c83d7a59c8e798c4d079da2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:50 +0100 Subject: [PATCH 128/227] ARM: Expose the VA/IDMAP offset The KVM code needs to be able to compute the address of symbols in its idmap page (the equivalent of a virt_to_idmap() call). Unfortunately, virt_to_idmap is slightly complicated, depending on the use of arch_phys_to_idmap_offset or not, and none of that is readily available at HYP. Instead, expose a single kimage_voffset variable which contains the offset between a kernel VA and its idmap address, enabling the VA->IDMAP conversion. This allows the KVM code to behave similarily to its arm64 counterpart. 
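Purely as a sketch of the conversion this variable enables (the helper below is illustrative only; the real consumer does the subtraction in assembly at HYP, with the MMU off):

    extern unsigned long kimage_voffset;

    /* kimage_voffset = VA - idmap(VA), so: */
    static inline unsigned long kernel_va_to_idmap(unsigned long va)
    {
            return va - kimage_voffset;
    }
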
Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/mm/mmu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4e016d7f37b3af..e98a2b5c4e8506 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -87,6 +87,8 @@ struct cachepolicy { #define s2_policy(policy) 0 #endif +unsigned long kimage_voffset __ro_after_init; + static struct cachepolicy cache_policies[] __initdata = { { .policy = "uncached", @@ -1635,4 +1637,7 @@ void __init paging_init(const struct machine_desc *mdesc) empty_zero_page = virt_to_page(zero_page); __flush_dcache_page(NULL, empty_zero_page); + + /* Compute the virt/idmap offset, mostly for the sake of KVM */ + kimage_voffset = (unsigned long)&kimage_voffset - virt_to_idmap(&kimage_voffset); } From 4c70cf07cecab1dd5446249dc8d4795bec693c78 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:51 +0100 Subject: [PATCH 129/227] ARM: hyp-stub: Define a return value for failed stub calls Define a standard return value to be returned when a hyp stub call fails. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 2 ++ arch/arm/kernel/hyp-stub.S | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index 4ea16fcaf2edc1..c16f70d6953f1c 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -104,4 +104,6 @@ extern char __hyp_text_end[]; #endif /* __ASSEMBLY__ */ +#define HVC_STUB_ERR 0xbadca11 + #endif /* ! VIRT_H */ diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 15eaa14322a70d..b20ca88159115f 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -216,7 +216,7 @@ __hyp_stub_do_trap: bne 1f bx r1 -1: mov r0, #-1 +1: ldr r0, =HVC_STUB_ERR __hyp_stub_exit: __ERET From 7d1bf4e05717d900e69b249b4f08f6b36b00fbf0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:52 +0100 Subject: [PATCH 130/227] ARM: hyp-stub: Implement HVC_RESET_VECTORS stub hypercall Let's define a new stub hypercall that resets the HYP configuration to its default: hyp-stub vectors, and MMU disabled. Of course, for the hyp-stub itself, this is a trivial no-op. Hypervisors will have a bit more work to do. 
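As a usage sketch (not part of this patch, but it mirrors what the KVM patches later in this series end up doing): a kernel-side teardown path just calls the new wrapper, which traps into whatever is currently installed at HYP:

    #include <asm/virt.h>

    static void example_teardown_hyp(void)    /* hypothetical helper */
    {
            /* Only meaningful when the kernel itself runs below HYP. */
            if (!is_kernel_in_hyp_mode())
                    __hyp_reset_vectors();
    }
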
Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 3 +++ arch/arm/kernel/hyp-stub.S | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index c16f70d6953f1c..c5a27570b2256b 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -101,6 +101,9 @@ extern char __hyp_text_end[]; #define HVC_GET_VECTORS 0 #define HVC_SET_VECTORS 1 #define HVC_SOFT_RESTART 2 +#define HVC_RESET_VECTORS 3 + +#define HVC_STUB_HCALL_NR 4 #endif /* __ASSEMBLY__ */ diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index b20ca88159115f..e637854335aa43 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -216,7 +216,10 @@ __hyp_stub_do_trap: bne 1f bx r1 -1: ldr r0, =HVC_STUB_ERR +1: teq r0, #HVC_RESET_VECTORS + beq __hyp_stub_exit + + ldr r0, =HVC_STUB_ERR __hyp_stub_exit: __ERET @@ -264,6 +267,12 @@ ENTRY(__hyp_soft_restart) ret lr ENDPROC(__hyp_soft_restart) +ENTRY(__hyp_reset_vectors) + mov r0, #HVC_RESET_VECTORS + __HVC(0) + ret lr +ENDPROC(__hyp_reset_vectors) + #ifndef ZIMAGE .align 2 .L__boot_cpu_mode_offset: From bc845e4fbbbbe97bab3f1bcf688be0b5da420717 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:53 +0100 Subject: [PATCH 131/227] ARM: KVM: Implement HVC_RESET_VECTORS stub hypercall in the init code In order to restore HYP mode to its original condition, KVM currently implements __kvm_hyp_reset(). As we're moving towards a hyp-stub defined API, it becomes necessary to implement HVC_RESET_VECTORS. This patch adds the HVC_RESET_VECTORS hypercall to the KVM init code, which so far lacked any form of hypercall support. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 1 + arch/arm/kernel/hyp-stub.S | 2 +- arch/arm/kvm/init.S | 33 +++++++++++++++++++++++++++------ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index c5a27570b2256b..663adc09c918be 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -54,6 +54,7 @@ static inline void sync_boot_mode(void) void __hyp_set_vectors(unsigned long phys_vector_base); unsigned long __hyp_get_vectors(void); +void __hyp_reset_vectors(void); #else #define __boot_cpu_mode (SVC_MODE) #define sync_boot_mode() diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index e637854335aa43..675c50f5cb5cd5 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -280,7 +280,7 @@ ENDPROC(__hyp_reset_vectors) #endif .align 5 -__hyp_stub_vectors: +ENTRY(__hyp_stub_vectors) __hyp_stub_reset: W(b) . __hyp_stub_und: W(b) . __hyp_stub_svc: W(b) . diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index bf89c919efc1a2..86a7008e75a352 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -23,6 +23,7 @@ #include #include #include +#include /******************************************************************** * Hypervisor initialization @@ -39,6 +40,10 @@ * - Setup the page tables * - Enable the MMU * - Profit! (or eret, if you only care about the code). + * + * Another possibility is to get a HYP stub hypercall. + * We discriminate between the two by checking if r0 contains a value + * that is less than HVC_STUB_HCALL_NR. 
*/ .text @@ -58,6 +63,10 @@ __kvm_hyp_init: W(b) . __do_hyp_init: + @ Check for a stub hypercall + cmp r0, #HVC_STUB_HCALL_NR + blo __kvm_handle_stub_hvc + @ Set stack pointer mov sp, r0 @@ -112,19 +121,31 @@ __do_hyp_init: eret - @ r0 : stub vectors address +ENTRY(__kvm_handle_stub_hvc) + cmp r0, #HVC_RESET_VECTORS + bne 1f ENTRY(__kvm_hyp_reset) /* We're now in idmap, disable MMU */ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR - ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) - bic r1, r1, r2 + ldr r0, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) + bic r1, r1, r0 mcr p15, 4, r1, c1, c0, 0 @ HSCTLR - /* Install stub vectors */ - mcr p15, 4, r0, c12, c0, 0 @ HVBAR - isb + /* + * Install stub vectors, using ardb's VA->PA trick. + */ +0: adr r0, 0b @ PA(0) + movw r1, #:lower16:__hyp_stub_vectors - 0b @ VA(stub) - VA(0) + movt r1, #:upper16:__hyp_stub_vectors - 0b + add r1, r1, r0 @ PA(stub) + mcr p15, 4, r1, c12, c0, 0 @ HVBAR + b exit + +1: ldr r0, =HVC_STUB_ERR +exit: eret +ENDPROC(__kvm_handle_stub_hvc) ENDPROC(__kvm_hyp_reset) .ltorg From 5d224aa7d40f6dd79037fd0647eb9eb5e7aaa915 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:54 +0100 Subject: [PATCH 132/227] ARM: KVM: Implement HVC_GET_VECTORS in the init code Now that we have an infrastructure to handle hypercalls in the KVM init code, let's implement HVC_GET_VECTORS there. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/init.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 86a7008e75a352..d6b2f496fa1361 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -122,7 +122,12 @@ __do_hyp_init: eret ENTRY(__kvm_handle_stub_hvc) - cmp r0, #HVC_RESET_VECTORS + cmp r0, #HVC_GET_VECTORS + bne 1f + mrc p15, 4, r0, c12, c0, 0 @ get HVBAR + b exit + +1: cmp r0, #HVC_RESET_VECTORS bne 1f ENTRY(__kvm_hyp_reset) /* We're now in idmap, disable MMU */ From 6bebcecb6c5bf65cde18739a4fe57af25c179e44 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:55 +0100 Subject: [PATCH 133/227] ARM: KVM: Allow the main HYP code to use the init hyp stub implementation We now have a full hyp-stub implementation in the KVM init code, but the main KVM code only supports HVC_GET_VECTORS, which is not enough. Instead of reinventing the wheel, let's reuse the init implementation by branching to the idmap page when called with a hyp-stub hypercall. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/hyp/hyp-entry.S | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index 1f8db7d21fc55b..a35baa81fd2323 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -126,11 +126,30 @@ hyp_hvc: */ pop {r0, r1, r2} - /* Check for __hyp_get_vectors */ - cmp r0, #HVC_GET_VECTORS - mrceq p15, 4, r0, c12, c0, 0 @ get HVBAR - beq 1f + /* + * Check if we have a kernel function, which is guaranteed to be + * bigger than the maximum hyp stub hypercall + */ + cmp r0, #HVC_STUB_HCALL_NR + bhs 1f + /* + * Not a kernel function, treat it as a stub hypercall. + * Compute the physical address for __kvm_handle_stub_hvc + * (as the code lives in the idmaped page) and branch there. + * We hijack ip (r12) as a tmp register. 
+ */ + push {r1} + ldr r1, =kimage_voffset + ldr r1, [r1] + ldr ip, =__kvm_handle_stub_hvc + sub ip, ip, r1 +THUMB( add ip, ip, #1) + pop {r1} + + bx ip + +1: push {lr} mov lr, r0 @@ -142,7 +161,7 @@ THUMB( orr lr, #1) blx lr @ Call the HYP function pop {lr} -1: eret + eret guest_trap: load_vcpu r0 @ Load VCPU pointer to r0 From a92ce8f6ab6cb16006f8d73cb0f522079276fd91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:56 +0100 Subject: [PATCH 134/227] ARM: KVM: Convert __cpu_reset_hyp_mode to using __hyp_reset_vectors We are now able to use the hyp stub to reset HYP mode. Time to kiss __kvm_hyp_reset goodbye, and use __hyp_reset_vectors. Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_asm.h | 2 -- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm/kvm/init.S | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index dd16044b34b613..eae11b3e98fc70 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -72,8 +72,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); -extern void __kvm_hyp_reset(unsigned long); - extern u64 __vgic_v3_get_ich_vtr_el2(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 31ee468ce667de..adea3075318584 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, phys_addr_t phys_idmap_start) { - kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr); + __hyp_reset_vectors(); } static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index d6b2f496fa1361..fb336090ee9c63 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -129,7 +129,6 @@ ENTRY(__kvm_handle_stub_hvc) 1: cmp r0, #HVC_RESET_VECTORS bne 1f -ENTRY(__kvm_hyp_reset) /* We're now in idmap, disable MMU */ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR ldr r0, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) @@ -151,7 +150,6 @@ ENTRY(__kvm_hyp_reset) exit: eret ENDPROC(__kvm_handle_stub_hvc) -ENDPROC(__kvm_hyp_reset) .ltorg From 4d5f9c14fb638cbe36795cb4b8a0b05ceba0a526 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:57 +0100 Subject: [PATCH 135/227] ARM: KVM: Implement HVC_SOFT_RESTART in the init code Another missing stub hypercall is HVC_SOFT_RESTART. It turns out that it is pretty easy to implement in terms of HVC_RESET_VECTORS (since it needs to turn the MMU off). 
Tested-by: Keerthy Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/init.S | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index fb336090ee9c63..e53360dceb194d 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -127,8 +127,22 @@ ENTRY(__kvm_handle_stub_hvc) mrc p15, 4, r0, c12, c0, 0 @ get HVBAR b exit +1: cmp r0, #HVC_SOFT_RESTART + bne 1f + + /* The target is expected in r1 */ + msr ELR_hyp, r1 + mrs r0, cpsr + bic r0, r0, #MODE_MASK + orr r0, r0, #HYP_MODE +THUMB( orr r0, r0, #PSR_T_BIT ) + msr spsr_cxsf, r0 + b reset + 1: cmp r0, #HVC_RESET_VECTORS bne 1f + +reset: /* We're now in idmap, disable MMU */ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR ldr r0, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) From 386627d825d82ed3f7261a0de71a7cc4144e3c4b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:58 +0100 Subject: [PATCH 136/227] ARM: KVM: Gracefully handle hyp-stubs being restored from under our feet Should kvm_reboot() be invoked while guest is running, an IPI wil be issued, forcing the guest to exit and HYP being reset to the stubs. We will then try to reenter the guest, only to get an error (HVC_STUB_ERR). This patch allows this case to be gracefully handled by exiting the run loop. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_asm.h | 2 +- arch/arm/kvm/handle_exit.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index eae11b3e98fc70..14d68a4d826f63 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -33,7 +33,7 @@ #define ARM_EXCEPTION_IRQ 5 #define ARM_EXCEPTION_FIQ 6 #define ARM_EXCEPTION_HVC 7 - +#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR /* * The rr_lo_hi macro swaps a pair of registers depending on * current endianness. It is used in conjunction with ldrd and strd diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 96af65a30d78b1..5fd7968cdae9ef 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -160,6 +160,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_DATA_ABORT: kvm_inject_vabt(vcpu); return 1; + case ARM_EXCEPTION_HYP_GONE: + /* + * HYP has been reset to the hyp-stub. This happens + * when a guest is pre-empted by kvm_reboot()'s + * shutdown call. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); From 0fb265930dbcdb6833094198b8b1b71b1d3ed88d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:37:59 +0100 Subject: [PATCH 137/227] arm/arm64: KVM: Use __hyp_reset_vectors() directly __cpu_reset_hyp_mode doesn't need to be passed any argument now, as the hyp-stub implementations are self-contained, and is now reduced to just calling __hyp_reset_vectors(). Let's drop the wrapper and use the stub hypercall directly. 
Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 6 ------ arch/arm/kvm/arm.c | 3 +-- arch/arm64/include/asm/kvm_host.h | 6 ------ 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index adea3075318584..d488b8866bc943 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -270,12 +270,6 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, - phys_addr_t phys_idmap_start) -{ - __hyp_reset_vectors(); -} - static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 46fd3757869363..c8f4fa69104e82 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1130,8 +1130,7 @@ static void cpu_hyp_reinit(void) static void cpu_hyp_reset(void) { if (!is_kernel_in_hyp_mode()) - __cpu_reset_hyp_mode(hyp_default_vectors, - kvm_get_idmap_start()); + __hyp_reset_vectors(); } static void _kvm_arch_hardware_enable(void *discard) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0355dd1099562c..578df18f66b79d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -362,12 +362,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); } -static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, - phys_addr_t phys_idmap_start) -{ - __hyp_reset_vectors(); -} - static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} From 9e9ebd01a353b1f8986ced524e30cf5eee6aa08e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:00 +0100 Subject: [PATCH 138/227] arm/arm64: KVM: Remove kvm_get_idmap_start With __cpu_reset_hyp_mode having become fairly dumb, there is no need for kvm_get_idmap_start anymore. 
Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 1 - arch/arm/kvm/mmu.c | 5 ----- arch/arm64/include/asm/kvm_mmu.h | 1 - 3 files changed, 7 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 95f38dcd611dff..fa6f2174276bdd 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -56,7 +56,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 69554bd32c2934..efb4335aa5c4a0 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1669,11 +1669,6 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } -phys_addr_t kvm_get_idmap_start(void) -{ - return hyp_idmap_start; -} - static int kvm_map_idmap_text(pgd_t *pgd) { int err; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index ed1246014901d0..91d93a5d8fd308 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -155,7 +155,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); From 47eb3cba48ec11122994057cdc1e5dd7ab9b2be2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:01 +0100 Subject: [PATCH 139/227] arm/arm64: KVM: Use HVC_RESET_VECTORS to reinit HYP mode Instead of trying to compare the value given by __hyp_get_vectors(), which doesn't offer any real guarantee to be the stub's address, use HVC_RESET_VECTORS to make sure we're in a sane state to reinstall KVM across PM events. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c8f4fa69104e82..c378502623f6a9 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -53,7 +53,6 @@ __asm__(".arch_extension virt"); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); static kvm_cpu_context_t __percpu *kvm_host_cpu_state; -static unsigned long hyp_default_vectors; /* Per-CPU variable containing the currently running vcpu. */ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); @@ -1113,8 +1112,16 @@ static void cpu_init_hyp_mode(void *dummy) kvm_arm_init_debug(); } +static void cpu_hyp_reset(void) +{ + if (!is_kernel_in_hyp_mode()) + __hyp_reset_vectors(); +} + static void cpu_hyp_reinit(void) { + cpu_hyp_reset(); + if (is_kernel_in_hyp_mode()) { /* * __cpu_init_stage2() is safe to call even if the PM @@ -1122,17 +1129,10 @@ static void cpu_hyp_reinit(void) */ __cpu_init_stage2(); } else { - if (__hyp_get_vectors() == hyp_default_vectors) - cpu_init_hyp_mode(NULL); + cpu_init_hyp_mode(NULL); } } -static void cpu_hyp_reset(void) -{ - if (!is_kernel_in_hyp_mode()) - __hyp_reset_vectors(); -} - static void _kvm_arch_hardware_enable(void *discard) { if (!__this_cpu_read(kvm_arm_hardware_enabled)) { @@ -1315,12 +1315,6 @@ static int init_hyp_mode(void) if (err) goto out_err; - /* - * It is probably enough to obtain the default on one - * CPU. It's unlikely to be different on the others. 
- */ - hyp_default_vectors = __hyp_get_vectors(); - /* * Allocate stack pages for Hypervisor-mode */ From 4897e36c8d37fe508a423d6d15a635aba2557b4e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:02 +0100 Subject: [PATCH 140/227] ARM: decompressor: Remove __hyp_get_vectors usage When the compressed image needs to be relocated to avoid being overwritten by the decompression process, we need to relocate the hyp vectors as well so that we can find them once the decompression has taken effect. For that, we perform the following calculation: u32 v = __hyp_get_vectors(); v += offset; __hyp_set_vectors(v); But we're guaranteed that the initial value of v as returned by __hyp_get_vectors is always __hyp_stub_vectors, because we have just set it by calling __hyp_stub_install. So let's remove the use of __hyp_get_vectors, and directly use __hyp_stub_vectors instead. Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/boot/compressed/head.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 9150f9732785aa..7c711ba614173d 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -422,7 +422,17 @@ dtb_check_done: cmp r0, #HYP_MODE bne 1f - bl __hyp_get_vectors + /* + * Compute the address of the hyp vectors after relocation. + * This requires some arithmetic since we cannot directly + * reference __hyp_stub_vectors in a PC-relative way. + * Call __hyp_set_vectors with the new address so that we + * can HVC again after the copy. + */ +0: adr r0, 0b + movw r1, #:lower16:__hyp_stub_vectors - 0b + movt r1, #:upper16:__hyp_stub_vectors - 0b + add r0, r0, r1 sub r0, r0, r5 add r0, r0, r10 bl __hyp_set_vectors From ecb5d61daaa2fc09235191926ea808e00e2e1c0c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:03 +0100 Subject: [PATCH 141/227] ARM: hyp-stub/KVM: Kill __hyp_get_vectors Nobody is using __hyp_get_vectors anymore, so let's remove both implementations (hyp-stub and KVM). 
Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 10 ++++------ arch/arm/kernel/hyp-stub.S | 13 +------------ arch/arm/kvm/init.S | 7 +------ arch/arm/kvm/interrupts.S | 4 ---- 4 files changed, 6 insertions(+), 28 deletions(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index 663adc09c918be..141144f333a2e3 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -53,7 +53,6 @@ static inline void sync_boot_mode(void) } void __hyp_set_vectors(unsigned long phys_vector_base); -unsigned long __hyp_get_vectors(void); void __hyp_reset_vectors(void); #else #define __boot_cpu_mode (SVC_MODE) @@ -99,12 +98,11 @@ extern char __hyp_text_end[]; /* Only assembly code should need those */ -#define HVC_GET_VECTORS 0 -#define HVC_SET_VECTORS 1 -#define HVC_SOFT_RESTART 2 -#define HVC_RESET_VECTORS 3 +#define HVC_SET_VECTORS 0 +#define HVC_SOFT_RESTART 1 +#define HVC_RESET_VECTORS 2 -#define HVC_STUB_HCALL_NR 4 +#define HVC_STUB_HCALL_NR 3 #endif /* __ASSEMBLY__ */ diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 675c50f5cb5cd5..918c64f171282c 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -202,12 +202,7 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE ENDPROC(__hyp_stub_install_secondary) __hyp_stub_do_trap: - teq r0, #HVC_GET_VECTORS - bne 1f - mrc p15, 4, r0, c12, c0, 0 @ get HVBAR - b __hyp_stub_exit - -1: teq r0, #HVC_SET_VECTORS + teq r0, #HVC_SET_VECTORS bne 1f mcr p15, 4, r1, c12, c0, 0 @ set HVBAR b __hyp_stub_exit @@ -247,12 +242,6 @@ ENDPROC(__hyp_stub_do_trap) * so you will need to set that to something sensible at the new hypervisor's * initialisation entry point. */ -ENTRY(__hyp_get_vectors) - mov r0, #HVC_GET_VECTORS - __HVC(0) - ret lr -ENDPROC(__hyp_get_vectors) - ENTRY(__hyp_set_vectors) mov r1, r0 mov r0, #HVC_SET_VECTORS diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index e53360dceb194d..87bcd7ae4552b4 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -122,12 +122,7 @@ __do_hyp_init: eret ENTRY(__kvm_handle_stub_hvc) - cmp r0, #HVC_GET_VECTORS - bne 1f - mrc p15, 4, r0, c12, c0, 0 @ get HVBAR - b exit - -1: cmp r0, #HVC_SOFT_RESTART + cmp r0, #HVC_SOFT_RESTART bne 1f /* The target is expected in r1 */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index b1bd316f14c0ad..80a1d6cd261cec 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -37,10 +37,6 @@ * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are * passed in r0 (strictly 32bit). * - * A function pointer with a value of 0xffffffff has a special meaning, - * and is used to implement __hyp_get_vectors in the same way as in - * arch/arm/kernel/hyp_stub.S. - * * The calling convention follows the standard AAPCS: * r0 - r3: caller save * r12: caller save From 0b51c547fdb7ee05b6d65471785a1d899c8a667c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:04 +0100 Subject: [PATCH 142/227] arm64: hyp-stub/KVM: Kill __hyp_get_vectors Nobody is using __hyp_get_vectors anymore, so let's remove both implementations (hyp-stub and KVM). 
Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/virt.h | 12 ++++-------- arch/arm64/kernel/hyp-stub.S | 13 +------------ arch/arm64/kvm/hyp-init.S | 7 +------ arch/arm64/kvm/hyp.S | 2 +- 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 435514c52b0fea..c5f89442785c79 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -29,28 +29,25 @@ * indirection of a function call (as implemented in hyp-stub.S). */ -/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */ -#define HVC_GET_VECTORS 0 - /* * HVC_SET_VECTORS - Set the value of the vbar_el2 register. * * @x1: Physical address of the new vector table. */ -#define HVC_SET_VECTORS 1 +#define HVC_SET_VECTORS 0 /* * HVC_SOFT_RESTART - CPU soft reset, used by the cpu_soft_restart routine. */ -#define HVC_SOFT_RESTART 2 +#define HVC_SOFT_RESTART 1 /* * HVC_RESET_VECTORS - Restore the vectors to the original HYP stubs */ -#define HVC_RESET_VECTORS 3 +#define HVC_RESET_VECTORS 2 /* Max number of HYP stub hypercalls */ -#define HVC_STUB_HCALL_NR 4 +#define HVC_STUB_HCALL_NR 3 /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 @@ -77,7 +74,6 @@ extern u32 __boot_cpu_mode[2]; void __hyp_set_vectors(phys_addr_t phys_vector_base); -phys_addr_t __hyp_get_vectors(void); void __hyp_reset_vectors(void); /* Reports the availability of HYP mode */ diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 8226fd90f73c63..d55604d91e333e 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -55,12 +55,7 @@ ENDPROC(__hyp_stub_vectors) .align 11 el1_sync: - cmp x0, #HVC_GET_VECTORS - b.ne 1f - mrs x0, vbar_el2 - b 9f - -1: cmp x0, #HVC_SET_VECTORS + cmp x0, #HVC_SET_VECTORS b.ne 2f msr vbar_el2, x1 b 9f @@ -118,12 +113,6 @@ ENDPROC(\label) * initialisation entry point. */ -ENTRY(__hyp_get_vectors) - mov x0, #HVC_GET_VECTORS - hvc #0 - ret -ENDPROC(__hyp_get_vectors) - ENTRY(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 0ad34fd6d2fc86..3734e6315cd7ae 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -124,12 +124,7 @@ __do_hyp_init: ENDPROC(__kvm_hyp_init) ENTRY(__kvm_handle_stub_hvc) - cmp x0, #HVC_GET_VECTORS - b.ne 1f - mrs x0, vbar_el2 - b exit - -1: cmp x0, #HVC_SOFT_RESTART + cmp x0, #HVC_SOFT_RESTART b.ne 1f /* This is where we're about to jump, staying at EL2 */ diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index f6f20b58d65a58..952f6cb9cf7205 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -36,7 +36,7 @@ * passed in x0. * * A function pointer with a value less than 0xfff has a special meaning, - * and is used to implement __hyp_get_vectors in the same way as in + * and is used to implement hyp stubs in the same way as in * arch/arm64/kernel/hyp_stub.S. */ ENTRY(__kvm_call_hyp) From af42f20480bf13d9dbf099ccca65f3de8c44fcaf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:05 +0100 Subject: [PATCH 143/227] arm64: hyp-stub: Zero x0 on successful stub handling We now return HVC_STUB_ERR when a stub hypercall fails, but we leave whatever was in x0 on success. Zeroing it on return seems like a good idea. 
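With x0 cleared on the success path, a caller that inspects the result no longer needs to know which stub hypercall was issued; a (hypothetical) check reduces to:

    static inline bool hyp_stub_call_failed(unsigned long ret)
    {
            return ret == HVC_STUB_ERR;     /* anything else is now 0 */
    }
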
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kernel/hyp-stub.S | 4 +++- arch/arm64/kvm/hyp-init.S | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index d55604d91e333e..e1261fbaa374e0 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -73,8 +73,10 @@ el1_sync: /* Someone called kvm_call_hyp() against the hyp-stub... */ ldr x0, =HVC_STUB_ERR + eret -9: eret +9: mov x0, xzr + eret ENDPROC(el1_sync) .macro invalid_vector label diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 3734e6315cd7ae..839425c24b1c70 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -154,13 +154,13 @@ reset: /* Install stub vectors */ adr_l x5, __hyp_stub_vectors msr vbar_el2, x5 - b exit + mov x0, xzr + eret 1: /* Bad stub call */ ldr x0, =HVC_STUB_ERR - -exit: eret + ENDPROC(__kvm_handle_stub_hvc) .ltorg From d9118c87d21ee799058a55eeae84d392ae887c91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:06 +0100 Subject: [PATCH 144/227] ARM: hyp-stub: Zero r0 on successful stub handling We now return HVC_STUB_ERR when a stub hypercall fails, but we leave whatever was in r0 on success. Zeroing it on return seems like a good idea. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kernel/hyp-stub.S | 2 ++ arch/arm/kvm/init.S | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 918c64f171282c..d8523cc47a5463 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -215,8 +215,10 @@ __hyp_stub_do_trap: beq __hyp_stub_exit ldr r0, =HVC_STUB_ERR + __ERET __hyp_stub_exit: + mov r0, #0 __ERET ENDPROC(__hyp_stub_do_trap) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 87bcd7ae4552b4..570ed4a9c2618b 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -155,8 +155,10 @@ reset: b exit 1: ldr r0, =HVC_STUB_ERR + eret exit: + mov r0, #0 eret ENDPROC(__kvm_handle_stub_hvc) From d824ca52abd020a36948d12f2c6704ea2ae12513 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 3 Apr 2017 19:38:07 +0100 Subject: [PATCH 145/227] arm/arm64: Add hyp-stub API documentation In order to help people understanding the hyp-stub API that exists between the host kernel and the hypervisor mode (whether a hypervisor has been installed or not), let's document said API. As with any form of documentation, I expect it to become obsolete and completely misleading within 20 minutes after having being merged. Acked-by: Russell King Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/arm/hyp-abi.txt | 53 +++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 Documentation/virtual/kvm/arm/hyp-abi.txt diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virtual/kvm/arm/hyp-abi.txt new file mode 100644 index 00000000000000..a20a0bee268d3c --- /dev/null +++ b/Documentation/virtual/kvm/arm/hyp-abi.txt @@ -0,0 +1,53 @@ +* Internal ABI between the kernel and HYP + +This file documents the interaction between the Linux kernel and the +hypervisor layer when running Linux as a hypervisor (for example +KVM). It doesn't cover the interaction of the kernel with the +hypervisor when running as a guest (under Xen, KVM or any other +hypervisor), or any hypervisor-specific interaction when the kernel is +used as a host. 
+ +On arm and arm64 (without VHE), the kernel doesn't run in hypervisor +mode, but still needs to interact with it, allowing a built-in +hypervisor to be either installed or torn down. + +In order to achieve this, the kernel must be booted at HYP (arm) or +EL2 (arm64), allowing it to install a set of stubs before dropping to +SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, +and only act on individual CPUs. + +Unless specified otherwise, any built-in hypervisor must implement +these functions (see arch/arm{,64}/include/asm/virt.h): + +* r0/x0 = HVC_SET_VECTORS + r1/x1 = vectors + + Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' + must be a physical address, and respect the alignment requirements + of the architecture. Only implemented by the initial stubs, not by + Linux hypervisors. + +* r0/x0 = HVC_RESET_VECTORS + + Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials + stubs' exception vector value. This effectively disables an existing + hypervisor. + +* r0/x0 = HVC_SOFT_RESTART + r1/x1 = restart address + x2 = x0's value when entering the next payload (arm64) + x3 = x1's value when entering the next payload (arm64) + x4 = x2's value when entering the next payload (arm64) + + Mask all exceptions, disable the MMU, move the arguments into place + (arm64 only), and jump to the restart address while at HYP/EL2. This + hypercall is not expected to return to its caller. + +Any other value of r0/x0 triggers a hypervisor-specific handling, +which is not documented here. + +The return value of a stub hypercall is held by r0/x0, and is 0 on +success, and HVC_STUB_ERR on error. A stub hypercall is allowed to +clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and +ip on arm). It is thus recommended to use a function call to perform +the hypercall. From b22e7df2d85fcbe8b36bab909b98c3d0239e69e6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 27 Sep 2016 21:08:04 +0200 Subject: [PATCH 146/227] KVM: arm/arm64: Cleanup the arch timer code's irqchip checking Currently we check if we have an in-kernel irqchip and if the vgic was properly implemented several places in the arch timer code. But, we already predicate our enablement of the arm timers on having a valid and initialized gic, so we can simply check if the timers are enabled or not. This also gets rid of the ugly "error that's not an error but used to signal that the timer shouldn't poke the gic" construct we have. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 35d7100e0815c4..363f0d2cfc7932 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -189,8 +189,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, { int ret; - BUG_ON(!vgic_initialized(vcpu->kvm)); - timer_ctx->active_cleared_last = false; timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, @@ -205,7 +203,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, * Check if there was a change in the timer state (should we raise or lower * the line level to the GIC). 
*/ -static int kvm_timer_update_state(struct kvm_vcpu *vcpu) +static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); @@ -217,16 +215,14 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ - if (!vgic_initialized(vcpu->kvm) || !timer->enabled) - return -ENODEV; + if (!timer->enabled) + return; if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); - - return 0; } /* Schedule the background timer for the emulated timer. */ @@ -295,13 +291,16 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) */ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) { + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; int ret; - if (kvm_timer_update_state(vcpu)) + if (unlikely(!timer->enabled)) return; + kvm_timer_update_state(vcpu); + /* Set the background timer for the physical timer emulation. */ kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); From 3fe17e6826162021d5e9274949571b19fc94826b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 27 Sep 2016 21:08:05 +0200 Subject: [PATCH 147/227] KVM: arm/arm64: Add ARM user space interrupt signaling ABI We have 2 modes for dealing with interrupts in the ARM world. We can either handle them all using hardware acceleration through the vgic or we can emulate a gic in user space and only drive CPU IRQ pins from there. Unfortunately, when driving IRQs from user space, we never tell user space about events from devices emulated inside the kernel, which may result in interrupt line state changes, so we lose out on for example timer and PMU events if we run with user space gic emulation. Define an ABI to publish such device output levels to userspace. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Alexander Graf Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 42 +++++++++++++++++++++++++++++++ arch/arm/include/uapi/asm/kvm.h | 2 ++ arch/arm64/include/uapi/asm/kvm.h | 2 ++ include/uapi/linux/kvm.h | 8 ++++++ 4 files changed, 54 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3c248f772ae616..3b4e76e5201e31 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4147,3 +4147,45 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. + + +8.5 KVM_CAP_ARM_USER_IRQ + +Architectures: arm, arm64 +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. 
+ +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd12d..a5838d605e7b0d 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -114,6 +114,8 @@ struct kvm_debug_exit_arch { }; struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e0c..cd6bea495e637d 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -143,6 +143,8 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW (1 << 17) struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377ec..6d6b9b237f0b91 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_ARM_USER_IRQ 137 #ifdef KVM_CAP_IRQ_ROUTING @@ -1354,4 +1355,11 @@ struct kvm_assigned_msix_entry { #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +/* Available with KVM_CAP_ARM_USER_IRQ */ + +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + #endif /* __LINUX_KVM_H */ From d9e1397783765a275c3a7930250dcdb7e9480d7d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 27 Sep 2016 21:08:06 +0200 Subject: [PATCH 148/227] KVM: arm/arm64: Support arch timers with a userspace gic If you're running with a userspace gic or other interrupt controller (that is no vgic in the kernel), then you have so far not been able to use the architected timers, because the output of the architected timers, which are driven inside the kernel, was a kernel-only construct 
between the arch timer code and the vgic. This patch implements the new KVM_CAP_ARM_USER_IRQ feature, where we use a side channel on the kvm_run structure, run->s.regs.device_irq_level, to always notify userspace of the timer output levels when using a userspace irqchip. This works by ensuring that before we enter the guest, if the timer output level has changed compared to what we last told userspace, we don't enter the guest, but instead return to userspace to notify it of the new level. If we are exiting, because of an MMIO for example, and the level changed at the same time, the value is also updated and userspace can sample the line as it needs. This is nicely achieved simply always updating the timer_irq_level field after the main run loop. Note that the kvm_timer_update_irq trace event is changed to show the host IRQ number for the timer instead of the guest IRQ number, because the kernel no longer know which IRQ userspace wires up the timer signal to. Also note that this patch implements all required functionality but does not yet advertise the capability. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Alexander Graf Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 18 +++--- include/kvm/arm_arch_timer.h | 2 + virt/kvm/arm/arch_timer.c | 122 ++++++++++++++++++++++++++++------- 3 files changed, 110 insertions(+), 32 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c378502623f6a9..ac6e57bcbe4d2c 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -515,13 +515,7 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return ret; } - /* - * Enable the arch timers only if we have an in-kernel VGIC - * and it has been properly initialized, since we cannot handle - * interrupts from the virtual timer with a userspace gic. - */ - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) - ret = kvm_timer_enable(vcpu); + ret = kvm_timer_enable(vcpu); return ret; } @@ -640,9 +634,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_disable(); /* - * Re-check atomic conditions + * If we have a singal pending, or need to notify a userspace + * irqchip about timer level changes, then we exit (and update + * the timer level state in kvm_timer_update_run below). 
*/ - if (signal_pending(current)) { + if (signal_pending(current) || + kvm_timer_should_notify_user(vcpu)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } @@ -714,6 +711,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = handle_exit(vcpu, run, ret); } + /* Tell userspace about in-kernel device output levels */ + kvm_timer_update_run(vcpu); + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); return ret; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fe797d6ef89d90..295584f31a4ef3 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -63,6 +63,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 363f0d2cfc7932..5dc216748d546f 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -184,6 +184,27 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (vtimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (ptimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { @@ -194,9 +215,12 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, - timer_ctx->irq.level); - WARN_ON(ret); + if (likely(irqchip_in_kernel(vcpu->kvm))) { + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + timer_ctx->irq.irq, + timer_ctx->irq.level); + WARN_ON(ret); + } } /* @@ -215,7 +239,7 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ - if (!timer->enabled) + if (unlikely(!timer->enabled)) return; if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) @@ -282,28 +306,12 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) timer_disarm(timer); } -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the host, - * and inject an interrupt if that was the case. 
- */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; int ret; - if (unlikely(!timer->enabled)) - return; - - kvm_timer_update_state(vcpu); - - /* Set the background timer for the physical timer emulation. */ - kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); - /* * If we enter the guest with the virtual input level to the VGIC * asserted, then we have already told the VGIC what we need to, and @@ -355,11 +363,72 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) vtimer->active_cleared_last = !phys_active; } +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return vtimer->irq.level != vlevel || + ptimer->irq.level != plevel; +} + +static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * To prevent continuously exiting from the guest, we mask the + * physical interrupt such that the guest can make forward progress. + * Once we detect the output level being deasserted, we unmask the + * interrupt again so that we exit from the guest when the timer + * fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, 0); +} + +/** + * kvm_timer_flush_hwstate - prepare timers before running the vcpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case, making sure the timer is + * masked or disabled on the host so that we keep executing. Also schedule a + * software timer for the physical timer if it is enabled. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + if (unlikely(!timer->enabled)) + return; + + kvm_timer_update_state(vcpu); + + /* Set the background timer for the physical timer emulation. */ + kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_timer_flush_hwstate_user(vcpu); + else + kvm_timer_flush_hwstate_vgic(vcpu); +} + /** * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer has expired while we were running in the guest, + * Check if any of the timers have expired while we were running in the guest, * and inject an interrupt if that was the case. 
*/ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) @@ -559,6 +628,13 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (timer->enabled) return 0; + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + /* * Find the physical IRQ number corresponding to the host_vtimer_irq */ @@ -582,8 +658,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (ret) return ret; +no_vgic: timer->enabled = 1; - return 0; } From 3dbbdf78636e66094d82c4df496c54ff6ae46e31 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 1 Feb 2017 12:51:52 +0100 Subject: [PATCH 149/227] KVM: arm/arm64: Report PMU overflow interrupts to userspace irqchip When not using an in-kernel VGIC, but instead emulating an interrupt controller in userspace, we should report the PMU overflow status to that userspace interrupt controller using the KVM_CAP_ARM_USER_IRQ feature. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 13 +++++++++---- include/kvm/arm_pmu.h | 7 +++++++ virt/kvm/arm/arch_timer.c | 3 --- virt/kvm/arm/pmu.c | 39 +++++++++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index ac6e57bcbe4d2c..9eda2932f686c2 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -635,11 +635,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* * If we have a singal pending, or need to notify a userspace - * irqchip about timer level changes, then we exit (and update - * the timer level state in kvm_timer_update_run below). + * irqchip about timer or PMU level changes, then we exit (and + * update the timer level state in kvm_timer_update_run + * below). 
*/ if (signal_pending(current) || - kvm_timer_should_notify_user(vcpu)) { + kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } @@ -712,7 +714,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) } /* Tell userspace about in-kernel device output levels */ - kvm_timer_update_run(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 92e7e97ca8ff0f..1ab4633adf4fd1 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -50,6 +50,8 @@ void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, @@ -85,6 +87,11 @@ static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} +static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 5dc216748d546f..5976609ef27cb9 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -193,9 +193,6 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); struct kvm_sync_regs *regs = &vcpu->run->s.regs; - if (likely(irqchip_in_kernel(vcpu->kvm))) - return; - /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 69ccce308458a4..4b43e7f3b15801 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -230,13 +230,44 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) return; overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level != overflow) { - pmu->irq_level = overflow; - kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret; + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + pmu->irq_num, overflow); + WARN_ON(ret); } } +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void 
kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + /** * kvm_pmu_flush_hwstate - flush pmu state to cpu * @vcpu: The vcpu pointer From f7214e6023c86cbb66d3ebead0b27ac1a4ebb8dc Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 1 Feb 2017 12:54:11 +0100 Subject: [PATCH 150/227] KVM: arm/arm64: Advertise support for KVM_CAP_ARM_USER_IRQ Now that we support both timers and PMU reporting interrupts to userspace, we can advertise this support. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9eda2932f686c2..7941699a766d43 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -229,6 +229,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = kvm->arch.vgic.msis_require_devid; break; + case KVM_CAP_ARM_USER_IRQ: + /* + * 1: EL1_VTIMER, EL1_PTIMER, and PMU. + * (bump this number if adding more devices) + */ + r = 1; + break; default: r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; From fe722d13e3638470a7eef5dad6c9ded7f6653773 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 7 Apr 2017 14:23:13 +0200 Subject: [PATCH 151/227] KVM: s390: Fix sdnxo setting for nested guests If the guest does not use the host register management, but it uses the sdnx area, we must fill in a proper sdnxo value (address of sdnx and the sdnxc). Reported-by: David Hildenbrand Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- arch/s390/kvm/vsie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 2fafc2be777fe4..025b1f2e17a9d4 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -627,7 +627,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x10b0U); if (rc) goto unpin; - scb_s->sdnxo = hpa; + scb_s->sdnxo = hpa | sdnxc; } return 0; unpin: From 80248559992a05573b2db7865490eb82c745db87 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 12 Apr 2017 12:59:59 +0200 Subject: [PATCH 152/227] KVM: s390: fix stale machine check data for guarded storage When delivering a machine check the CPU state is "loaded", which means that some registers are already in the host registers. Before writing the register content into the machine check save area, we must make sure that we save the content of the registers into the data structures that are used for delivering the machine check. We already do the right thing for access, vector/floating point registers, let's do the same for guarded storage. 
Fixes: 4e0b1ab72b8a ("KVM: s390: gs support for kvm guests") Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck --- arch/s390/kvm/interrupt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 482673e3436d65..caf15c8a8948e7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,6 +419,8 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, /* take care of lazy register loading */ save_fpu_regs(); save_access_regs(vcpu->run->s.regs.acrs); + if (MACHINE_HAS_GS && vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); /* Extended save area */ rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr, From 1df6ddede10a1cacbf1da4cdfca3342510db1882 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:18 +0200 Subject: [PATCH 153/227] KVM: x86: race between KVM_SET_GSI_ROUTING and KVM_CREATE_IRQCHIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid races between KVM_SET_GSI_ROUTING and KVM_CREATE_IRQCHIP by taking the kvm->lock when setting up routes. If KVM_CREATE_IRQCHIP fails, KVM_SET_GSI_ROUTING could have already set up routes pointing at pic/ioapic, being silently removed already. Also, as a side effect, this patch makes sure that KVM_SET_GSI_ROUTING and KVM_CAP_SPLIT_IRQCHIP cannot run in parallel. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f489167839c48b..357e67cba32eed 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3069,8 +3069,11 @@ static long kvm_vm_ioctl(struct file *filp, routing.nr * sizeof(*entries))) goto out_free_irq_routing; } + /* avoid races with KVM_CREATE_IRQCHIP on x86 */ + mutex_lock(&kvm->lock); r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); + mutex_unlock(&kvm->lock); out_free_irq_routing: vfree(entries); break; From 637e3f86faf97a930f1ac4c3f79fd667addae9e8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:19 +0200 Subject: [PATCH 154/227] KVM: x86: new irqchip mode KVM_IRQCHIP_INIT_IN_PROGRESS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's add a new mode and set it while we create the irqchip via KVM_CREATE_IRQCHIP and KVM_CAP_SPLIT_IRQCHIP. This mode will be used later to test if adding routes (in kvm_set_routing_entry()) is already allowed. 
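A rough sketch of the intended lifecycle (not the literal hunks below; setup_routing() stands in for kvm_setup_empty_irq_routing()/kvm_setup_default_irq_routing()), writer side running under kvm->lock:

	kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS;
	r = setup_routing(kvm);              /* routes may be added from here on */
	if (r) {
		kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE;
		/* pairs with smp_rmb() in the irqchip_mode readers */
		smp_wmb();
		/* tear down any partially created pic/ioapic */
	} else {
		/* publish pic/ioapic/routing before making them visible */
		smp_wmb();
		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;   /* or _SPLIT */
	}

Lock-free readers such as irqchip_in_kernel() then only need to load the mode and issue the pairing smp_rmb().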
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.h | 18 +++++++++++++----- arch/x86/kvm/x86.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d962fa998a6fc5..2cc5ec7cc6f55d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -726,6 +726,7 @@ struct kvm_hv { enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, + KVM_IRQCHIP_INIT_IN_PROGRESS, /* temporarily set during creation */ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 40d5b2cf60611e..59e05febc8fb8e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -93,21 +93,29 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_kernel(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_in_kernel(struct kvm *kvm) { - bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; + int mode = kvm->arch.irqchip_mode; - /* Matches with wmb after initializing kvm->irq_routing. */ + /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return ret; + return mode > KVM_IRQCHIP_INIT_IN_PROGRESS; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6bc47e2712c87d..c7956ce6a24264 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3928,9 +3928,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; + kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_empty_irq_routing(kvm); - if (r) + if (r) { + kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; + /* Pairs with smp_rmb() when reading irqchip_mode */ + smp_wmb(); goto split_irqchip_unlock; + } /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -4018,8 +4023,12 @@ long kvm_arch_vm_ioctl(struct file *filp, goto create_irqchip_unlock; } + kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_default_irq_routing(kvm); if (r) { + kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; + /* Pairs with smp_rmb() when reading irqchip_mode */ + smp_wmb(); mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); From 8bf463f3ba0eefd2c808fc22c005fc95f1ad48fe Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:20 +0200 Subject: [PATCH 155/227] KVM: x86: check against irqchip_mode in kvm_set_routing_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's replace the checks for pic_in_kernel() and ioapic_in_kernel() by checks against irqchip_mode. Also make sure that creation of any route is only possible if we have an lapic in kernel (irqchip_in_kernel()) or if we are currently inititalizing the irqchip. This is necessary to switch pic_in_kernel() and ioapic_in_kernel() to irqchip_mode, too. 
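Seen from userspace, the effect is that installing routes before any irqchip exists now fails cleanly. A hypothetical snippet against the uapi (needs <linux/kvm.h>, <sys/ioctl.h>, <stdlib.h>, <errno.h>; vm_fd is the fd returned by KVM_CREATE_VM):

	static int add_pic_route(int vm_fd)
	{
		struct kvm_irq_routing *r;
		int ret;

		r = calloc(1, sizeof(*r) + sizeof(r->entries[0]));
		if (!r)
			return -ENOMEM;
		r->nr = 1;
		r->entries[0].gsi = 5;
		r->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
		r->entries[0].u.irqchip.irqchip = KVM_IRQCHIP_PIC_MASTER;
		r->entries[0].u.irqchip.pin = 5;

		/* without a prior KVM_CREATE_IRQCHIP this is now expected to
		 * fail with EINVAL instead of installing a route that points
		 * at a not-yet-existing pic */
		ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
		free(r);
		return ret;
	}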
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq_comm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6825cd36d13b7c..2e5eec8c39fc4c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -282,24 +282,26 @@ int kvm_set_routing_entry(struct kvm *kvm, int delta; unsigned max_pin; + /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ + if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) + goto out; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: + if (irqchip_split(kvm)) + goto out; delta = 0; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: delta = 8; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: - if (!pic_in_kernel(kvm)) - goto out; - e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - if (!ioapic_in_kernel(kvm)) - goto out; - max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; From 19d25a0e474b0c69272e17cb6a9106c25eb27c82 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:21 +0200 Subject: [PATCH 156/227] KVM: x86: check against irqchip_mode in pic_in_kernel() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's avoid checking against kvm->arch.vpic. We have kvm->arch.irqchip_mode for that now. KVM_IRQCHIP_KERNEL implies a fully inititalized pic, while kvm->arch.vpic might temporarily be set but invalidated again if e.g. kvm_ioapic_init() fails when setting KVM_CREATE_IRQCHIP. Although current users seem to be fine, this avoids future bugs. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 59e05febc8fb8e..26d8dd4a4323f6 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -85,10 +85,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) static inline int pic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (pic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_split(struct kvm *kvm) From f567080bdd967fa09326442ac18826c0618fa6c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:22 +0200 Subject: [PATCH 157/227] KVM: x86: check against irqchip_mode in ioapic_in_kernel() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_IRQCHIP_KERNEL implies a fully inititalized ioapic, while kvm->arch.vioapic might temporarily be set but invalidated again if e.g. setting of default routing fails when setting KVM_CREATE_IRQCHIP. 
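The window this closes can be seen in the KVM_CREATE_IRQCHIP error path touched earlier in this series, roughly:

	kvm_ioapic_init()                        -> kvm->arch.vioapic set
	kvm_setup_default_irq_routing() fails
	kvm_ioapic_destroy()                     -> kvm->arch.vioapic cleared again

A reader that only tests the pointer can observe the ioapic in that half-initialized state, while irqchip_mode only becomes KVM_IRQCHIP_KERNEL once the whole sequence has succeeded.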
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54436dbaa..d88e4b9aa1510e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -112,10 +112,11 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) static inline int ioapic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (ioapic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); From 90bca0529e1bc94fd36659217603e8a1c3f9ad83 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:23 +0200 Subject: [PATCH 158/227] KVM: x86: get rid of pic_irqchip() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seemed like a nice idea to encapsulate access to kvm->arch.vpic. But as the usage is already mixed, internal locks are taken outside of i8259.c and grepping for "vpic" only is much easier, let's just get rid of pic_irqchip(). Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 4 ++-- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/irq.h | 5 ----- arch/x86/kvm/irq_comm.c | 4 ++-- arch/x86/kvm/x86.c | 24 +++++++++++------------- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 047b17a2626961..af97c5a1f14247 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -239,7 +239,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; s->output = 0; @@ -576,7 +576,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9d160c56..5c24811e8b0bca 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) if (irqchip_split(v->kvm)) return pending_userspace_extint(v); else - return pic_irqchip(v->kvm)->output; + return v->kvm->arch.vpic->output; } else return 0; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 26d8dd4a4323f6..8fec66d1b9a9ba 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -78,11 +78,6 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - static inline int pic_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 2e5eec8c39fc4c..d82fc1443a6381 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { - struct kvm_pic *pic = pic_irqchip(kvm); + struct kvm_pic *pic = kvm->arch.vpic; return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -236,7 +236,7 @@ void kvm_free_irq_source_id(struct kvm 
*kvm, int irq_source_id) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); unlock: mutex_unlock(&kvm->irq_lock); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7956ce6a24264..c50d6b62425faa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3715,18 +3715,17 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], + memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], + memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: @@ -3741,23 +3740,22 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -3766,7 +3764,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = -EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); + kvm_pic_update_irq(pic); return r; } From 0191e92d84eefe6522a4546b0dfb69393835b168 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:24 +0200 Subject: [PATCH 159/227] KVM: x86: get rid of ioapic_irqchip() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just use kvm->arch.vioapic directly. 
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 4 ++-- arch/x86/kvm/ioapic.h | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 289270a6aecbb4..a433317d57872b 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -646,7 +646,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + struct kvm_ioapic *ioapic = kvm->arch.vioapic; if (!ioapic) return -EINVAL; @@ -659,7 +659,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + struct kvm_ioapic *ioapic = kvm->arch.vioapic; if (!ioapic) return -EINVAL; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index d88e4b9aa1510e..5a1de923064270 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -105,11 +105,6 @@ do { \ #define ASSERT(x) do { } while (0) #endif -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - static inline int ioapic_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; From 0bceb15ad15e9374387688ced76c1ac5f5eb6e14 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:25 +0200 Subject: [PATCH 160/227] KVM: x86: use ioapic_in_kernel() to check for ioapic existence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index a433317d57872b..da66650e331329 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -268,9 +268,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) + if (!ioapic_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } From 4c0b06d8868b0ad96cca83e0d04687e7f3e5b7bd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:26 +0200 Subject: [PATCH 161/227] KVM: x86: remove duplicate checks for ioapic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When handling KVM_GET_IRQCHIP, we already check irqchip_kernel(), which implies a fully inititalized ioapic. 
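For reference, the caller in kvm_arch_vm_ioctl() looks roughly like this (abridged sketch, error handling trimmed):

	case KVM_GET_IRQCHIP: {
		...
		if (!irqchip_kernel(kvm))
			goto get_irqchip_out;	/* no fully set up pic/ioapic */
		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
		...
	}

so by the time kvm_get_ioapic()/kvm_set_ioapic() run, kvm->arch.vioapic is guaranteed to be non-NULL.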
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index da66650e331329..f8e988a14e8da6 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -645,8 +645,6 @@ void kvm_ioapic_destroy(struct kvm *kvm) int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - if (!ioapic) - return -EINVAL; spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); @@ -658,8 +656,6 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - if (!ioapic) - return -EINVAL; spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); From 33392b4911150d57cd66a4e9dac74f93b4aef2be Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:27 +0200 Subject: [PATCH 162/227] KVM: x86: convert kvm_(set|get)_ioapic() into void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 6 ++---- arch/x86/kvm/ioapic.h | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index f8e988a14e8da6..10941a6d0e6215 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -642,7 +642,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) kfree(ioapic); } -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; @@ -650,10 +650,9 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); - return 0; } -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; @@ -664,5 +663,4 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); - return 0; } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 5a1de923064270..29ce19732ccf8e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -128,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c50d6b62425faa..0691feeae39285 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3729,7 +3729,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - r 
= kvm_get_ioapic(kvm, &chip->chip.ioapic); + kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -3758,7 +3758,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; From ba7454e17f90a7dfecc3eb82ec19eb7ac5b6f0e0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:28 +0200 Subject: [PATCH 163/227] KVM: x86: don't take kvm->irq_lock when creating IRQCHIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I don't see any reason any more for this lock, seemed to be used to protect removal of kvm->arch.vpic / kvm->arch.vioapic when already partially inititalized, now access is properly protected using kvm->arch.irqchip_mode and this shouldn't be necessary anymore. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0691feeae39285..182fea214c4291 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4028,10 +4028,8 @@ long kvm_arch_vm_ioctl(struct file *filp, /* Pairs with smp_rmb() when reading irqchip_mode */ smp_wmb(); mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } From 49f520b99af13d8cafec6f227dabf6acf236c887 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:29 +0200 Subject: [PATCH 164/227] KVM: x86: push usage of slots_lock down MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just move it to the place where it is actually needed. 
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 2 ++ arch/x86/kvm/ioapic.c | 2 ++ arch/x86/kvm/x86.c | 4 ---- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index af97c5a1f14247..2da2b429edcdc7 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -660,9 +660,11 @@ void kvm_pic_destroy(struct kvm *kvm) if (!vpic) return; + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 10941a6d0e6215..6e1d8cbe2fd928 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -637,7 +637,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) return; cancel_delayed_work_sync(&ioapic->eoi_inject); + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); kvm->arch.vioapic = NULL; kfree(ioapic); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 182fea214c4291..3046b7f278aedd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4015,9 +4015,7 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } @@ -4027,10 +4025,8 @@ long kvm_arch_vm_ioctl(struct file *filp, kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; /* Pairs with smp_rmb() when reading irqchip_mode */ smp_wmb(); - mutex_lock(&kvm->slots_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ From 445ee82d7a9752a2ab3b678fc348c91a54c1782e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:30 +0200 Subject: [PATCH 165/227] KVM: x86: KVM_IRQCHIP_PIC_MASTER only has 8 pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, one could set pin 8-15, implicitly referring to KVM_IRQCHIP_PIC_SLAVE. Get rid of the two local variables max_pin and delta on the way. 
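For context, the kernel keeps both PICs in a single 0-15 pin space in its routing bookkeeping, which is where the old delta = 8 came from. The mapping, roughly:

	userspace route                         internal pin
	KVM_IRQCHIP_PIC_MASTER, pin 0-7    ->   0-7
	KVM_IRQCHIP_PIC_SLAVE,  pin 0-7    ->   8-15
	KVM_IRQCHIP_PIC_MASTER, pin 8-15   ->   rejected with -EINVAL after this
	                                        patch (previously accepted and
	                                        silently aliased onto the slave)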
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq_comm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d82fc1443a6381..fb1e37ea2674f6 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -279,8 +279,6 @@ int kvm_set_routing_entry(struct kvm *kvm, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; - int delta; - unsigned max_pin; /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) @@ -292,26 +290,25 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) goto out; - delta = 0; + e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: - delta = 8; + e->irqchip.pin += PIC_NUM_PINS / 2; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + goto out; e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - max_pin = KVM_IOAPIC_NUM_PINS; + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + goto out; e->set = kvm_set_ioapic_irq; break; default: goto out; } e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= max_pin) - goto out; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; From d62f270b2d5015d30d7f58f8429a5a26993b22a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:31 +0200 Subject: [PATCH 166/227] KVM: x86: remove all-vcpu request from kvm_ioapic_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_ioapic_init() is guaranteed to be called without any created VCPUs, so doing an all-vcpu request results in a NOP. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e1d8cbe2fd928..3072cdf15bfb7c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -622,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); - return ret; } - kvm_vcpu_request_scan_ioapic(kvm); return ret; } From ca8ab3f8959c141b89dd682daafd2e64b01c295d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:32 +0200 Subject: [PATCH 167/227] KVM: x86: directly call kvm_make_scan_ioapic_request() in ioapic.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We know there is an ioapic, so let's call it directly. 
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 3072cdf15bfb7c..dc29a2785b81fe 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -313,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_vcpu_request_scan_ioapic(ioapic->kvm); + kvm_make_scan_ioapic_request(ioapic->kvm); break; } } @@ -660,7 +660,7 @@ void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); } From 993225adf4af20a0e50e37c3d4894b79c98e01c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:33 +0200 Subject: [PATCH 168/227] KVM: x86: rename kvm_vcpu_request_scan_ioapic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's rename it into a proper arch specific callback. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 2 +- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index dc29a2785b81fe..bdff437acbcb7e 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,7 +266,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { if (!ioapic_in_kernel(kvm)) return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e74ae4d99bbef..397b7b5b1933b8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -502,10 +502,10 @@ int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4d28a9ddbee010..a8d540398bbd03 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -490,7 +490,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -500,7 +500,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_srcu(&kvm->irq_srcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } #endif From 43ae312ca18a0d6d1a363295fa6a2a14a12605e8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:34 +0200 Subject: [PATCH 
169/227] KVM: x86: drop goto label in kvm_set_routing_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need for the goto label + local variable "r". Reviewed-by: Peter Xu Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq_comm.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index fb1e37ea2674f6..d71f7a7fd5c9d3 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -278,18 +278,16 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; - /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) - goto out; + return -EINVAL; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) - goto out; + return -EINVAL; e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: @@ -297,16 +295,16 @@ int kvm_set_routing_entry(struct kvm *kvm, /* fall through */ case KVM_IRQCHIP_PIC_MASTER: if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) - goto out; + return -EINVAL; e->set = kvm_set_pic_irq; break; case KVM_IRQCHIP_IOAPIC: if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) - goto out; + return -EINVAL; e->set = kvm_set_ioapic_irq; break; default: - goto out; + return -EINVAL; } e->irqchip.irqchip = ue->u.irqchip.irqchip; break; @@ -317,7 +315,7 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.data = ue->u.msi.data; if (kvm_msi_route_invalid(kvm, e)) - goto out; + return -EINVAL; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -325,12 +323,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.sint = ue->u.hv_sint.sint; break; default: - goto out; + return -EINVAL; } - r = 0; -out: - return r; + return 0; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, From 8c6b7828c24a69416bcb639d5760c6eaa351fdf9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:35 +0200 Subject: [PATCH 170/227] KVM: x86: cleanup return handling in setup_routing_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's drop the goto and return the error value directly. Suggested-by: Peter Xu Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/irqchip.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 3bcc9990adf79e..cc30d01a56bee9 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -142,8 +142,8 @@ static int setup_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; struct kvm_kernel_irq_routing_entry *ei; + int r; /* * Do not allow GSI to be mapped to the same irqchip more than once. 
@@ -153,20 +153,19 @@ static int setup_routing_entry(struct kvm *kvm, if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) - return r; + return -EINVAL; e->gsi = ue->gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) - goto out; + return r; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); - r = 0; -out: - return r; + + return 0; } void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) From e21d1758b0377c9b1351d76fb36ecd34ca72880d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:36 +0200 Subject: [PATCH 171/227] KVM: x86: simplify pic_unlock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can easily compact this code and get rid of one local variable. Reviewed-by: Peter Xu Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 2da2b429edcdc7..fce571c667c6bc 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; + struct kvm_vcpu *vcpu; int i; s->wakeup_needed = false; @@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s) if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; } } - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); } } From dc24d1d2cbfa3f69c74825253d5931d1ff499205 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:37 +0200 Subject: [PATCH 172/227] KVM: x86: make kvm_pic_reset() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not used outside of i8259.c, so let's make it static. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/irq.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index fce571c667c6bc..0252e680097c23 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -268,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 8fec66d1b9a9ba..0edd22c3344c9e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -114,8 +114,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return mode > KVM_IRQCHIP_INIT_IN_PROGRESS; } -void kvm_pic_reset(struct kvm_kpic_state *s); - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); From 9fecaa9e32ae7370878e5967d8874b6f58360b10 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:38 +0200 Subject: [PATCH 173/227] KVM: x86: drop picdev_in_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have the exact same checks a couple of lines below. 
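For reference, the ports that both the dropped helper and the switch statements below test are the standard PC i8259/ELCR I/O ports:

	0x20/0x21     master PIC command/data
	0xa0/0xa1     slave PIC command/data
	0x4d0/0x4d1   ELCR (edge/level trigger control) for the master/slave PIC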
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 0252e680097c23..5b5c87f4d8bde9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -451,46 +451,33 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } @@ -498,29 +485,31 @@ static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } *(unsigned char *)val = data; - pic_unlock(s); return 0; } From 84a5c79e7048b94100d28f313b00dcbbe457c398 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:39 +0200 Subject: [PATCH 174/227] KVM: x86: set data directly in picdev_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now it looks almost as picdev_write(). 
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 5b5c87f4d8bde9..d481f3346eb5ba 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -484,7 +484,7 @@ static int picdev_write(struct kvm_pic *s, static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - unsigned char data = 0; + unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); @@ -497,19 +497,18 @@ static int picdev_read(struct kvm_pic *s, case 0xa0: case 0xa1: pic_lock(s); - data = pic_ioport_read(&s->pics[addr >> 7], addr); + *data = pic_ioport_read(&s->pics[addr >> 7], addr); pic_unlock(s); break; case 0x4d0: case 0x4d1: pic_lock(s); - data = elcr_ioport_read(&s->pics[addr & 1], addr); + *data = elcr_ioport_read(&s->pics[addr & 1], addr); pic_unlock(s); break; default: return -EOPNOTSUPP; } - *(unsigned char *)val = data; return 0; } From b5e7cf52e102d267646af0ea3dc10a4fd4e51b7d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:40 +0200 Subject: [PATCH 175/227] KVM: x86: simplify pic_ioport_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8259.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d481f3346eb5ba..bdcd4139eca923 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -417,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) return ret; } -static u32 pic_ioport_read(void *opaque, u32 addr1) +static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; - unsigned int addr; int ret; - addr = addr1; - addr &= 1; if (s->poll) { - ret = pic_poll_read(s, addr1); + ret = pic_poll_read(s, addr); s->poll = 0; } else - if (addr == 0) + if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else From 900ab14ca93dce3f95fdb2bd9c4b654d4d5f2571 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:41 +0200 Subject: [PATCH 176/227] KVM: x86: use irqchip_kernel() to check for pic+ioapic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Although the current check is not wrong, this check explicitly includes the pic. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq_comm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d71f7a7fd5c9d3..4517a4c2ac3a1e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -232,7 +232,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!ioapic_in_kernel(kvm)) + if (!irqchip_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); From c5e8ec8e9beffb9481dba27ab5048c03c8585d7d Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Fri, 7 Apr 2017 12:09:52 +0300 Subject: [PATCH 177/227] KVM: x86: remaster kvm_write_tsc code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reuse existing code instead of using inline asm. Make the code more concise and clear in the TSC synchronization part. 
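[Note: minimal stand-alone sketch, not part of the patch. The core of the rework is the synchronization-window test: a TSC write counts as an attempt to synchronize when it lands within one second's worth of cycles of the value extrapolated from the last synchronized write. The names tsc_exp and tsc_hz follow the hunk below; the helper itself is illustrative.]

	#include <stdbool.h>
	#include <stdint.h>

	/* true if 'data' is within +/- 1s (in cycles) of the extrapolated TSC */
	static bool tsc_write_is_synchronizing(uint64_t data,
					       uint64_t last_tsc_write,
					       uint64_t elapsed_cycles,
					       uint64_t virtual_tsc_khz)
	{
		uint64_t tsc_exp = last_tsc_write + elapsed_cycles;
		uint64_t tsc_hz  = virtual_tsc_khz * 1000ULL;

		return data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp;
	}
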
Signed-off-by: Denis Plotnikov Reviewed-by: Roman Kagan Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 52 ++++++++++++---------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3046b7f278aedd..422f803fa36596 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1443,10 +1443,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 usdiff; bool matched; bool already_matched; u64 data = msr->data; + bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); @@ -1454,51 +1454,25 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (usdiff < USEC_PER_SEC && + if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; From bd8fab39cd4fe429f161dfc86bf64993d5c17320 Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Fri, 7 Apr 2017 12:09:53 +0300 Subject: [PATCH 178/227] KVM: x86: fix maintaining of kvm_clock stability on guest CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VCPU TSC synchronization is perfromed in kvm_write_tsc() when the TSC value being set is within 1 second from the expected, as obtained by extrapolating of the TSC in already synchronized VCPUs. This is naturally achieved on all VCPUs at VM start and resume; however on VCPU hotplug it is not: the newly added VCPU is created with TSC == 0 while others are well ahead. To compensate for that, consider host-initiated kvm_write_tsc() with TSC == 0 a special case requiring synchronization regardless of the current TSC on other VCPUs. 
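[Note: condensed outline of the hunk below, for readability only.]

	if (vcpu->arch.virtual_tsc_khz) {
		if (data == 0 && msr->host_initiated) {
			/* vCPU initialization (e.g. hotplug): force sync */
			synchronizing = true;
		} else {
			u64 tsc_exp = kvm->arch.last_tsc_write +
				      nsec_to_cycles(vcpu, elapsed);
			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;

			/* within +/- 1s of the extrapolated TSC? */
			synchronizing = data < tsc_exp + tsc_hz &&
					data + tsc_hz > tsc_exp;
		}
	}
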
Signed-off-by: Denis Plotnikov Reviewed-by: Roman Kagan Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 422f803fa36596..2e4a4727506146 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1454,16 +1454,25 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - u64 tsc_exp = kvm->arch.last_tsc_write + - nsec_to_cycles(vcpu, elapsed); - u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; - /* - * Special case: TSC write with a small delta (1 second) - * of virtual cycle time against real time is - * interpreted as an attempt to synchronize the CPU. - */ - synchronizing = data < tsc_exp + tsc_hz && - data + tsc_hz > tsc_exp; + if (data == 0 && msr->host_initiated) { + /* + * detection of vcpu initialization -- need to sync + * with other vCPUs. This particularly helps to keep + * kvm_clock stable after CPU hotplug + */ + synchronizing = true; + } else { + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } } /* From 5a48a622718cbd2d26d449f38ba40a2b5f7212cf Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 11 Apr 2017 02:49:21 -0700 Subject: [PATCH 179/227] x86/kvm: virt_xxx memory barriers instead of mandatory barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virt_xxx memory barriers are implemented trivially using the low-level __smp_xxx macros, __smp_xxx is equal to a compiler barrier for strong TSO memory model, however, mandatory barriers will unconditional add memory barriers, this patch replaces the rmb() in kvm_steal_clock() by virt_rmb(). Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kernel/kvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 14f65a5f938e4f..da5c0978998488 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; From 405a353a0e20d09090ad96147da6afad9b0ce056 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 6 Apr 2017 15:22:20 +0200 Subject: [PATCH 180/227] KVM: x86: Add MSR_AMD64_DC_CFG to the list of ignored MSRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V writes 0x800000000000 to MSR_AMD64_DC_CFG when running on AMD CPUs as recommended in erratum 383, analogous to our svm_init_erratum_383. By ignoring the MSR, this patch enables running Hyper-V in L1 on AMD. 
Signed-off-by: Ladi Prosek Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e4a4727506146..34bf64fb4dea82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2137,6 +2137,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: break; case MSR_EFER: @@ -2399,6 +2400,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: From 332518706195007f9fbafa69652aa5b3cf72df24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 13 Apr 2017 18:39:18 +0200 Subject: [PATCH 181/227] KVM: nVMX: fix AD condition when handling EPT violation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have introduced this bug when applying and simplifying Paolo's patch as we agreed on the list. The original was "x &= ~y; if (z) x |= y;". Here is the story of a bad workflow: A maintainer was already testing with the intended change, but it was applied only to a testing repo on a different machine. When the time to push tested patches to kvm/next came, he realized that this change was missing and quickly added it to the maintenance repo, didn't test again (because the change is trivial, right), and pushed the world to fire. Fixes: ae1e2d1082ae ("kvm: nVMX: support EPT accessed/dirty bits") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cfdb0d9389d1f6..837f6dd1ae9c9c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6221,7 +6221,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * page table accesses are reads or writes. */ u64 eptp = nested_ept_get_cr3(vcpu); - if (eptp & VMX_EPT_AD_ENABLE_BIT) + if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; } From 6c7a5dce22b3f3cc44be098e2837fa6797edb8b8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 18 Apr 2017 17:59:58 +0200 Subject: [PATCH 182/227] KVM: arm/arm64: fix races in kvm_psci_vcpu_on Fix potential races in kvm_psci_vcpu_on() by taking the kvm->lock mutex. In general, it's a bad idea to allow more than one PSCI_CPU_ON to process the same target VCPU at the same time. One such problem that may arise is that one PSCI_CPU_ON could be resetting the target vcpu, which fills the entire sys_regs array with a temporary value including the MPIDR register, while another looks up the VCPU based on the MPIDR value, resulting in no target VCPU found. Resolves both races found with the kvm-unit-tests/arm/psci unit test. 
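[Note: both PSCI dispatchers are fixed with the same pattern, serializing CPU_ON against other VM-wide operations on kvm->lock; excerpted from the hunks below.]

	case PSCI_0_2_FN_CPU_ON:
	case PSCI_0_2_FN64_CPU_ON:
		mutex_lock(&kvm->lock);
		val = kvm_psci_vcpu_on(vcpu);
		mutex_unlock(&kvm->lock);
		break;
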
Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall Reported-by: Levente Kurusa Suggested-by: Christoffer Dall Signed-off-by: Andrew Jones Cc: stable@vger.kernel.org Signed-off-by: Christoffer Dall --- arch/arm/kvm/psci.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index c2b131527a643f..a08d7a93aebbec 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -208,9 +208,10 @@ int kvm_psci_version(struct kvm_vcpu *vcpu) static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { - int ret = 1; + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; + int ret = 1; switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: @@ -230,7 +231,9 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; case PSCI_0_2_FN_CPU_ON: case PSCI_0_2_FN64_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: case PSCI_0_2_FN64_AFFINITY_INFO: @@ -279,6 +282,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; @@ -288,7 +292,9 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; default: val = PSCI_RET_NOT_SUPPORTED; From ff567614d58551b650a2375b50be368fbfed5cd5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 19 Apr 2017 12:15:26 +0100 Subject: [PATCH 183/227] KVM: arm/arm64: vgic-v3: De-optimize VMCR save/restore when emulating a GICv2 When emulating a GICv2-on-GICv3, special care must be taken to only save/restore VMCR_EL2 when ICC_SRE_EL1.SRE is cleared. Otherwise, all Group-0 interrupts end-up being delivered as FIQ, which is probably not what the guest expects, as demonstrated here with an unhappy EFI: FIQ Exception at 0x000000013BD21CC4 This means that we cannot perform the load/put trick when dealing with VMCR_EL2 (because the host has SRE set), and we have to deal with it in the world-switch. Fortunately, this is not the most common case (modern guests should be able to deal with GICv3 directly), and the performance is not worse than what it was before the VMCR optimization. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/hyp/vgic-v3-sr.c | 8 ++++++-- virt/kvm/arm/vgic/vgic-v3.c | 11 +++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 3d0b1ddb692982..91922c1eddc8e6 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -128,8 +128,10 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) * Make sure stores to the GIC via the memory mapped interface * are now visible to the system register interface. */ - if (!cpu_if->vgic_sre) + if (!cpu_if->vgic_sre) { dsb(st); + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + } if (used_lrs) { int i; @@ -205,11 +207,13 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) * delivered as a FIQ to the guest, with potentially fatal * consequences. So we must make sure that ICC_SRE_EL1 has * been actually programmed with the value we want before - * starting to mess with the rest of the GIC. 
+ * starting to mess with the rest of the GIC, and VMCR_EL2 in + * particular. */ if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); } val = read_gicreg(ICH_VTR_EL2); diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index bc7010db9f4d36..df150365030076 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -373,12 +373,19 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (likely(cpu_if->vgic_sre)) + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); } void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); + if (likely(cpu_if->vgic_sre)) + cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); } From cffcd9df10daa753610d79f018466f9c61603b97 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 10 Apr 2017 10:19:44 +0100 Subject: [PATCH 184/227] KVM: arm/arm64: vgic-v3: Fix off-by-one LR access When iterating over the used LRs, be careful not to try to access an unused LR, or even an unimplemented one if you're unlucky... Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/hyp/vgic-v3-sr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 91922c1eddc8e6..bce6037cf01d31 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -143,7 +143,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) val = read_gicreg(ICH_VTR_EL2); nr_pri_bits = vtr_to_nr_pri_bits(val); - for (i = 0; i <= used_lrs; i++) { + for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else From 307d927967007acef98cfd3f0639c7a4bf234ede Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Mar 2017 21:02:08 +1100 Subject: [PATCH 185/227] KVM: PPC: Provide functions for queueing up FP/VEC/VSX unavailable interrupts This provides functions that can be used for generating interrupts indicating that a given functional unit (floating point, vector, or VSX) is unavailable. These functions will be used in instruction emulation code. 
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 3 +++ arch/powerpc/kvm/book3s.c | 18 ++++++++++++++++++ arch/powerpc/kvm/booke.c | 5 +++++ 3 files changed, 26 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index dd11c4c8c56a0c..4f1f22fc0ea15c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -132,6 +132,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); +extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b6b5c185bd92f8..0ff0d07c07575c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -197,6 +197,24 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) } EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, 0); +} + +void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, 0); +} + +void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, 0); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 0514cbd4e533ac..3c296c2eacf8b9 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -300,6 +300,11 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER); From 6f63e81bda98cbb549b01faf978884692ded438d Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Tue, 21 Feb 2017 21:12:36 +0800 Subject: [PATCH 186/227] KVM: PPC: Book3S: Add MMIO emulation for FP and VSX instructions This patch provides the MMIO load/store emulation for instructions of 'double & vector unsigned char & vector signed char & vector unsigned short & vector signed short & vector unsigned int & vector signed int & vector double '. 
The instructions that this adds emulation for are: - ldx, ldux, lwax, - lfs, lfsx, lfsu, lfsux, lfd, lfdx, lfdu, lfdux, - stfs, stfsx, stfsu, stfsux, stfd, stfdx, stfdu, stfdux, stfiwx, - lxsdx, lxsspx, lxsiwax, lxsiwzx, lxvd2x, lxvw4x, lxvdsx, - stxsdx, stxsspx, stxsiwx, stxvd2x, stxvw4x [paulus@ozlabs.org - some cleanups, fixes and rework, make it compile for Book E, fix build when PR KVM is built in] Signed-off-by: Bin Lu Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/disassemble.h | 5 + arch/powerpc/include/asm/kvm_host.h | 23 ++ arch/powerpc/include/asm/kvm_ppc.h | 7 + arch/powerpc/include/asm/ppc-opcode.h | 50 ++++ arch/powerpc/kvm/emulate_loadstore.c | 335 ++++++++++++++++++++++++- arch/powerpc/kvm/powerpc.c | 318 ++++++++++++++++++++++- 6 files changed, 731 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 4852e849128bf8..c0a55050f70f41 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -87,6 +87,11 @@ static inline unsigned int get_oc(u32 inst) return (inst >> 11) & 0x7fff; } +static inline unsigned int get_tx_or_sx(u32 inst) +{ + return (inst) & 0x1; +} + #define IS_XFORM(inst) (get_op(inst) == 31) #define IS_DSFORM(inst) (get_op(inst) >= 56) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 01d05c76f1c7dd..2acc691ed9d0da 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -438,6 +438,11 @@ struct mmio_hpte_cache { unsigned int index; }; +#define KVMPPC_VSX_COPY_NONE 0 +#define KVMPPC_VSX_COPY_WORD 1 +#define KVMPPC_VSX_COPY_DWORD 2 +#define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP 3 + struct openpic; struct kvm_vcpu_arch { @@ -641,6 +646,21 @@ struct kvm_vcpu_arch { u8 io_gpr; /* GPR used as IO source/target */ u8 mmio_host_swabbed; u8 mmio_sign_extend; + /* conversion between single and double precision */ + u8 mmio_sp64_extend; + /* + * Number of simulations for vsx. + * If we use 2*8bytes to simulate 1*16bytes, + * then the number should be 2 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD. + * If we use 4*4bytes to simulate 1*16bytes, + * the number should be 4 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD. 
+ */ + u8 mmio_vsx_copy_nums; + u8 mmio_vsx_offset; + u8 mmio_vsx_copy_type; + u8 mmio_vsx_tx_sx_enabled; u8 osi_needed; u8 osi_enabled; u8 papr_enabled; @@ -729,6 +749,8 @@ struct kvm_vcpu_arch { }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] +#define VCPU_VSX_FPR(vcpu, i, j) ((vcpu)->arch.fp.fpr[i][j]) +#define VCPU_VSX_VR(vcpu, i) ((vcpu)->arch.vr.vr[i]) /* Values for vcpu->arch.state */ #define KVMPPC_VCPU_NOTREADY 0 @@ -742,6 +764,7 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_FPR 0x0020 #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 +#define KVM_MMIO_REG_VSX 0x0080 #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4f1f22fc0ea15c..bbecec4e753a96 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -78,9 +78,15 @@ extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend); extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, + int is_default_endian); extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, u32 *inst); @@ -243,6 +249,7 @@ union kvmppc_one_reg { u64 dval; vector128 vval; u64 vsxval[2]; + u32 vsx32val[4]; struct { u64 addr; u64 length; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e7d6d86563eeda..1e37c3c0733a82 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -86,7 +86,9 @@ #define OP_TRAP_64 2 #define OP_31_XOP_TRAP 4 +#define OP_31_XOP_LDX 21 #define OP_31_XOP_LWZX 23 +#define OP_31_XOP_LDUX 53 #define OP_31_XOP_DCBST 54 #define OP_31_XOP_LWZUX 55 #define OP_31_XOP_TRAP_64 68 @@ -99,6 +101,7 @@ #define OP_31_XOP_LHZX 279 #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MFSPR 339 +#define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 #define OP_31_XOP_LHAUX 375 #define OP_31_XOP_STHX 407 @@ -108,10 +111,46 @@ #define OP_31_XOP_LWBRX 534 #define OP_31_XOP_TLBSYNC 566 #define OP_31_XOP_STWBRX 662 +#define OP_31_XOP_STFSX 663 +#define OP_31_XOP_STFSUX 695 +#define OP_31_XOP_STFDX 727 +#define OP_31_XOP_STFDUX 759 #define OP_31_XOP_LHBRX 790 #define OP_31_XOP_STHBRX 918 +#define OP_31_XOP_STFIWX 983 + +/* VSX Scalar Load Instructions */ +#define OP_31_XOP_LXSDX 588 +#define OP_31_XOP_LXSSPX 524 +#define OP_31_XOP_LXSIWAX 76 +#define OP_31_XOP_LXSIWZX 12 + +/* VSX Scalar Store Instructions */ +#define OP_31_XOP_STXSDX 716 +#define OP_31_XOP_STXSSPX 652 +#define OP_31_XOP_STXSIWX 140 + +/* VSX Vector Load Instructions */ +#define OP_31_XOP_LXVD2X 844 +#define OP_31_XOP_LXVW4X 780 + +/* VSX Vector Load and Splat Instruction */ +#define OP_31_XOP_LXVDSX 332 + +/* VSX Vector Store Instructions */ +#define OP_31_XOP_STXVD2X 972 +#define OP_31_XOP_STXVW4X 908 + +#define OP_31_XOP_LFSX 535 +#define OP_31_XOP_LFSUX 567 +#define OP_31_XOP_LFDX 599 +#define OP_31_XOP_LFDUX 631 #define OP_LWZ 32 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 #define OP_LD 58 
#define OP_LWZU 33 #define OP_LBZ 34 @@ -127,6 +166,17 @@ #define OP_LHAU 43 #define OP_STH 44 #define OP_STHU 45 +#define OP_LMW 46 +#define OP_STMW 47 +#define OP_LFS 48 +#define OP_LFSU 49 +#define OP_LFD 50 +#define OP_LFDU 51 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 +#define OP_LQ 56 /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 6d3c0ee1d744bb..9cda1b9dbc5192 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -34,6 +34,30 @@ #include "timing.h" #include "trace.h" +#ifdef CONFIG_PPC_FPU +static bool kvmppc_check_fp_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { + kvmppc_core_queue_fpunavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_VSX +static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_VSX)) { + kvmppc_core_queue_vsx_unavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_VSX */ + /* XXX to do: * lhax * lhaux @@ -66,6 +90,19 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); + /* + * if mmio_vsx_tx_sx_enabled == 0, copy data between + * VSR[0..31] and memory + * if mmio_vsx_tx_sx_enabled == 1, copy data between + * VSR[32..63] and memory + */ + vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst); + vcpu->arch.mmio_vsx_copy_nums = 0; + vcpu->arch.mmio_vsx_offset = 0; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE; + vcpu->arch.mmio_sp64_extend = 0; + vcpu->arch.mmio_sign_extend = 0; + switch (get_op(inst)) { case 31: switch (get_xop(inst)) { @@ -157,6 +194,230 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) 2, 0); break; + case OP_31_XOP_LDX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + + case OP_31_XOP_LDUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LWAX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + +#ifdef CONFIG_PPC_FPU + case OP_31_XOP_LFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LFDX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_31_XOP_LFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFDX: + if 
(kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + break; + + case OP_31_XOP_STFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFIWX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + break; +#endif + +#ifdef CONFIG_VSX + case OP_31_XOP_LXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXSIWAX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 1); + break; + + case OP_31_XOP_LXSIWZX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVD2X: + /* + * In this case, the official load/store process is like this: + * Step1, exit from vm by page fault isr, then kvm save vsr. + * Please see guest_exit_cont->store_fp_state->SAVE_32VSRS + * as reference. + * + * Step2, copy data between memory and VCPU + * Notice: for LXVD2X/STXVD2X/LXVW4X/STXVW4X, we use + * 2copies*8bytes or 4copies*4bytes + * to simulate one copy of 16bytes. + * Also there is an endian issue here, we should notice the + * layout of memory. + * Please see MARCO of LXVD2X_ROT/STXVD2X_ROT as more reference. + * If host is little-endian, kvm will call XXSWAPD for + * LXVD2X_ROT/STXVD2X_ROT. + * So, if host is little-endian, + * the postion of memeory should be swapped. + * + * Step3, return to guest, kvm reset register. + * Please see kvmppc_hv_entry->load_fp_state->REST_32VSRS + * as reference. 
+ */ + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVDSX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_STXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); + break; + + case OP_31_XOP_STXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXSIWX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_offset = 1; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXVD2X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); + break; + + case OP_31_XOP_STXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; +#endif /* CONFIG_VSX */ default: emulated = EMULATE_FAIL; break; @@ -167,7 +428,45 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ +#ifdef CONFIG_PPC_FPU + case OP_STFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + break; + + case OP_STFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + break; + + case OP_STFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; +#endif + + /* TBD: Add support for other 64 bit load variants like ldu etc. 
*/ case OP_LD: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); @@ -252,6 +551,40 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; +#ifdef CONFIG_PPC_FPU + case OP_LFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_LFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_LFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; +#endif + default: emulated = EMULATE_FAIL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e42aa8a279f32..6c7244879bfe8c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -801,6 +802,129 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); } +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_dword_offset(int index) +{ + int offset; + + if ((index != 0) && (index != 1)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 1 - index; +#endif + + return offset; +} + +static inline int kvmppc_get_vsr_word_offset(int index) +{ + int offset; + + if ((index > 3) || (index < 0)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 3 - index; +#endif + return offset; +} + +static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[offset] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, offset) = gpr; + } +} + +static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[0] = gpr; + val.vsxval[1] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, 0) = gpr; + VCPU_VSX_FPR(vcpu, index, 1) = gpr; + } +} + +static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu, + u32 gpr32) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + int dword_offset, word_offset; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsx32val[offset] = gpr32; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + dword_offset = offset / 2; + word_offset = offset % 2; + val.vsxval[0] = VCPU_VSX_FPR(vcpu, index, dword_offset); + val.vsx32val[word_offset] = gpr32; 
+ VCPU_VSX_FPR(vcpu, index, dword_offset) = val.vsxval[0]; + } +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_PPC_FPU +static inline u64 sp_to_dp(u32 fprs) +{ + u64 fprd; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfs%U1%X1 0,%1; stfd%U0%X0 0,%0" : "=m" (fprd) : "m" (fprs) + : "fr0"); + preempt_enable(); + return fprd; +} + +static inline u32 dp_to_sp(u64 fprd) +{ + u32 fprs; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfd%U1%X1 0,%1; stfs%U0%X0 0,%0" : "=m" (fprs) : "m" (fprd) + : "fr0"); + preempt_enable(); + return fprs; +} + +#else +#define sp_to_dp(x) (x) +#define dp_to_sp(x) (x) +#endif /* CONFIG_PPC_FPU */ + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -827,6 +951,10 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } + /* conversion between single and double precision */ + if ((vcpu->arch.mmio_sp64_extend) && (run->mmio.len == 4)) + gpr = sp_to_dp(gpr); + if (vcpu->arch.mmio_sign_extend) { switch (run->mmio.len) { #ifdef CONFIG_PPC64 @@ -843,8 +971,6 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } - kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); @@ -860,6 +986,17 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; +#endif +#ifdef CONFIG_VSX + case KVM_MMIO_REG_VSX: + if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_DWORD) + kvmppc_set_vsr_dword(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_WORD) + kvmppc_set_vsr_word(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP) + kvmppc_set_vsr_dword_dump(vcpu, gpr); + break; #endif default: BUG(); @@ -927,6 +1064,35 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 1); } +#ifdef CONFIG_VSX +int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend) +{ + enum emulation_result emulated = EMULATE_DONE; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, + is_default_endian, mmio_sign_extend); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + return emulated; +} +#endif /* CONFIG_VSX */ + int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian) { @@ -952,6 +1118,9 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; + if ((vcpu->arch.mmio_sp64_extend) && (bytes == 4)) + val = dp_to_sp(val); + /* Store the value at the lowest bytes in 'data'. 
*/ if (!host_swabbed) { switch (bytes) { @@ -985,6 +1154,129 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_handle_store); +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val) +{ + u32 dword_offset, word_offset; + union kvmppc_one_reg reg; + int vsx_offset = 0; + int copy_type = vcpu->arch.mmio_vsx_copy_type; + int result = 0; + + switch (copy_type) { + case KVMPPC_VSX_COPY_DWORD: + vsx_offset = + kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + *val = VCPU_VSX_FPR(vcpu, rs, vsx_offset); + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsxval[vsx_offset]; + } + break; + + case KVMPPC_VSX_COPY_WORD: + vsx_offset = + kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + dword_offset = vsx_offset / 2; + word_offset = vsx_offset % 2; + reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset); + *val = reg.vsx32val[word_offset]; + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsx32val[vsx_offset]; + } + break; + + default: + result = -1; + break; + } + + return result; +} + +int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, int is_default_endian) +{ + u64 val; + enum emulation_result emulated = EMULATE_DONE; + + vcpu->arch.io_gpr = rs; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) + return EMULATE_FAIL; + + emulated = kvmppc_handle_store(run, vcpu, + val, bytes, is_default_endian); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + return emulated; +} + +static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + enum emulation_result emulated = EMULATE_FAIL; + int r; + + vcpu->arch.paddr_accessed += run->mmio.len; + + if (!vcpu->mmio_is_write) { + emulated = kvmppc_handle_vsx_load(run, vcpu, vcpu->arch.io_gpr, + run->mmio.len, 1, vcpu->arch.mmio_sign_extend); + } else { + emulated = kvmppc_handle_vsx_store(run, vcpu, + vcpu->arch.io_gpr, run->mmio.len, 1); + } + + switch (emulated) { + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST; + break; + case EMULATE_FAIL: + pr_info("KVM: MMIO emulation failed (VSX repeat)\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + r = RESUME_HOST; + break; + default: + r = RESUME_GUEST; + break; + } + return r; +} +#endif /* CONFIG_VSX */ + int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { int r = 0; @@ -1087,13 +1379,24 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int r; sigset_t sigsaved; - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) kvmppc_complete_mmio_load(vcpu, run); - vcpu->mmio_needed = 0; +#ifdef CONFIG_VSX + if (vcpu->arch.mmio_vsx_copy_nums > 0) { + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + if 
(vcpu->arch.mmio_vsx_copy_nums > 0) { + r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run); + if (r == RESUME_HOST) { + vcpu->mmio_needed = 1; + return r; + } + } +#endif } else if (vcpu->arch.osi_needed) { u64 *gprs = run->osi.gprs; int i; @@ -1115,6 +1418,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + if (run->immediate_exit) r = -EINTR; else From 91242fd1a3eb96e4efe43bdf96c2fcec97fdf4ff Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 17 Mar 2017 19:31:38 +1100 Subject: [PATCH 187/227] KVM: PPC: Add MMIO emulation for stdx (store doubleword indexed) This adds missing stdx emulation for emulated MMIO accesses by KVM guests. This allows the Mellanox mlx5_core driver from recent kernels to work when MMIO emulation is enforced by userspace. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/kvm/emulate_loadstore.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 1e37c3c0733a82..94e7df22630efe 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -94,6 +94,7 @@ #define OP_31_XOP_TRAP_64 68 #define OP_31_XOP_DCBF 86 #define OP_31_XOP_LBZX 87 +#define OP_31_XOP_STDX 149 #define OP_31_XOP_STWX 151 #define OP_31_XOP_STBX 215 #define OP_31_XOP_LBZUX 119 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9cda1b9dbc5192..a0f27a30379d8e 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -119,6 +119,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; + case OP_31_XOP_STDX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 8, 1); + break; + case OP_31_XOP_STWX: emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), From ceba57df43a25ede55a96a795703c46022b1d1d0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 21 Mar 2017 15:43:47 +1100 Subject: [PATCH 188/227] KVM: PPC: Emulation for more integer loads and stores This adds emulation for the following integer loads and stores, thus enabling them to be used in a guest for accessing emulated MMIO locations. - lhaux - lwaux - lwzux - ldu - lwa - stdux - stwux - stdu - ldbrx - stdbrx Previously, most of these would cause an emulation failure exit to userspace, though ldu and lwa got treated incorrectly as ld, and stdu got treated incorrectly as std. This also tidies up some of the formatting and updates the comment listing instructions that still need to be implemented. With this, all integer loads and stores that are defined in the Power ISA v2.07 are emulated, except for those that are permitted to trap when used on cache-inhibited or write-through mappings (and which do in fact trap on POWER8), that is, lmw/stmw, lswi/stswi, lswx/stswx, lq/stq, and l[bhwdq]arx/st[bhwdq]cx. 
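[Note: condensed from the hunks below. The previously mishandled ldu and lwa are DS-form encodings that share major opcode OP_LD and differ only in the two low-order bits of the instruction word, which is how the patch tells them apart.]

	case OP_LD:
		rt = get_rt(inst);
		switch (inst & 3) {
		case 0:		/* ld */
			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
			break;
		case 1:		/* ldu: also write the EA back to ra */
			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
			break;
		case 2:		/* lwa: sign-extending 32-bit load */
			emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1);
			break;
		default:
			emulated = EMULATE_FAIL;
		}
		break;
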
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 5 + arch/powerpc/kvm/emulate_loadstore.c | 135 ++++++++++++++++---------- 2 files changed, 91 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 94e7df22630efe..738bac164f8bb7 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -96,6 +96,8 @@ #define OP_31_XOP_LBZX 87 #define OP_31_XOP_STDX 149 #define OP_31_XOP_STWX 151 +#define OP_31_XOP_STDUX 181 +#define OP_31_XOP_STWUX 183 #define OP_31_XOP_STBX 215 #define OP_31_XOP_LBZUX 119 #define OP_31_XOP_STBUX 247 @@ -104,13 +106,16 @@ #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 +#define OP_31_XOP_LWAUX 373 #define OP_31_XOP_LHAUX 375 #define OP_31_XOP_STHX 407 #define OP_31_XOP_STHUX 439 #define OP_31_XOP_MTSPR 467 #define OP_31_XOP_DCBI 470 +#define OP_31_XOP_LDBRX 532 #define OP_31_XOP_LWBRX 534 #define OP_31_XOP_TLBSYNC 566 +#define OP_31_XOP_STDBRX 660 #define OP_31_XOP_STWBRX 662 #define OP_31_XOP_STFSX 663 #define OP_31_XOP_STFSUX 695 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index a0f27a30379d8e..f10ba0c2c202b2 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -58,18 +58,14 @@ static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) } #endif /* CONFIG_VSX */ -/* XXX to do: - * lhax - * lhaux - * lswx - * lswi - * stswx - * stswi - * lha - * lhau - * lmw - * stmw +/* + * XXX to do: + * lfiwax, lfiwzx + * vector loads and stores * + * Instructions that trap when used on cache-inhibited mappings + * are not emulated here: multiple and string instructions, + * lq/stq, and the load-reserve/store-conditional instructions. 
*/ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { @@ -110,6 +106,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; + case OP_31_XOP_LWZUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LBZX: emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; @@ -121,26 +122,34 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STDX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 8, 1); + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + + case OP_31_XOP_STDUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STWX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STWUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STBX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_31_XOP_STBUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -148,6 +157,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; + case OP_31_XOP_LHAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LHZX: emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; @@ -159,14 +173,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_31_XOP_STHUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -186,8 +198,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STWBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 0); + kvmppc_get_gpr(vcpu, rs), 4, 0); break; case OP_31_XOP_LHBRX: @@ -196,8 +207,16 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 0); + kvmppc_get_gpr(vcpu, rs), 2, 0); + break; + + case OP_31_XOP_LDBRX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 0); + break; + + case OP_31_XOP_STDBRX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 0); break; case OP_31_XOP_LDX: @@ -213,6 +232,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); break; + case OP_31_XOP_LWAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + #ifdef CONFIG_PPC_FPU case OP_31_XOP_LFSX: if (kvmppc_check_fp_disabled(vcpu)) @@ -267,16 +291,14 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 8, 1); + VCPU_FPR(vcpu, rs), 8, 1); break; case OP_31_XOP_STFDUX: if (kvmppc_check_fp_disabled(vcpu)) 
return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 8, 1); + VCPU_FPR(vcpu, rs), 8, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -284,8 +306,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; emulated = kvmppc_handle_store(run, vcpu, - VCPU_FPR(vcpu, rs), - 4, 1); + VCPU_FPR(vcpu, rs), 4, 1); break; #endif @@ -472,10 +493,22 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) break; #endif - /* TBD: Add support for other 64 bit load variants like ldu etc. */ case OP_LD: rt = get_rt(inst); - emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + switch (inst & 3) { + case 0: /* ld */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + case 1: /* ldu */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case 2: /* lwa */ + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_LWZU: @@ -498,31 +531,37 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) 4, 1); break; - /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ case OP_STD: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 8, 1); + switch (inst & 3) { + case 0: /* std */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + case 1: /* stdu */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_STWU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STB: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -546,14 +585,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_STH: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; From 9b5ab0051306033b0b1e83dd72e1f84a0c8dd4bf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Mar 2017 11:55:16 +1100 Subject: [PATCH 189/227] KVM: PPC: Add MMIO emulation for remaining floating-point instructions For completeness, this adds emulation of the lfiwax and lfiwzx instructions. With this, all floating-point load and store instructions as of Power ISA V2.07 are emulated. 
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/kvm/emulate_loadstore.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 738bac164f8bb7..73f06f4dddc78a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -122,6 +122,8 @@ #define OP_31_XOP_STFDX 727 #define OP_31_XOP_STFDUX 759 #define OP_31_XOP_LHBRX 790 +#define OP_31_XOP_LFIWAX 855 +#define OP_31_XOP_LFIWZX 887 #define OP_31_XOP_STHBRX 918 #define OP_31_XOP_STFIWX 983 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index f10ba0c2c202b2..af833531af3191 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -270,6 +270,20 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; + case OP_31_XOP_LFIWAX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_loads(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFIWZX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + case OP_31_XOP_STFSX: if (kvmppc_check_fp_disabled(vcpu)) return EMULATE_DONE; From a1c52e1c7cb9fb8714b89b7bb5773c498e39e38b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 20 Jan 2017 11:00:08 +0100 Subject: [PATCH 190/227] KVM: PPC: Book3S HV: Use common error handling code in kvmppc_clr_passthru_irq() Add a jump target so that a bit of exception handling can be better reused at the end of this function. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1ec86d9e2a82a3..06b7d8ae27e522 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3624,11 +3624,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -EIO; mutex_lock(&kvm->lock); + if (!kvm->arch.pimap) + goto unlock; - if (kvm->arch.pimap == NULL) { - mutex_unlock(&kvm->lock); - return 0; - } pimap = kvm->arch.pimap; for (i = 0; i < pimap->n_mapped; i++) { @@ -3650,7 +3648,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * We don't free this structure even when the count goes to * zero. The structure is freed when we destroy the VM. */ - + unlock: mutex_unlock(&kvm->lock); return 0; } From 37655490db757a8ef9141830428485395b86e8fa Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 20 Jan 2017 16:30:18 +0100 Subject: [PATCH 191/227] KVM: PPC: e500: Use kcalloc() in e500_mmu_host_init() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "kcalloc". This issue was detected by using the Coccinelle software. * Replace the specification of a data type by a pointer dereference to make the corresponding size determination a bit safer according to the Linux coding style convention. 
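As a generic illustration of the idiom being applied (a sketch, not taken from the kernel sources): kcalloc() multiplies the element count by the element size with overflow checking and zeroes the result, and sizeof(*ptr) keeps the element size tied to the pointer's type rather than to a spelled-out type name:

/* sketch: overflow-safe, type-safe zeroed array allocation; nentries is hypothetical */
unsigned int *map;

map = kcalloc(nentries, sizeof(*map), GFP_KERNEL);  /* instead of kzalloc(sizeof(unsigned int) * nentries, GFP_KERNEL) */
if (!map)
	return -ENOMEM;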
Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500_mmu_host.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 0fda4230f6c0f8..77fd043b3ecc50 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -797,9 +797,8 @@ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) host_tlb_params[0].sets = host_tlb_params[0].entries / host_tlb_params[0].ways; host_tlb_params[1].sets = 1; - - vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * - host_tlb_params[1].entries, + vcpu_e500->h2g_tlb1_rmap = kcalloc(host_tlb_params[1].entries, + sizeof(*vcpu_e500->h2g_tlb1_rmap), GFP_KERNEL); if (!vcpu_e500->h2g_tlb1_rmap) return -EINVAL; From 9eecec126e006768c21ba2b4588ce5b0e23fda0b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:47:13 +1100 Subject: [PATCH 192/227] KVM: PPC: Book3S PR: Get rid of unused local variable @is_mmio has never been used since introduction in commit 2f4cf5e42d13 ("Add book3s.c") from 2009. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d4dfc0ca2a4444..633502f52bbbf3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -538,7 +538,6 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int relocated; int page_found = 0; struct kvmppc_pte pte; - bool is_mmio = false; bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; @@ -616,8 +615,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest SLB */ kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); - } else if (!is_mmio && - kvmppc_visible_gpa(vcpu, pte.raddr)) { + } else if (kvmppc_visible_gpa(vcpu, pte.raddr)) { if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { /* * There is already a host HPTE there, presumably From bd9166ffe624000140fc6b606b256df01fc0d060 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:48:10 +1100 Subject: [PATCH 193/227] KVM: PPC: Book3S PR: Exit KVM on failed mapping At the moment kvmppc_mmu_map_page() returns -1 if mmu_hash_ops.hpte_insert() fails for any reason so the page fault handler resumes the guest and it faults on the same address again. This adds distinction to kvmppc_mmu_map_page() to return -EIO if mmu_hash_ops.hpte_insert() failed for a reason other than full pteg. At the moment only pSeries_lpar_hpte_insert() returns -2 if plpar_pte_enter() failed with a code other than H_PTEG_FULL. Other mmu_hash_ops.hpte_insert() instances can only fail with -1 "full pteg". With this change, if PR KVM fails to update HPT, it can signal the userspace about this instead of returning to guest and having the very same page fault over and over again. 
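For context, here is a rough userspace-side sketch (not part of the patch) of how a VMM would observe the new behaviour: once the mapping failure is reported as KVM_EXIT_INTERNAL_ERROR, the error shows up on return from KVM_RUN. The vcpu_fd and the mmap()ed kvm_run area are assumed to have been set up by the usual KVM ioctls elsewhere:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* run is the mmap()ed struct kvm_run for this vcpu */
static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) {
		/* the host failed to insert the HPTE; abort instead of retrying forever */
		fprintf(stderr, "KVM internal error while mapping guest memory\n");
		return -1;
	}
	return 0;
}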
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_host.c | 5 ++++- arch/powerpc/kvm/book3s_pr.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index a587e8f4fd2648..4b4e927c4822c3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -177,12 +177,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, ret = mmu_hash_ops.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, hpsize, hpsize, MMU_SEGSIZE_256M); - if (ret < 0) { + if (ret == -1) { /* If we couldn't map a primary PTE, try a secondary */ hash = ~hash; vflags ^= HPTE_V_SECONDARY; attempt++; goto map_again; + } else if (ret < 0) { + r = -EIO; + goto out_unlock; } else { trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 633502f52bbbf3..ce437b98477e4c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -625,7 +625,11 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_unmap_page(vcpu, &pte); } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte, iswrite); + if (kvmppc_mmu_map_page(vcpu, &pte, iswrite) == -EIO) { + /* Exit KVM if mapping failed */ + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && From 96df2267695199b9377bd641c7eb68c393b81b0b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 24 Mar 2017 17:49:22 +1100 Subject: [PATCH 194/227] KVM: PPC: Book3S PR: Preserve storage control bits PR KVM page fault handler performs eaddr to pte translation for a guest, however kvmppc_mmu_book3s_64_xlate() does not preserve WIMG bits (storage control) in the kvmppc_pte struct. If PR KVM is running as a second level guest under HV KVM, and PR KVM tries inserting HPT entry, this fails in HV KVM if it already has this mapping. This preserves WIMG bits between kvmppc_mmu_book3s_64_xlate() and kvmppc_mmu_map_page(). 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_mmu.c | 1 + arch/powerpc/kvm/book3s_64_mmu_host.c | 2 ++ arch/powerpc/kvm/book3s_pr.c | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2acc691ed9d0da..0f3ac09cbfe01e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -342,6 +342,7 @@ struct kvmppc_pte { bool may_read : 1; bool may_write : 1; bool may_execute : 1; + unsigned long wimg; u8 page_size; /* MMU_PAGE_xxx */ }; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 70153578131a04..29ebe2fd58674c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -319,6 +319,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_execute = true; gpte->may_read = false; gpte->may_write = false; + gpte->wimg = r & HPTE_R_WIMG; switch (pp) { case 0: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 4b4e927c4822c3..145a61892c4881 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -145,6 +145,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, else kvmppc_mmu_flush_icache(pfn); + rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg; + /* * Use 64K pages if possible; otherwise, on 64K page kernels, * we need to transfer 4 more bits from guest real to host real addr. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index ce437b98477e4c..f026b062c0ed22 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -537,7 +537,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_GUEST; int relocated; int page_found = 0; - struct kvmppc_pte pte; + struct kvmppc_pte pte = { 0 }; bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; From 3762d45aa71d546cc0afa29da2f1cd360c099484 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 13 Apr 2017 17:04:20 +1000 Subject: [PATCH 195/227] KVM: PPC: Align the table size to system page size At the moment the userspace can request a table smaller than a page size and this value will be stored as kvmppc_spapr_tce_table::size. However the actual allocated size will still be aligned to the system page size as alloc_page() is used there. This aligns the table size up to the system page size. It should not change the existing behaviour but when the in-kernel TCE acceleration patchset reaches the upstream kernel, this will allow small TCE tables to be accelerated as well: the PCI IODA iommu_table allocator already aligns the size and, without this patch, an IOMMU group won't attach to LIOBN due to the mismatching table size.
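To make the new expression concrete (a standalone sketch, not from the patch): with 8-byte TCEs, PAGE_SIZE >> 3 is the number of TCE entries that fit in one system page, so _ALIGN_UP(args->size, PAGE_SIZE >> 3) rounds the window size, counted in TCE entries, up to a whole number of backing pages. Assuming 64K pages:

#include <stdio.h>

/* power-of-two round-up, same effect as the kernel's _ALIGN_UP() */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long page_size = 65536;              /* assumed 64K system pages */
	unsigned long tces_per_page = page_size >> 3; /* 8192 entries of 8 bytes each */
	unsigned long requested = 512;                /* window size in TCE entries */

	/* a request for 512 entries is rounded up to one full page worth of TCEs */
	printf("%lu -> %lu entries\n", requested, ALIGN_UP(requested, tces_per_page));
	return 0;
}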
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 3e26cd4979f936..ab20a47a8821ca 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -164,7 +164,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - size = args->size; + size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { From 4898d3f49b5b156c33f0ae0f49ede417ab86195e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:51 +1100 Subject: [PATCH 196/227] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number This adds a capability number for in-kernel support for VFIO on the SPAPR platform. The capability will tell the user space whether in-kernel handlers of H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space must not attempt allocating a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests will not be passed to the user space, which is the desired action in a situation like that. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6180ea50e9ef01..7b488eae61b820 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -892,6 +892,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MIPS_64BIT 139 #define KVM_CAP_S390_GS 140 #define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 #ifdef KVM_CAP_IRQ_ROUTING From e91aa8e6ecd5e8193df6dec109f9d5deda5c7317 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:52 +1100 Subject: [PATCH 197/227] KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently It does not make much sense to have KVM in book3s-64 and not to have IOMMU bits for PCI pass through support as it costs little and allows VFIO to function on book3s KVM. Having IOMMU_API always enabled makes it unnecessary to have a lot of "#ifdef IOMMU_API" in arch/powerpc/kvm/book3s_64_vio*. With those ifdefs we could have only user space emulated devices accelerated (but not VFIO), which does not seem to be very useful. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 029be26b5a17ed..65a471de96de2b 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -67,6 +67,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. From 503bfcbe18576a79be0bc5173b23b530845e704a Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:53 +1100 Subject: [PATCH 198/227] KVM: PPC: Pass kvm* to kvmppc_find_table() The guest view TCE tables are per KVM anyway (not per VCPU) so pass kvm* there. This will be used in the following patches where we will be attaching VFIO containers to LIOBNs via ioctl() to KVM (rather than to VCPU).
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_64_vio.c | 7 ++++--- arch/powerpc/kvm/book3s_64_vio_hv.c | 13 +++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bbecec4e753a96..2e66800a870b62 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -177,7 +177,7 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( - struct kvm_vcpu *vcpu, unsigned long liobn); + struct kvm *kvm, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages); extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index ab20a47a8821ca..d507d94e020c52 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -214,12 +214,13 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -247,7 +248,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, u64 __user *tces; u64 tce; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -301,7 +302,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, struct kvmppc_spapr_tce_table *stt; long i, ret; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e4c4ea973e5789..918af76ab2b6bf 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -48,10 +48,9 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, unsigned long liobn) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) @@ -182,12 +181,13 @@ EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -240,7 +240,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -301,7 +301,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, struct 
kvmppc_spapr_tce_table *stt; long i, ret; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -322,12 +322,13 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; unsigned long idx; struct page *page; u64 *tbl; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; From da6f59e19233efdda58f196afbae8e05f6030d7f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:54 +1100 Subject: [PATCH 199/227] KVM: PPC: Use preregistered memory API to access TCE list VFIO on sPAPR already implements guest memory pre-registration when the entire guest RAM gets pinned. This can be used to translate the physical address of a guest page containing the TCE list from H_PUT_TCE_INDIRECT. This makes use of the pre-registrered memory API to access TCE list pages in order to avoid unnecessary locking on the KVM memory reverse map as we know that all of guest memory is pinned and we have a flat array mapping GPA to HPA which makes it simpler and quicker to index into that array (even with looking up the kernel page tables in vmalloc_to_phys) than it is to find the memslot, lock the rmap entry, look up the user page tables, and unlock the rmap entry. Note that the rmap pointer is initialized to NULL where declared (not in this patch). If a requested chunk of memory has not been preregistered, this will fall back to non-preregistered case and lock rmap. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 58 +++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 918af76ab2b6bf..0f145fc7a3a567 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -239,6 +239,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, long i, ret = H_SUCCESS; unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; + bool prereg = false; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -259,23 +260,47 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) return ret; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; + if (mm_iommu_preregistered(vcpu->kvm->mm)) { + /* + * We get here if guest memory was pre-registered which + * is normally VFIO case and gpa->hpa translation does not + * depend on hpt. + */ + struct mm_iommu_table_group_mem_t *mem; - rmap = (void *) vmalloc_to_phys(rmap); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + return H_TOO_HARD; - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. 
- */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; + mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); + if (mem) + prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0; + } + + if (!prereg) { + /* + * This is usually a case of a guest with emulated devices only + * when TCE list is not in preregistered memory. + * We do not require memory to be preregistered in this case + * so lock rmap and do __find_linux_pte_or_hugepte(). + */ + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } } for (i = 0; i < npages; ++i) { @@ -289,7 +314,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } unlock_exit: - unlock_rmap(rmap); + if (rmap) + unlock_rmap(rmap); return ret; } From b1af23d836f811137d504d14d4cbdd01929dec34 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:55 +1100 Subject: [PATCH 200/227] KVM: PPC: iommu: Unify TCE checking This reworks helpers for checking TCE update parameters in way they can be used in KVM. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/iommu.h | 20 ++++++++++++---- arch/powerpc/include/asm/kvm_ppc.h | 6 +++-- arch/powerpc/kernel/iommu.c | 37 ++++++++++------------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 31 ++++++------------------ 4 files changed, 39 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d96142572e6dbc..8a8ce220d7d04a 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -296,11 +296,21 @@ static inline void iommu_restore(void) #endif /* The API to support IOMMU operations for VFIO */ -extern int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages); -extern int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce); +extern int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages); +extern int iommu_tce_check_gpa(unsigned long page_shift, + unsigned long gpa); + +#define iommu_tce_clear_param_check(tbl, ioba, tce_value, npages) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), (npages)) || (tce_value)) +#define iommu_tce_put_param_check(tbl, ioba, gpa) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), 1) || \ + iommu_tce_check_gpa((tbl)->it_page_shift, (gpa))) extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2e66800a870b62..4d079a29eae241 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -178,8 +178,10 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm 
*kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( struct kvm *kvm, unsigned long liobn); -extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages); +#define kvmppc_ioba_validate(stt, ioba, npages) \ + (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ + (stt)->size, (ioba), (npages)) ? \ + H_PARAMETER : H_SUCCESS) extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, unsigned long tce); extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5a3231fedf089e..f2b724cd9e64d8 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -963,47 +963,36 @@ void iommu_flush_tce(struct iommu_table *tbl) } EXPORT_SYMBOL_GPL(iommu_flush_tce); -int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages) +int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages) { - /* tbl->it_ops->clear() does not support any value but 0 */ - if (tce_value) - return -EINVAL; + unsigned long mask = (1UL << page_shift) - 1; - if (ioba & ~IOMMU_PAGE_MASK(tbl)) + if (ioba & mask) return -EINVAL; - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) + ioba >>= page_shift; + if (ioba < offset) return -EINVAL; - if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + if ((ioba + 1) > (offset + size)) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); -int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce) +int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) { - if (tce & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; - - if (ioba & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; - - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) - return -EINVAL; + unsigned long mask = (1UL << page_shift) - 1; - if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + if (gpa & mask) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0f145fc7a3a567..440d3ab5dc323c 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -61,27 +61,6 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvmppc_find_table); -/* - * Validates IO address. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ -long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages) -{ - unsigned long mask = (1ULL << stt->page_shift) - 1; - unsigned long idx = ioba >> stt->page_shift; - - if ((ioba & mask) || (idx < stt->offset) || - (idx - stt->offset + npages > stt->size) || - (idx + npages < idx)) - return H_PARAMETER; - - return H_SUCCESS; -} -EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); - /* * Validates TCE address. * At the moment flags and page mask are validated. 
@@ -95,10 +74,14 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); - unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); + + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; - if (tce & mask) + if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; return H_SUCCESS; From 121f80ba68f1a5779a36d7b3247206e60e0a7418 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:56 +1100 Subject: [PATCH 201/227] KVM: PPC: VFIO: Add in-kernel acceleration for VFIO This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests targeted at an IOMMU TCE table used for VFIO without passing them to user space, which saves time on switching to user space and back. This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM. KVM tries to handle a TCE request in real mode; if that fails, it passes the request to virtual mode to complete the operation. If the virtual mode handler fails as well, the request is passed to the user space; this is not expected to happen though. To avoid dealing with page use counters (which is tricky in real mode), this only accelerates SPAPR TCE IOMMU v2 clients which are required to pre-register the userspace memory. The very first TCE request will be handled in the VFIO SPAPR TCE driver anyway as the userspace view of the TCE table (iommu_table::it_userspace) is not allocated till the very first mapping happens and we cannot call vmalloc in real mode. If we fail to update a hardware IOMMU table for an unexpected reason, we just clear it and move on as there is nothing really we can do about it - for example, if we hot plug a VFIO device to a guest, existing TCE tables will be mirrored automatically to the hardware and there is no interface to report to the guest about possible failures. This adds a new attribute - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE - to the VFIO KVM device. It takes a VFIO group fd and an SPAPR TCE table fd and associates a physical IOMMU table with the SPAPR TCE table (which is a guest view of the hardware IOMMU table). The iommu_table object is cached and referenced so we do not have to look it up in real mode. This does not implement the UNSET counterpart as there is no use for it - once the acceleration is enabled, the existing userspace won't disable it unless a VFIO container is destroyed; this adds the necessary cleanup to the KVM_DEV_VFIO_GROUP_DEL handler. This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user space. This adds a real mode version of WARN_ON_ONCE() as the generic version causes problems with rcu_sched. Since we are testing what vmalloc_to_phys() returns in the code, this also adds a check for the already existing vmalloc_to_phys() call in kvmppc_rm_h_put_tce_indirect(). This finally makes use of vfio_external_user_iommu_id() which was introduced quite some time ago and was considered for removal. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on a 10Gb network (Chelsio CXGB3 10Gb ethernet card).
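For orientation, an illustrative userspace sketch (not part of the patch) of how a VMM would wire a VFIO group and a TCE table together through the new attribute; vfio_dev_fd is assumed to come from KVM_CREATE_DEVICE with KVM_DEV_TYPE_VFIO, groupfd from opening the /dev/vfio group node, and tablefd from KVM_CREATE_SPAPR_TCE_64:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int attach_tce_table_to_group(int vfio_dev_fd, int groupfd, int tablefd)
{
	struct kvm_vfio_spapr_tce param = {
		.groupfd = groupfd,	/* fd of /dev/vfio/<group> */
		.tablefd = tablefd,	/* fd returned by KVM_CREATE_SPAPR_TCE_64 */
	};
	struct kvm_device_attr attr = {
		.group = KVM_DEV_VFIO_GROUP,
		.attr  = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
		.addr  = (__u64)(unsigned long)&param,
	};

	/* associates the hardware IOMMU table with the guest-visible TCE table */
	return ioctl(vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
}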
Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/devices/vfio.txt | 18 +- arch/powerpc/include/asm/kvm_host.h | 8 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_vio.c | 306 ++++++++++++++++++++- arch/powerpc/kvm/book3s_64_vio_hv.c | 201 +++++++++++++- arch/powerpc/kvm/powerpc.c | 2 + include/uapi/linux/kvm.h | 6 + virt/kvm/vfio.c | 105 +++++++ 8 files changed, 645 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740c67ca13..528c77c8022c66 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,21 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f3ac09cbfe01e..77c60826d1456c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -188,6 +188,13 @@ struct kvmppc_pginfo { atomic_t refcnt; }; +struct kvmppc_spapr_tce_iommu_table { + struct rcu_head rcu; + struct list_head next; + struct iommu_table *tbl; + struct kref kref; +}; + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -196,6 +203,7 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* window size in pages */ + struct list_head iommu_tables; struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4d079a29eae241..5885d327c025e3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -173,6 +173,10 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp); +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d507d94e020c52..a160c14304eba2 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -40,6 +42,7 @@ #include #include #include +#include static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { @@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) return ret; } +static void 
kvm_spapr_tce_iommu_table_free(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, + struct kvmppc_spapr_tce_iommu_table, rcu); + + iommu_tce_table_put(stit->tbl); + + kfree(stit); +} + +static void kvm_spapr_tce_liobn_put(struct kref *kref) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref, + struct kvmppc_spapr_tce_iommu_table, kref); + + list_del_rcu(&stit->next); + + call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); +} + +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) +{ + int i; + struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; + struct iommu_table_group *table_group = NULL; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + continue; + + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] != stit->tbl) + continue; + + kref_put(&stit->kref, kvm_spapr_tce_liobn_put); + return; + } + } + } +} + +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + bool found = false; + struct iommu_table *tbl = NULL; + struct iommu_table_group *table_group; + long i; + struct kvmppc_spapr_tce_iommu_table *stit; + struct fd f; + + f = fdget(tablefd); + if (!f.file) + return -EBADF; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt == f.file->private_data) { + found = true; + break; + } + } + + fdput(f); + + if (!found) + return -EINVAL; + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + return -EFAULT; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbltmp = table_group->tables[i]; + + if (!tbltmp) + continue; + /* + * Make sure hardware table parameters are exactly the same; + * this is used in the TCE handlers where boundary checks + * use only the first attached table. + */ + if ((tbltmp->it_page_shift == stt->page_shift) && + (tbltmp->it_offset == stt->offset) && + (tbltmp->it_size == stt->size)) { + /* + * Reference the table to avoid races with + * add/remove DMA windows. + */ + tbl = iommu_tce_table_get(tbltmp); + break; + } + } + if (!tbl) + return -EINVAL; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + if (tbl != stit->tbl) + continue; + + if (!kref_get_unless_zero(&stit->kref)) { + /* stit is being destroyed */ + iommu_tce_table_put(tbl); + return -ENOTTY; + } + /* + * The table is already known to this KVM, we just increased + * its KVM reference counter and can return. 
+ */ + return 0; + } + + stit = kzalloc(sizeof(*stit), GFP_KERNEL); + if (!stit) { + iommu_tce_table_put(tbl); + return -ENOMEM; + } + + stit->tbl = tbl; + kref_init(&stit->kref); + + list_add_rcu(&stit->next, &stt->iommu_tables); + + return 0; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, @@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; list_del_rcu(&stt->list); + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + WARN_ON(!kref_read(&stit->kref)); + while (1) { + if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put)) + break; + } + } + kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( @@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + INIT_LIST_HEAD_RCU(&stt->iommu_tables); for (i = 0; i < npages; i++) { stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -211,11 +355,101 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return ret; } +static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg(tbl, entry, &hpa, &dir); +} + +static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + return H_HARDWARE; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret != H_SUCCESS) + iommu_tce_xchg(tbl, entry, &hpa, &dir); + + return ret; +} + +long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ + return H_TOO_HARD; + + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + return H_HARDWARE; + + if (mm_iommu_mapped_inc(mem)) + return H_CLOSED; + + ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + if (WARN_ON_ONCE(ret)) { + mm_iommu_mapped_dec(mem); + return H_HARDWARE; + } + + if (dir != DMA_NONE) + kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; - long ret; + long ret, idx; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, 
ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -232,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + } else { + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, + entry, ua, dir); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -247,6 +509,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long entry, ua = 0; u64 __user *tces; u64 tce; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -285,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -301,6 +584,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -314,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 440d3ab5dc323c..eda0a8f6fae8eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,31 @@ #include #include +#ifdef CONFIG_BUG + +#define WARN_ON_ONCE_RM(condition) ({ \ + static bool __section(.data.unlikely) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ + __stringify(condition), \ + __func__, __LINE__); \ + dump_stack(); \ + } \ + unlikely(__ret_warn_once); \ +}) + +#else + +#define WARN_ON_ONCE_RM(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) + +#endif + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) /* @@ -161,11 +186,117 @@ long 
kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); +} + +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret) + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + + return ret; +} + +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + return H_TOO_HARD; + + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + return H_HARDWARE; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) + return H_CLOSED; + + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + if (ret) { + mm_iommu_mapped_dec(mem); + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + } + + if (dir != DMA_NONE) + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; long ret; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -182,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + else + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry, ua, dir); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; 
+ + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -223,6 +379,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; bool prereg = false; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -270,6 +427,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); + if (WARN_ON_ONCE_RM(!rmap)) + return H_HARDWARE; /* * Synchronize with the MMU notifier callbacks in @@ -293,6 +452,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + ua = 0; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -309,6 +489,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -322,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6c7244879bfe8c..cf725c580fc577 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -534,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: + /* fallthrough */ + case KVM_CAP_SPAPR_TCE_VFIO: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7b488eae61b820..3c168b6fd74b22 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1097,6 +1097,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, @@ -1118,6 +1119,11 @@ enum kvm_device_type { KVM_DEV_TYPE_MAX, }; +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + /* * ioctls for VM fds */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d32f239eb47133..37d9118fd84be0 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -20,6 +20,10 @@ #include #include "vfio.h" +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group *vfio_group; @@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) return ret > 0; } +#ifdef 
CONFIG_SPAPR_TCE_IOMMU +static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +{ + int (*fn)(struct vfio_group *); + int ret = -EINVAL; + + fn = symbol_get(vfio_external_user_iommu_id); + if (!fn) + return ret; + + ret = fn(vfio_group); + + symbol_put(vfio_external_user_iommu_id); + + return ret; +} + +static struct iommu_group *kvm_vfio_group_get_iommu_group( + struct vfio_group *group) +{ + int group_id = kvm_vfio_external_user_iommu_id(group); + + if (group_id < 0) + return NULL; + + return iommu_group_get_by_id(group_id); +} + +static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, + struct vfio_group *vfio_group) +{ + struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + + if (WARN_ON_ONCE(!grp)) + return; + + kvm_spapr_tce_release_iommu_group(kvm, grp); + iommu_group_put(grp); +} +#endif + /* * Groups can use the same or different IOMMU domains. If the same then * adding a new group may change the coherency of groups we've previously @@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) mutex_unlock(&kv->lock); +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group); +#endif kvm_vfio_group_set_kvm(vfio_group, NULL); kvm_vfio_group_put_external_user(vfio_group); @@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) kvm_vfio_update_coherency(dev); return ret; + +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + struct fd f; + struct iommu_group *grp; + + if (copy_from_user(¶m, (void __user *)arg, + sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; + + f = fdget(param.groupfd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + grp = kvm_vfio_group_get_iommu_group(vfio_group); + if (WARN_ON_ONCE(!grp)) { + kvm_vfio_group_put_external_user(vfio_group); + return -EIO; + } + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, + param.tablefd, grp); + break; + } + + mutex_unlock(&kv->lock); + + iommu_group_put(grp); + kvm_vfio_group_put_external_user(vfio_group); + + return ret; + } +#endif /* CONFIG_SPAPR_TCE_IOMMU */ } return -ENXIO; @@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_VFIO_GROUP_ADD: case KVM_DEV_VFIO_GROUP_DEL: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: +#endif return 0; } @@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) struct kvm_vfio_group *kvg, *tmp; list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); +#endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); list_del(&kvg->node); From feafd13c96d6e2998d3da81bb55db379fcf02499 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 5 Apr 2017 15:58:51 +0200 Subject: [PATCH 202/227] KVM: PPC: Book3S PR: Do not fail emulation with mtspr/mfspr for unknown SPRs According to the PowerISA 2.07, mtspr and mfspr should not always generate an illegal instruction exception when being used with an 
undefined SPR, but rather treat the instruction as a NOP or inject a privilege exception in some cases, too - depending on the SPR number. Also turn the printk here into a ratelimited print statement, so that the guest cannot flood the dmesg log of the host by issuing lots of illegal mtspr/mfspr instructions here. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_emulate.c | 34 +++++++++++++++++++++++-------- arch/powerpc/kvm/emulate.c | 8 ++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8359752b3efcc6..68d68983948e13 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -503,10 +503,18 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) break; unprivileged: default: - printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR write: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } break; } @@ -648,10 +656,20 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val break; default: unprivileged: - printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR read: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0 || + sprn == 4 || sprn == 5 || sprn == 6) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } + break; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b379146de55bf1..c873ffe553624f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -259,10 +259,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_MFSPR: emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_MTSPR: emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_TLBSYNC: From 5b560525add78e15fd5b75e5d03ea1ff988e6747 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Apr 2017 11:16:20 +0100 Subject: [PATCH 203/227] ARM: hyp-stub: Fix Thumb-2 compilation The assembler defaults to emitting the short form of ADR, leading to an out-of-range immediate. Using the wide version solves this issue. Fixes: bc845e4fbbbb ("ARM: KVM: Implement HVC_RESET_VECTORS stub hypercall in the init code") Reported-by: Arnd Bergmann Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kernel/hyp-stub.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index d8523cc47a5463..ec7e7377d423dd 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -125,7 +125,7 @@ ENTRY(__hyp_stub_install_secondary) * (see safe_svcmode_maskall).
*/ @ Now install the hypervisor stub: - adr r7, __hyp_stub_vectors + W(adr) r7, __hyp_stub_vectors mcr p15, 4, r7, c12, c0, 0 @ set hypervisor vector base (HVBAR) @ Disable all traps, so we don't get any nasty surprise From 1edb632133efb6226b6bef3e7d9fa8c7134ac4e2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Apr 2017 16:02:21 +0100 Subject: [PATCH 204/227] ARM: KVM: Fix idmap stub entry when running Thumb-2 code When entering the hyp stub implemented in the idmap, we try to be mindful of the fact that we could be running a Thumb-2 kernel by adding 1 to the address we compute. Unfortunately, the assembler also knows about this trick, and has already generated an address that has bit 0 set in the literal pool. Our superfluous correction ends up confusing the CPU entirely, as we now branch to the stub in ARM mode instead of Thumb, and on a possibly unaligned address for good measure. From that point, nothing really good happens. The obvious fix is to remove this stupid target PC correction. Fixes: 6bebcecb6c5b ("ARM: KVM: Allow the main HYP code to use the init hyp stub implementation") Reported-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/hyp/hyp-entry.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index a35baa81fd2323..95a2faefc070f5 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -144,7 +144,6 @@ hyp_hvc: ldr r1, [r1] ldr ip, =__kvm_handle_stub_hvc sub ip, ip, r1 -THUMB( add ip, ip, #1) pop {r1} bx ip From 71cb1bf66ef0d1d36ff3f04150450df9dd8af654 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Thu, 23 Feb 2017 13:39:17 -0500 Subject: [PATCH 205/227] s390/sclp: Detect KSS facility Let's detect the keyless subset facility. Signed-off-by: Farhan Ali Reviewed-by: Cornelia Huck Reviewed-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ace3bd315438e3..6f5167bc19286d 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -75,6 +75,7 @@ struct sclp_info { unsigned char has_pfmfi : 1; unsigned char has_ibs : 1; unsigned char has_skey : 1; + unsigned char has_kss : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 519ec1787117d7..efd84d1d178b09 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -40,7 +40,8 @@ struct read_info_sccb { u8 fac85; /* 85 */ u8 _pad_86[91 - 86]; /* 86-90 */ u8 flags; /* 91 */ - u8 _pad_92[99 - 92]; /* 92-98 */ + u8 _pad_92[98 - 92]; /* 92-97 */ + u8 fac98; /* 98 */ u8 hamaxpow; /* 99 */ u32 rnsize2; /* 100-103 */ u64 rnmax2; /* 104-111 */ @@ -99,6 +100,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_pfmfi = !!(sccb->fac117 & 0x40); sclp.has_ibs = !!(sccb->fac117 & 0x20); sclp.has_hvs = !!(sccb->fac119 & 0x80); + sclp.has_kss = !!(sccb->fac98 & 0x01); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; sclp.rnmax = sccb->rnmax ?
sccb->rnmax : sccb->rnmax2; From 730cd632c4e485b90f97fe998e300045da094938 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Fri, 24 Feb 2017 16:12:56 -0500 Subject: [PATCH 206/227] KVM: s390: Support keyless subset guest mode If the KSS facility is available on the machine, we also make it available for our KVM guests. The KSS facility bypasses storage key management as long as the guest does not issue a related instruction. When that happens, the control is returned to the host, which has to turn off KSS for a guest vcpu before retrying the instruction. Signed-off-by: Corey S. McQuay Signed-off-by: Farhan Ali Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/intercept.c | 3 +++ arch/s390/kvm/kvm-s390.c | 8 +++++++- arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/priv.c | 19 +++++++++++++------ arch/s390/kvm/vsie.c | 6 +++++- 7 files changed, 32 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 552c319483c63d..426614a882a9b1 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -122,6 +122,7 @@ struct esca_block { #define CPUSTAT_SLSR 0x00002000 #define CPUSTAT_ZARCH 0x00000800 #define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 #define CPUSTAT_SM 0x00000080 #define CPUSTAT_IBS 0x00000040 #define CPUSTAT_GED2 0x00000010 @@ -185,6 +186,7 @@ struct kvm_s390_sie_block { #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 */ __u16 ihcpu; /* 0x0052 */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 2c9ad251fa3381..bf9267930939fe 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -119,6 +119,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_CMMA 10 #define KVM_S390_VM_CPU_FEAT_PFMFI 11 #define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +#define KVM_S390_VM_CPU_FEAT_KSS 13 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index f5378f3361273b..a4752bf6b526d0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -426,6 +426,9 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case ICPT_PARTEXEC: rc = handle_partial_execution(vcpu); break; + case ICPT_KSS: + rc = kvm_s390_skey_check_enable(vcpu); + break; default: return -EOPNOTSUPP; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 11b7d66389916b..8771fef112a1be 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -300,6 +300,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); if (sclp.has_ibs) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + if (sclp.has_kss) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS); /* * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make * all skey handling functions read/set the skey from the PGSTE @@ -2034,7 +2036,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) | SDNXC; vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; - vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + + if (sclp.has_kss) + atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags); + else + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; if (vcpu->kvm->arch.use_cmma) { rc = 
kvm_s390_vcpu_setup_cmma(vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 455124fe06476c..55f5c8457d6d46 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -254,6 +254,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0ffe973535fab8..c03106c428cfa8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -198,18 +198,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) return 0; } -static int __skey_check_enable(struct kvm_vcpu *vcpu) +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) { int rc = 0; + struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; trace_kvm_s390_skey_related_inst(vcpu); - if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) + if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) && + !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)) return rc; rc = s390_enable_skey(); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); - if (!rc) - vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + if (!rc) { + if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS) + atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags); + else + sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | + ICTL_RRBE); + } return rc; } @@ -218,7 +225,7 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) int rc; vcpu->stat.instruction_storage_key++; - rc = __skey_check_enable(vcpu); + rc = kvm_s390_skey_check_enable(vcpu); if (rc) return rc; if (sclp.has_skey) { @@ -916,7 +923,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) { - int rc = __skey_check_enable(vcpu); + int rc = kvm_s390_skey_check_enable(vcpu); if (rc) return rc; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 025b1f2e17a9d4..4719ecb9ab423b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -117,6 +117,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) newflags |= cpuflags & CPUSTAT_SM; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) newflags |= cpuflags & CPUSTAT_IBS; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS)) + newflags |= cpuflags & CPUSTAT_KSS; atomic_set(&scb_s->cpuflags, newflags); return 0; @@ -289,7 +291,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * bits. Therefore we cannot provide interpretation and would later * have to provide own emulation handlers. */ - scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS)) + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) From fe0e80befd4d3a62d40f24b98b17483ea00ef2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 10 Mar 2017 12:47:13 +0100 Subject: [PATCH 207/227] KVM: VMX: drop vmm_exclusive module parameter vmm_exclusive=0 leads to KVM setting X86_CR4_VMXE always and calling VMXON only when the vcpu is loaded. X86_CR4_VMXE is used as an indication in cpu_emergency_vmxoff() (called on kdump) if VMXOFF has to be called. This is obviously not the case if both are used independtly. 
Calling VMXOFF without a previous VMXON will result in an exception. In addition, X86_CR4_VMXE is used as a means to test if VMX is already in use by another VMM in hardware_enable(). So there can't really be co-existence. If the other VMM is prepared for co-existence and does a similar check, only one VMM can exist. If the other VMM is not prepared and blindly sets/clears X86_CR4_VMXE, we will get inconsistencies with X86_CR4_VMXE. As we also had bug reports related to clearing of vmcs with vmm_exclusive=0 this seems to be pretty much untested. So let's better drop it. While at it, directly move setting/clearing X86_CR4_VMXE into kvm_cpu_vmxon/off. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 837f6dd1ae9c9c..c1a12b94e1fd96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -910,8 +907,6 @@ static void nested_release_page_clean(struct page *page) static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -2231,15 +2226,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2317,11 +2307,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_vcpu_pi_put(vcpu); __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); @@ -3416,6 +3401,7 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); asm volatile (ASM_VMX_VMXON_RAX @@ -3458,12 +3444,8 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - cr4_set_bits(X86_CR4_VMXE); - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); native_store_gdt(this_cpu_ptr(&host_gdt)); @@ -3489,15 +3471,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); } static void hardware_disable(void) { - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - cr4_clear_bits(X86_CR4_VMXE); + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@
-9170,11 +9150,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - if (!vmm_exclusive) - kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); From db2336a80489e7c3c7728cefd9be58fac5ecfb39 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Mon, 20 Mar 2017 01:16:28 -0700 Subject: [PATCH 208/227] KVM: x86: virtualize cpuid faulting Hardware support for faulting on the cpuid instruction is not required to emulate it, because cpuid triggers a VM exit anyways. KVM handles the relevant MSRs (MSR_PLATFORM_INFO and MSR_MISC_FEATURES_ENABLE) and upon a cpuid-induced VM exit checks the cpuid faulting state and the CPL. kvm_require_cpl is even kind enough to inject the GP fault for us. Signed-off-by: Kyle Huey Reviewed-by: David Matlack [Return "1" from kvm_emulate_cpuid, it's not void. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/cpuid.c | 3 +++ arch/x86/kvm/cpuid.h | 11 +++++++++++ arch/x86/kvm/emulate.c | 7 +++++++ arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++++++ 5 files changed, 49 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2cc5ec7cc6f55d..f5c942edbc86c0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -611,6 +611,8 @@ struct kvm_vcpu_arch { unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; + u64 msr_platform_info; + u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efde6cc5087518..a181ae76c71ce1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -876,6 +876,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; + if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) + return 1; + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eeabe..a6fd40aade7cbf 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -205,4 +205,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; +} + +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45c7306c8780b2..6a2ea945d01f12 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3854,6 +3854,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; + u64 msr = 0; + + ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); + if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + ctxt->ops->cpl(ctxt)) { + return emulate_gp(ctxt, 0); + } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34bf64fb4dea82..49a69c0a0d5077 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1007,6 +1007,8 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, 
MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, }; static unsigned num_emulated_msrs; @@ -2314,6 +2316,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.osvw.status = data; break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated || + data & ~MSR_PLATFORM_INFO_CPUID_FAULT || + (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && + cpuid_fault_enabled(vcpu))) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !supports_cpuid_fault(vcpu))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -2529,6 +2546,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.osvw.status; break; + case MSR_PLATFORM_INFO: + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -7710,6 +7733,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.msr_misc_features_enables = 0; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); From 668fffa3f838edfcb1679f842f7ef1afa61c3e9a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 21 Apr 2017 12:27:17 +0200 Subject: [PATCH 209/227] kvm: better MWAIT emulation for guests Guests that are heavy on futexes end up IPI'ing each other a lot. That can lead to significant slowdowns and latency increase for those guests when running within KVM. If only a single guest is needed on a host, we have a lot of spare host CPU time we can throw at the problem. Modern CPUs implement a feature called "MWAIT" which allows guests to wake up sleeping remote CPUs without an IPI - thus without an exit - at the expense of never going out of guest context. The decision whether this is something sensible to use should be up to the VM admin, so to user space. We can however allow MWAIT execution on systems that support it properly hardware wise. This patch adds a CAP to user space and a KVM cpuid leaf to indicate availability of native MWAIT execution. With that enabled, the worst a guest can do is waste as many cycles as a "jmp ." would do, so it's not a privilege problem. We consciously do *not* expose the feature in our CPUID bitmap, as most people will want to benefit from sleeping vCPUs to allow for over commit. Reported-by: "Gabriel L. Somlo" Signed-off-by: Michael S. Tsirkin [agraf: fix amd, change commit message] Signed-off-by: Alexander Graf Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 ++++++++ arch/x86/kvm/svm.c | 7 ++++-- arch/x86/kvm/vmx.c | 6 ++++-- arch/x86/kvm/x86.c | 3 +++ arch/x86/kvm/x86.h | 36 +++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 6 files changed, 58 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e60be91d803662..dc674c2b8b31f8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4111,3 +4111,12 @@ reserved. 
2: MIPS64 or microMIPS64 with access to all address segments. Both registers and addresses are 64-bits wide. It will be possible to run 64-bit or 32-bit guest code. + +8.8 KVM_CAP_X86_GUEST_MWAIT + +Architectures: x86 + +This capability indicates that guest using memory monotoring instructions +(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time +spent while virtual CPU is halted in this way will then be accounted for as +guest running time on the host (as opposed to e.g. HLT). diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1b203abf76e1e6..c41f03e5090ada 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1198,10 +1198,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c1a12b94e1fd96..a4ef6371810106 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3527,11 +3527,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49a69c0a0d5077..2f9fe6bf709160 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2687,6 +2687,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a53..6120670749055d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 +1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include +#include #include #include #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. 
+ */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3c168b6fd74b22..e43906b95d9f64 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -893,6 +893,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_GS 140 #define KVM_CAP_S390_AIS 141 #define KVM_CAP_SPAPR_TCE_VFIO 142 +#define KVM_CAP_X86_GUEST_MWAIT 143 #ifdef KVM_CAP_IRQ_ROUTING From e891a32e7ae0c6ba57959799c0ac84681616eeef Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 17 Apr 2017 12:51:37 -0300 Subject: [PATCH 210/227] KVM: x86: remove irq disablement around KVM_SET_CLOCK/KVM_GET_CLOCK The disablement of interrupts at KVM_SET_CLOCK/KVM_GET_CLOCK attempts to disable software suspend from causing "non atomic behaviour" of the operation: Add a helper function to compute the kernel time and convert nanoseconds back to CPU specific cycles. Note that these must not be called in preemptible context, as that would mean the kernel could enter software suspend state, which would cause non-atomic operation. However, assume the kernel can enter software suspend at the following 2 points: ktime_get_ts(&ts); 1. hypothetical_ktime_get_ts(&ts) monotonic_to_bootbased(&ts); 2. monotonic_to_bootbased() should be correct relative to a ktime_get_ts(&ts) performed after point 1 (that is after resuming from software suspend), hypothetical_ktime_get_ts() Therefore it is also correct for the ktime_get_ts(&ts) before point 1, which is ktime_get_ts(&ts) = hypothetical_ktime_get_ts(&ts) + time-to-execute-suspend-code Note CLOCK_MONOTONIC does not count during suspension. So remove the irq disablement, which causes the following warning on -RT kernels: With this reasoning, and the -RT bug that the irq disablement causes (because spin_lock is now a sleeping lock), remove the IRQ protection as it causes: [ 1064.668109] in_atomic(): 0, irqs_disabled(): 1, pid: 15296, name:m [ 1064.668110] INFO: lockdep is turned off. [ 1064.668110] irq event stamp: 0 [ 1064.668112] hardirqs last enabled at (0): [< (null)>] ) [ 1064.668116] hardirqs last disabled at (0): [] c0 [ 1064.668118] softirqs last enabled at (0): [] c0 [ 1064.668118] softirqs last disabled at (0): [< (null)>] ) [ 1064.668121] CPU: 13 PID: 15296 Comm: qemu-kvm Not tainted 3.10.0-1 [ 1064.668121] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 5 [ 1064.668123] ffff8c1796b88000 00000000afe7344c ffff8c179abf3c68 f3 [ 1064.668125] ffff8c179abf3c90 ffffffff930ccb3d ffff8c1b992b3610 f0 [ 1064.668126] 00007ffc1a26fbc0 ffff8c179abf3cb0 ffffffff9375f694 f0 [ 1064.668126] Call Trace: [ 1064.668132] [] dump_stack+0x19/0x1b [ 1064.668135] [] __might_sleep+0x12d/0x1f0 [ 1064.668138] [] rt_spin_lock+0x24/0x60 [ 1064.668155] [] __get_kvmclock_ns+0x36/0x110 [k] [ 1064.668159] [] ? futex_wait_queue_me+0x103/0x10 [ 1064.668171] [] kvm_arch_vm_ioctl+0xa2/0xd70 [k] [ 1064.668173] [] ? futex_wait+0x1ac/0x2a0 v2: notice get_kvmclock_ns with the same problem (Pankaj). v3: remove useless helper function (Pankaj). 
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f9fe6bf709160..f68c5b2ba627da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1759,7 +1759,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } -static u64 __get_kvmclock_ns(struct kvm *kvm) +u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; @@ -1780,18 +1780,6 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) return __pvclock_read_cycles(&hv_clock, rdtsc()); } -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - static void kvm_setup_pvclock_page(struct kvm_vcpu *v) { struct kvm_vcpu_arch *vcpu = &v->arch; @@ -4202,10 +4190,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); kvm_gen_update_masterclock(kvm); break; } @@ -4213,11 +4199,9 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; From e000b8e0968dd7bfa09c444607ce1e48e57aafd3 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 20 Mar 2017 09:57:42 -0400 Subject: [PATCH 211/227] s390: kvm: Cpu model support for msa6, msa7 and msa8 msa6 and msa7 require no changes. msa8 adds kma instruction and feature area. Signed-off-by: Jason J. Herne Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/devices/vm.txt | 3 ++- arch/s390/include/uapi/asm/kvm.h | 3 ++- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/s390/tools/gen_facilities.c | 1 + tools/arch/s390/include/uapi/asm/kvm.h | 3 ++- 5 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index b6cda49f2ba418..575ccb022aacd9 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -140,7 +140,8 @@ struct kvm_s390_vm_cpu_subfunc { u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 - u8 reserved[1824]; # reserved for future instructions + u8 kma[16]; # valid with Message-Security-Assist-Extension 8 + u8 reserved[1808]; # reserved for future instructions }; Parameters: address of a buffer to load the subfunction blocks from. 
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index bf9267930939fe..3dd2a1d308dd0b 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -141,7 +141,8 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kmo[16]; /* with MSA4 */ __u8 pcc[16]; /* with MSA4 */ __u8 ppno[16]; /* with MSA5 */ - __u8 reserved[1824]; + __u8 kma[16]; /* with MSA8 */ + __u8 reserved[1808]; }; /* kvm attributes for crypto */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8771fef112a1be..7eb1275cc26589 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -276,6 +276,10 @@ static void kvm_s390_cpu_feat_init(void) __cpacf_query(CPACF_PPNO, (cpacf_mask_t *) kvm_s390_available_subfunc.ppno); + if (test_facility(146)) /* MSA8 */ + __cpacf_query(CPACF_KMA, (cpacf_mask_t *) + kvm_s390_available_subfunc.kma); + if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 0cf802de52a1d2..be63fbd699fd65 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -82,6 +82,7 @@ static struct facility_def facility_defs[] = { 78, /* enhanced-DAT 2 */ 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ + 146, /* msa extension 8 */ -1 /* END */ } }, diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index a2ffec4139ad1c..7f4fd65e920851 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -131,7 +131,8 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kmo[16]; /* with MSA4 */ __u8 pcc[16]; /* with MSA4 */ __u8 ppno[16]; /* with MSA5 */ - __u8 reserved[1824]; + __u8 kma[16]; /* with MSA8 */ + __u8 reserved[1808]; }; /* kvm attributes for crypto */ From 72875d8a4d92f6f37e051be522b2252fd49bd50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:19 +0200 Subject: [PATCH 212/227] KVM: add kvm_{test,clear}_request to replace {test,clear}_bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users were expected to use kvm_check_request() for testing and clearing, but request have expanded their use since then and some users want to only test or do a faster clear. Make sure that requests are not directly accessed with bit operations. Reviewed-by: Christian Borntraeger Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 8 ++++---- include/linux/kvm_host.h | 14 ++++++++++++-- 9 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 34e78a3ee9d79e..4144bfaef13772 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -982,7 +982,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) * check if any I/O interrupts are pending. 
*/ if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; } } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f026b062c0ed22..69a09444d46e66 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -349,7 +349,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f102616febc7b1..bcbeeb62dd13e9 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -344,7 +344,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; return EMULATE_DONE; case H_LOGICAL_CI_LOAD: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3c296c2eacf8b9..3eaac380997700 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -584,7 +584,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) * userspace, so clear the KVM_REQ_WATCHDOG request. */ if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) - clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + kvm_clear_request(KVM_REQ_WATCHDOG, vcpu); spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); nr_jiffies = watchdog_next_timeout(vcpu); @@ -695,7 +695,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); hard_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cf725c580fc577..1ee22a9100748f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -233,7 +233,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) case EV_HCALL_TOKEN(EV_IDLE): r = EV_SUCCESS; kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); break; default: r = EV_UNIMPLEMENTED; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7eb1275cc26589..4bafb0a0c8b5b7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2496,7 +2496,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) } /* nothing to do, just clear the request */ - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4ef6371810106..b003b8dfb20c6d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6299,7 +6299,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f68c5b2ba627da..2de54c20fa9e37 100644 --- a/arch/x86/kvm/x86.c +++ 
b/arch/x86/kvm/x86.c @@ -1753,7 +1753,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif @@ -7041,7 +7041,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7169,7 +7169,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -8382,7 +8382,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 397b7b5b1933b8..374fa92c765778 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1079,10 +1079,20 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) set_bit(req, &vcpu->requests); } +static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) +{ + return test_bit(req, &vcpu->requests); +} + +static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) +{ + clear_bit(req, &vcpu->requests); +} + static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); + if (kvm_test_request(req, vcpu)) { + kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's From 1bd2009e7334a9900c88f7016207619d2f86f884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:20 +0200 Subject: [PATCH 213/227] KVM: x86: always use kvm_make_request instead of set_bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2de54c20fa9e37..0936c3e2e51c13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2203,8 +2203,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) bool tmp = (msr == MSR_KVM_SYSTEM_TIME); if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = tmp; } @@ -2802,11 +2801,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2850,7 +2844,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); 
if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } From 75aaafb79f73516b69d5639ad30a72d72e75c8b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:21 +0200 Subject: [PATCH 214/227] KVM: remove #ifndef CONFIG_S390 around kvm_vcpu_wake_up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #ifndef was protecting a missing halt_wakeup stat, but that is no longer necessary. Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 357e67cba32eed..e5d52b46b531d4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2195,7 +2195,6 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -#ifndef CONFIG_S390 void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { struct swait_queue_head *wqp; @@ -2225,7 +2224,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); -#endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { From 930f7fd6da77ed9476a538345513460fd304aaf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:22 +0200 Subject: [PATCH 215/227] KVM: mark requests that do not need a wakeup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some operations must ensure that the guest is not running with stale data, but if the guest is halted, then the update can wait until another event happens. kvm_make_all_requests() currently doesn't wake up, so we can mark all requests used with it. First 8 bits were arbitrarily reserved for request numbers. Most uses of requests have the request type as a constant, so a compiler will optimize the '&'. An alternative would be to have an inline function that would return whether the request needs a wake-up or not, but I like this one better even though it might produce worse assembly. 
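For illustration, the encoding described above can be exercised in a stand-alone user-space sketch (not kernel code; the BIT()/GENMASK() expansions are written out by hand and only two request constants are shown):

	#include <stdio.h>

	#define KVM_REQUEST_MASK	0x00ffUL	/* GENMASK(7,0): the request number */
	#define KVM_REQUEST_NO_WAKEUP	(1UL << 8)	/* BIT(8): flag carried along in the constant */

	#define KVM_REQ_TLB_FLUSH	(0 | KVM_REQUEST_NO_WAKEUP)
	#define KVM_REQ_PENDING_TIMER	2

	int main(void)
	{
		/* The bit operations only ever use the low byte, and since the
		 * request is a constant the compiler folds the '&' away. */
		printf("TLB_FLUSH uses bit %lu, wakeup needed: %s\n",
		       KVM_REQ_TLB_FLUSH & KVM_REQUEST_MASK,
		       (KVM_REQ_TLB_FLUSH & KVM_REQUEST_NO_WAKEUP) ? "no" : "yes");
		printf("PENDING_TIMER uses bit %lu, wakeup needed: %s\n",
		       KVM_REQ_PENDING_TIMER & KVM_REQUEST_MASK,
		       (KVM_REQ_PENDING_TIMER & KVM_REQUEST_NO_WAKEUP) ? "no" : "yes");
		return 0;
	}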
Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 12 +++++++----- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de67ce64750173..49358f20d36f44 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 522e4f60976ee1..1c9458a7ec92ea 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f5c942edbc86c0..19219826bed6bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 374fa92c765778..a805ddcb7eb050 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -115,12 +115,14 @@ static inline bool is_error_page(struct page *page) return IS_ERR(page); } +#define KVM_REQUEST_MASK GENMASK(7,0) +#define KVM_REQUEST_NO_WAKEUP BIT(8) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_PENDING_TIMER 2 #define KVM_REQ_UNHALT 3 @@ -1076,17 +1078,17 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. 
*/ smp_wmb(); - set_bit(req, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { - return test_bit(req, &vcpu->requests); + return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { - clear_bit(req, &vcpu->requests); + clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) From 6c6e8360b34d59d2f687a1649e61173742dbc891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:23 +0200 Subject: [PATCH 216/227] KVM: perform a wake_up in kvm_make_all_cpus_request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want kvm_make_all_cpus_request() to be an optimized version of kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(vcpu, request); kvm_vcpu_kick(vcpu); } and kvm_vcpu_kick() wakes up the target vcpu. We know which requests do not need the wake up and use it to optimize the loop. Thanks to that, this patch doesn't change the behavior of current users (they all don't need the wake up) and only prepares for the future where the wake up is going to be needed. I think that most requests do not need the wake up, so we would flip the bit then. Later on, kvm_make_request() will take care of kicking too, using this bit to make the decision whether to kick or not. Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e5d52b46b531d4..3772f7dcc72d3f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,6 +186,9 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) /* Set ->requests bit before we read ->mode. */ smp_mb__after_atomic(); + if (!(req & KVM_REQUEST_NO_WAKEUP)) + kvm_vcpu_wake_up(vcpu); + if (cpus != NULL && cpu != -1 && cpu != me && kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) cpumask_set_cpu(cpu, cpus); From cde9af6e79046e12cd08d161139b1d5e57e9fbac Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 26 Apr 2017 22:32:24 +0200 Subject: [PATCH 217/227] KVM: add explicit barrier to kvm_vcpu_kick kvm_vcpu_kick() must issue a general memory barrier prior to reading vcpu->mode in order to ensure correctness of the mutual-exclusion memory barrier pattern used with vcpu->requests. While the cmpxchg called from kvm_vcpu_kick(): kvm_vcpu_kick kvm_arch_vcpu_should_kick kvm_vcpu_exiting_guest_mode cmpxchg implies general memory barriers before and after the operation, that implication is only valid when cmpxchg succeeds. We need an explicit barrier for when it fails, otherwise a VCPU thread on its entry path that reads zero for vcpu->requests does not exclude the possibility the requesting thread sees !IN_GUEST_MODE when it reads vcpu->mode. kvm_make_all_cpus_request already had a barrier, so we remove it, as now it would be redundant.
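For readers unfamiliar with the pattern, a user-space analogue of the two-sided ordering described above can be written with C11 atomics (an illustrative sketch only; the kernel itself relies on smp_mb()/cmpxchg and the real KVM code paths, not these helper names):

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE };

	_Atomic unsigned long requests;
	_Atomic int mode;

	/* Requesting side: publish the request, full barrier, then read mode. */
	bool make_request_then_check_mode(int req)
	{
		atomic_fetch_or(&requests, 1UL << req);
		atomic_thread_fence(memory_order_seq_cst);	/* pairs with the fence below */
		return atomic_load(&mode) == IN_GUEST_MODE;	/* true -> send the kick/IPI */
	}

	/* VCPU entry side: publish the mode, full barrier, then read requests. */
	bool enter_guest_unless_requests_pending(void)
	{
		atomic_store(&mode, IN_GUEST_MODE);
		atomic_thread_fence(memory_order_seq_cst);
		return atomic_load(&requests) == 0;		/* false -> service requests first */
	}

Because each side writes before its fence and reads after it, at least one side must observe the other: either the VCPU sees the pending request before entering the guest, or the requester sees IN_GUEST_MODE and kicks.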
Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 3 --- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0936c3e2e51c13..69fcee26f4dae2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6853,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a805ddcb7eb050..84c5396564f764 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -270,6 +270,12 @@ struct kvm_vcpu { static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { + /* + * The memory barrier ensures a previous write to vcpu->requests cannot + * be reordered with the read of vcpu->mode. It pairs with the general + * memory barrier following the write of vcpu->mode in VCPU RUN. + */ + smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3772f7dcc72d3f..1efb076430358f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,9 +183,6 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - /* Set ->requests bit before we read ->mode. */ - smp_mb__after_atomic(); - if (!(req & KVM_REQUEST_NO_WAKEUP)) kvm_vcpu_wake_up(vcpu); From 178f02ffafafc59d4d4b135242e5cc1515743680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 26 Apr 2017 22:32:26 +0200 Subject: [PATCH 218/227] KVM: return if kvm_vcpu_wake_up() did wake up the VCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to kick a VCPU that we have just woken up. 
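In caller terms, the change gives kvm_vcpu_kick() roughly the following shape (a paraphrased sketch that omits the cpu validity checks and get_cpu()/put_cpu() handling of the real function):

	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
	{
		if (kvm_vcpu_wake_up(vcpu))
			return;		/* the VCPU was blocked in halt; waking it is enough */

		/* Only a VCPU that is currently executing guest code still needs
		 * an IPI to force an exit and notice the pending request. */
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(vcpu->cpu);
	}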
Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84c5396564f764..f4a2c00092f890 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -690,7 +690,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1efb076430358f..632f7b3e198c46 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,8 +183,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - if (!(req & KVM_REQUEST_NO_WAKEUP)) - kvm_vcpu_wake_up(vcpu); + if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + continue; if (cpus != NULL && cpu != -1 && cpu != me && kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) @@ -2195,7 +2195,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { struct swait_queue_head *wqp; @@ -2203,8 +2203,10 @@ void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) if (swait_active(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; + return true; } + return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); @@ -2216,7 +2218,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int me; int cpu = vcpu->cpu; - kvm_vcpu_wake_up(vcpu); + if (kvm_vcpu_wake_up(vcpu)) + return; + me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) From 7a97cec26b94c909f4cbad2dc3186af3e457a522 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Apr 2017 14:33:43 +0200 Subject: [PATCH 219/227] KVM: mark requests that need synchronization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_make_all_requests() provides a synchronization that waits until all kicked VCPUs have acknowledged the kick. This is important for KVM_REQ_MMU_RELOAD as it prevents freeing while lockless paging is underway. This patch adds the synchronization property into all requests that are currently being used with kvm_make_all_requests() in order to preserve the current behavior and only introduce a new framework. Removing it from requests where it is not necessary is left for future patches. 
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 9 +++++---- virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++--- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 49358f20d36f44..3cd04d164c64db 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1c9458a7ec92ea..d239ae166c4e47 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19219826bed6bb..84c8489531bb7c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4a2c00092f890..a5bfffa8c8d439 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -117,14 +117,15 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) +#define KVM_REQUEST_WAIT BIT(9) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. 
*/ -#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNHALT 3 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 632f7b3e198c46..035bc51f656f99 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,24 @@ void vcpu_put(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(vcpu_put); +/* TODO: merge with kvm_arch_vcpu_should_kick */ +static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) +{ + int mode = kvm_vcpu_exiting_guest_mode(vcpu); + + /* + * We need to wait for the VCPU to reenable interrupts and get out of + * READING_SHADOW_PAGE_TABLES mode. + */ + if (req & KVM_REQUEST_WAIT) + return mode != OUTSIDE_GUEST_MODE; + + /* + * Need to kick a running VCPU, but otherwise there is nothing to do. + */ + return mode == IN_GUEST_MODE; +} + static void ack_flush(void *_completed) { } @@ -174,6 +192,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) int i, cpu, me; cpumask_var_t cpus; bool called = true; + bool wait = req & KVM_REQUEST_WAIT; struct kvm_vcpu *vcpu; zalloc_cpumask_var(&cpus, GFP_ATOMIC); @@ -187,13 +206,13 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) continue; if (cpus != NULL && cpu != -1 && cpu != me && - kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) + kvm_request_needs_ipi(vcpu, req)) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) - smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); else if (!cpumask_empty(cpus)) - smp_call_function_many(cpus, ack_flush, NULL, 1); + smp_call_function_many(cpus, ack_flush, NULL, wait); else called = false; put_cpu(); From 6ed071f051e12cf7baa1b69d3becb8f232fdfb7b Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Tue, 25 Apr 2017 16:42:44 +0200 Subject: [PATCH 220/227] KVM: x86: fix emulation of RSM and IRET instructions On AMD, the effect of set_nmi_mask called by emulate_iret_real and em_rsm on hflags is reverted later on in x86_emulate_instruction where hflags are overwritten with ctxt->emul_flags (the kvm_set_hflags call). This manifests as a hang when rebooting Windows VMs with QEMU, OVMF, and >1 vcpu. Instead of trying to merge ctxt->emul_flags into vcpu->arch.hflags after an instruction is emulated, this commit deletes emul_flags altogether and makes the emulator access vcpu->arch.hflags using two new accessors. This way all changes, on the emulator side as well as in functions called from the emulator and accessing vcpu state with emul_to_vcpu, are preserved. More details on the bug and its manifestation with Windows and OVMF: It's a KVM bug in the interaction between SMI/SMM and NMI, specific to AMD. I believe that the SMM part explains why we started seeing this only with OVMF. KVM masks and unmasks NMI when entering and leaving SMM. When KVM emulates the RSM instruction in em_rsm, the set_nmi_mask call doesn't stick because later on in x86_emulate_instruction we overwrite arch.hflags with ctxt->emul_flags, effectively reverting the effect of the set_nmi_mask call. The AMD-specific hflag of interest here is HF_NMI_MASK. 
When rebooting the system, Windows sends an NMI IPI to all but the current cpu to shut them down. Only after all of them are parked in HLT will the initiating cpu finish the restart. If NMI is masked, other cpus never get the memo and the initiating cpu spins forever, waiting for hal!HalpInterruptProcessorsStarted to drop. That's the symptom we observe. Fixes: a584539b24b8 ("KVM: x86: pass the whole hflags field to emulator and back") Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 4 +++- arch/x86/kvm/emulate.c | 16 +++++++++------- arch/x86/kvm/x86.c | 15 ++++++++++++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 3e8c287090e428..05596261577937 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,7 +293,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a2ea945d01f12..c25cfaf584e728 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -5323,6 +5323,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5337,6 +5338,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5370,7 +5372,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5399,7 +5401,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) 
&& (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5453,7 +5455,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69fcee26f4dae2..be2ade58edb9f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5207,6 +5207,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5246,6 +5256,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5298,7 +5310,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5702,8 +5713,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); From 70f3aac964ae2bc9a0a1d5d65a62e258591ade18 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 26 Apr 2017 08:53:46 -0700 Subject: [PATCH 221/227] kvm: nVMX: Remove superfluous VMX instruction fault checks According to the Intel SDM, "Certain exceptions have priority over VM exits. These include invalid-opcode exceptions, faults based on privilege level*, and general-protection exceptions that are based on checking I/O permission bits in the task-state segment (TSS)." There is no need to check for faulting conditions that the hardware has already checked. * These include faults generated by attempts to execute, in virtual-8086 mode, privileged instructions that are not recognized in that mode. 
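In practice that leaves guest CR4.VMXE as the only remaining software check for VMXON, because the processor's CR4.VMXE is always 1 in VMX non-root operation and hardware therefore cannot test the guest's view of it. A rough standalone sketch of the reduced check (illustration only, not the actual diff):

    #include <stdbool.h>
    #include <stdio.h>

    #define X86_CR4_VMXE (1ul << 13)

    /*
     * CPL, real-mode, virtual-8086-mode and CS.L violations already fault
     * before the VM exit, so only the guest's CR4.VMXE needs a software test.
     */
    static bool vmxon_allowed(unsigned long guest_cr4)
    {
            return (guest_cr4 & X86_CR4_VMXE) != 0;
    }

    int main(void)
    {
            printf("%d %d\n", vmxon_allowed(0), vmxon_allowed(X86_CR4_VMXE));
            return 0;
    }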
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 57 ++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 43 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b003b8dfb20c6d..75e6cc945be029 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7052,34 +7052,24 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; - struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. We test these now: + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7106,29 +7096,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - return 1; } @@ -7472,7 +7448,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &gva)) return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + /* _system ok, as hardware has verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_long_mode(vcpu) ? 
8 : 4), NULL); } @@ -7605,7 +7581,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &vmcs_gva)) return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + /* ok to use *_system, as hardware has verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, (void *)&to_vmx(vcpu)->nested.current_vmptr, sizeof(u64), &e)) { @@ -7638,11 +7614,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); From bcb85c887647c0f096b51ac2ee391ea792b14b9e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 24 Apr 2017 11:16:49 +0200 Subject: [PATCH 222/227] KVM: Documentation: remove VM mmap documentation Since commit 80f5b5e700fa9c ("KVM: remove vm mmap method"), the VM mmap handler is gone. Remove the corresponding documentation. Signed-off-by: Jann Horn Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f038f8cafa70fb..4029943887a30b 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -110,11 +110,7 @@ Type: system ioctl Parameters: machine type identifier (KVM_VM_*) Returns: a VM fd that can be used to control the new virtual machine. -The new VM has no virtual cpus and no memory. An mmap() of a VM fd -will access the virtual machine's physical address space; offset zero -corresponds to guest physical address zero. Use of mmap() on a VM fd -is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is -available. +The new VM has no virtual cpus and no memory. You probably want to use 0 as machine type. In order to create user controlled virtual machines on S390, check From 5c0aea0e8d98e38858fbb3a09870ed8487a01da2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 28 Apr 2017 17:06:20 +0200 Subject: [PATCH 223/227] KVM: x86: don't hold kvm->lock in KVM_SET_GSI_ROUTING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We needed the lock to avoid racing with creation of the irqchip on x86. As kvm_set_irq_routing() calls srcu_synchronize_expedited(), this lock might be held for a longer time. Let's introduce an arch specific callback to check if we can actually add irq routes. For x86, all we have to do is check if we have an irqchip in the kernel. We don't need kvm->lock at that point as the irqchip is marked as initialized only when actually fully created.
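The resulting flow, reduced to a standalone sketch (illustration only, names simplified from the patch): the ioctl asks the architecture whether routes can be set at all and rejects the request early, without taking kvm->lock.

    #include <stdbool.h>
    #include <stdio.h>

    enum irqchip_mode { IRQCHIP_NONE, IRQCHIP_KERNEL, IRQCHIP_SPLIT };

    struct vm { enum irqchip_mode irqchip_mode; };

    /* x86-style answer: routes only make sense once an irqchip exists */
    static bool can_set_irq_routing(struct vm *vm)
    {
            return vm->irqchip_mode != IRQCHIP_NONE;
    }

    static int set_gsi_routing(struct vm *vm)
    {
            if (!can_set_irq_routing(vm))
                    return -1;            /* rejected early, no lock taken */
            /* ... build and publish the new routing table ... */
            return 0;
    }

    int main(void)
    {
            struct vm vm = { .irqchip_mode = IRQCHIP_NONE };

            printf("before irqchip: %d\n", set_gsi_routing(&vm));
            vm.irqchip_mode = IRQCHIP_KERNEL;
            printf("after irqchip:  %d\n", set_gsi_routing(&vm));
            return 0;
    }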
Reported-by: Steve Rutherford Reviewed-by: Radim Krčmář Fixes: 1df6ddede10a ("KVM: x86: race between KVM_SET_GSI_ROUTING and KVM_CREATE_IRQCHIP") Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/irq_comm.c | 15 +++++++++------ arch/x86/kvm/x86.c | 11 +---------- include/linux/kvm_host.h | 1 + virt/kvm/irqchip.c | 5 +++++ virt/kvm/kvm_main.c | 5 ++--- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 84c8489531bb7c..f5bddf92faba81 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -728,7 +728,6 @@ struct kvm_hv { enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, - KVM_IRQCHIP_INIT_IN_PROGRESS, /* temporarily set during creation */ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 0edd22c3344c9e..d5005cc265217c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -111,7 +111,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return mode > KVM_IRQCHIP_INIT_IN_PROGRESS; + return mode != KVM_IRQCHIP_NONE; } void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4517a4c2ac3a1e..3cc3b2d130a0c2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -274,16 +274,19 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ - if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) - return -EINVAL; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. + */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be2ade58edb9f0..2fe9aa116288b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3919,14 +3919,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_empty_irq_routing(kvm); - if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); + if (r) goto split_irqchip_unlock; - } /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -4012,12 +4007,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto create_irqchip_unlock; } - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_default_irq_routing(kvm); if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); goto create_irqchip_unlock; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a5bfffa8c8d439..25cf258a1c9bf2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1018,6 +1018,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #define KVM_MAX_IRQ_ROUTES 1024 #endif +bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index cc30d01a56bee9..31e40c9e81df41 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -172,6 +172,11 @@ void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } +bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return true; +} + int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 035bc51f656f99..6281cc2446d59c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3075,6 +3075,8 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; + if (!kvm_arch_can_set_irq_routing(kvm)) + goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) @@ -3090,11 +3092,8 @@ static long kvm_vm_ioctl(struct file *filp, routing.nr * sizeof(*entries))) goto out_free_irq_routing; } - /* avoid races with KVM_CREATE_IRQCHIP on x86 */ - mutex_lock(&kvm->lock); r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); - mutex_unlock(&kvm->lock); out_free_irq_routing: vfree(entries); break; From ee5f7d79a80550179b258417442b7bdbccaf476a Mon Sep 17 00:00:00 2001 From: "Justin M. Forbes" Date: Tue, 11 Apr 2017 10:34:35 -0500 Subject: [PATCH 224/227] tools/kvm: fix top level makefile The top level tools/Makefile includes kvm_stat as a target in help, but the actual target is missing. Signed-off-by: Justin M. Forbes Signed-off-by: Paolo Bonzini --- tools/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 00caacd3ed928c..c8a90d01dd8ede 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -86,10 +86,13 @@ tmon: FORCE freefall: FORCE $(call descend,laptop/$@) +kvm_stat: FORCE + $(call descend,kvm/$@) + all: acpi cgroup cpupower gpio hv firewire lguest \ perf selftests turbostat usb \ virtio vm net x86_energy_perf_policy \ - tmon freefall objtool + tmon freefall objtool kvm_stat acpi_install: $(call descend,power/$(@:_install=),install) From 4e335d9e7ddbcf83d03e7fbe65797ebed2272c18 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 May 2017 16:20:18 +0200 Subject: [PATCH 225/227] Revert "KVM: Support vCPU-based gfn->hva cache" This reverts commit bbd6411513aa8ef3ea02abab61318daf87c1af1e. I've been sitting on this revert for too long and it unfortunately missed 4.11. It's also the reason why I haven't merged ring-based dirty tracking for 4.12. 
Using kvm_vcpu_memslots in kvm_gfn_to_hva_cache_init and kvm_vcpu_write_guest_offset_cached means that the MSR value can now be used to access SMRAM, simply by making it point to an SMRAM physical address. This is problematic because it lets the guest OS overwrite memory that it shouldn't be able to touch. Cc: stable@vger.kernel.org Fixes: bbd6411513aa8ef3ea02abab61318daf87c1af1e Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++---------- arch/x86/kvm/x86.c | 41 ++++++++++++++++++++-------------------- include/linux/kvm_host.h | 16 ++++++++-------- virt/kvm/kvm_main.c | 34 ++++++++++++++++----------------- 4 files changed, 58 insertions(+), 55 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc0c..9fa5b816496199 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2fe9aa116288b4..b38a302858a063 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1785,7 +1785,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1806,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + 
kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1822,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2064,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2083,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2094,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2103,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2215,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2236,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2858,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -8527,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void 
kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 25cf258a1c9bf2..3727afdf614d57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -650,18 +650,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6281cc2446d59c..4c4d3fe1065438 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1975,18 +1975,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, return 0; } -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init); +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; @@ -1996,7 +1996,7 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_write_guest(vcpu, gpa, data, len); + return kvm_write_guest(kvm, gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2008,19 +2008,19 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); -int 
kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len); + return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; BUG_ON(len > ghc->len); @@ -2029,7 +2029,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len); + return kvm_read_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2040,7 +2040,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached); +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { From 0266c894b51c41d3161adb90e3c15eda3e90a7be Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 May 2017 15:14:13 +0200 Subject: [PATCH 226/227] KVM: put back #ifndef CONFIG_S390 around kvm_vcpu_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #ifndef was removed in 75aaafb79f73516b69d5639ad30a72d72e75c8b4, but it was also protecting smp_send_reschedule() in kvm_vcpu_kick(). Acked-by: Cornelia Huck Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4c4d3fe1065438..6e3b12c1925a00 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2229,6 +2229,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +#ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ @@ -2247,6 +2248,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +#endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { From 2e5b0bd9cc6172edef502dfae28ae790f74a882e Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 4 May 2017 11:51:58 -0700 Subject: [PATCH 227/227] kvm: nVMX: Don't validate disabled secondary controls According to the SDM, if the "activate secondary controls" primary processor-based VM-execution control is 0, no checks are performed on the secondary processor-based VM-execution controls. 
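The fix simply gates the secondary-controls validation on that activate bit; a standalone sketch of the idea (illustrative and simplified, the kernel's vmx_control_verify also checks the required-1 settings):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS (1u << 31)

    /* simplified: no bit outside the allowed-1 mask may be set */
    static bool control_ok(uint32_t value, uint32_t allowed1)
    {
            return (value & ~allowed1) == 0;
    }

    static bool exec_controls_ok(uint32_t procbased, uint32_t secondary,
                                 uint32_t procbased_allowed1,
                                 uint32_t secondary_allowed1)
    {
            if (!control_ok(procbased, procbased_allowed1))
                    return false;
            /* secondary controls are only checked when they are activated */
            if (procbased & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
                    return control_ok(secondary, secondary_allowed1);
            return true;
    }

    int main(void)
    {
            /* a garbage secondary field is ignored while the activate bit is 0 */
            printf("%d\n", exec_controls_ok(0, 0xdeadbeef, 0xffff, 0xff));
            return 0;
    }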
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75e6cc945be029..e7d929103f4af6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10240,9 +10240,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) ||