xen: SMP guest support
This is a fairly straightforward Xen implementation of smp_ops.

Xen has its own IPI mechanisms, and has no dependency on any
APIC-based IPI.  The smp_ops hooks and the flush_tlb_others pv_op
allow a Xen guest to avoid all APIC code in arch/i386 (the only apic
operation is a single apic_read for the apic version number).
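
For illustration, the reschedule hook reduces to a single event-channel
notification (a sketch; smp.c itself is among the files not shown in the
truncated diff below, and the vector name follows the enum ipi_vector
this patch introduces):

	/* Sketch: a reschedule IPI is just an event-channel notify;
	 * no APIC register is touched anywhere on this path. */
	static void xen_smp_send_reschedule(int cpu)
	{
		xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
	}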

One subtle point which needs to be addressed is unpinning pagetables
when another cpu may have a lazy tlb reference to the pagetable. Xen
will not allow an in-use pagetable to be unpinned, so we must find any
other cpus with a reference to the pagetable and get them to shoot
down their references.
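
Roughly, the shootdown looks like this (a sketch only; the mmu.c hunk
that implements it is not among the diffs shown below, and the helper
names here are approximate):

	/* Sketch: runs on each CPU that might hold a lazy reference. */
	static void drop_other_mm_ref(void *info)
	{
		struct mm_struct *mm = info;

		/* leave_mm() switches this CPU off the lazy mm, dropping
		 * its reference so the pagetable can be unpinned. */
		if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
			leave_mm(smp_processor_id());
	}

	static void drop_mm_ref(struct mm_struct *mm)
	{
		/* Cross-call every CPU that has used this mm and make it
		 * shoot down its lazy reference before we unpin. */
		if (!cpus_empty(mm->cpu_vm_mask))
			xen_smp_call_function_mask(mm->cpu_vm_mask,
						   drop_other_mm_ref, mm, 1);
	}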

Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Benjamin LaHaise <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andi Kleen <[email protected]>
Jeremy Fitzhardinge authored and jsgf committed Jul 18, 2007
1 parent ab55028 · commit f87e4ca
Showing 11 changed files with 705 additions and 53 deletions.
arch/i386/xen/Kconfig: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 config XEN
 	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES)
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
 	help
 	  This is the Linux Xen port. Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
arch/i386/xen/Makefile: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 obj-y	:= enlighten.o setup.o features.o multicalls.o mmu.o \
 			events.o time.o
+
+obj-$(CONFIG_SMP)	+= smp.o
arch/i386/xen/enlighten.c: 89 additions & 26 deletions
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/highmem.h>
+#include <linux/smp.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
@@ -40,6 +41,7 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
Expand All @@ -56,7 +58,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

static void xen_vcpu_setup(int cpu)
void xen_vcpu_setup(int cpu)
{
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
@@ -347,30 +349,46 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 	}
 }
 
-/* Load a new IDT into Xen. In principle this can be per-CPU, so we
-   hold a spinlock to protect the static traps[] array (static because
-   it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+				  struct trap_info *traps)
 {
-	static DEFINE_SPINLOCK(lock);
-	static struct trap_info traps[257];
-
-	int cpu = smp_processor_id();
 	unsigned in, out, count;
 
-	per_cpu(idt_desc, cpu) = *desc;
-
 	count = (desc->size+1) / 8;
 	BUG_ON(count > 256);
 
-	spin_lock(&lock);
 	for (in = out = 0; in < count; in++) {
 		const u32 *entry = (u32 *)(desc->address + in * 8);
 
 		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
 			out++;
 	}
 	traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);
+
+	xen_convert_trap_info(desc, traps);
+
+	put_cpu_var(idt_desc);
+}
+
+/* Load a new IDT into Xen. In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+	static DEFINE_SPINLOCK(lock);
+	static struct trap_info traps[257];
+	int cpu = smp_processor_id();
+
+	per_cpu(idt_desc, cpu) = *desc;
+
+	spin_lock(&lock);
+
+	xen_convert_trap_info(desc, traps);
 
 	xen_mc_flush();
 	if (HYPERVISOR_set_trap_table(traps))
@@ -428,6 +446,12 @@ static unsigned long xen_apic_read(unsigned long reg)
 {
 	return 0;
 }
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+	/* Warn to see if there are any stray references */
+	WARN_ON(1);
+}
 #endif
 
 static void xen_flush_tlb(void)
@@ -449,6 +473,40 @@ static void xen_flush_tlb_single(unsigned long addr)
 	BUG();
 }
 
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+				 unsigned long va)
+{
+	struct mmuext_op op;
+	cpumask_t cpumask = *cpus;
+
+	/*
+	 * A couple of (to be removed) sanity checks:
+	 *
+	 * - current CPU must not be in mask
+	 * - mask must exist :)
+	 */
+	BUG_ON(cpus_empty(cpumask));
+	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+	BUG_ON(!mm);
+
+	/* If a CPU which we ran on has gone down, OK. */
+	cpus_and(cpumask, cpumask, cpu_online_map);
+	if (cpus_empty(cpumask))
+		return;
+
+	if (va == TLB_FLUSH_ALL) {
+		op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+		op.arg2.vcpumask = (void *)cpus;
+	} else {
+		op.cmd = MMUEXT_INVLPG_MULTI;
+		op.arg1.linear_addr = va;
+		op.arg2.vcpumask = (void *)cpus;
+	}
+
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 static unsigned long xen_read_cr2(void)
 {
 	return x86_read_percpu(xen_vcpu)->arch.cr2;
@@ -460,18 +518,6 @@ static void xen_write_cr4(unsigned long cr4)
 	native_write_cr4(cr4 & ~X86_CR4_TSD);
 }
 
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
 static unsigned long xen_read_cr3(void)
 {
 	return x86_read_percpu(xen_cr3);
@@ -740,8 +786,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.io_delay = xen_io_delay,
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = paravirt_nop,
-	.apic_write_atomic = paravirt_nop,
+	.apic_write = xen_apic_write,
+	.apic_write_atomic = xen_apic_write,
 	.apic_read = xen_apic_read,
 	.setup_boot_clock = paravirt_nop,
 	.setup_secondary_clock = paravirt_nop,
@@ -751,6 +797,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
 	.flush_tlb_single = xen_flush_tlb_single,
+	.flush_tlb_others = xen_flush_tlb_others,
 
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
@@ -796,6 +843,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.set_lazy_mode = xen_set_lazy_mode,
 };
 
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_smp_prepare_cpus,
+	.cpu_up = xen_cpu_up,
+	.smp_cpus_done = xen_smp_cpus_done,
+
+	.smp_send_stop = xen_smp_send_stop,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+	.smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif	/* CONFIG_SMP */
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -808,6 +868,9 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	paravirt_ops = xen_paravirt_ops;
+#ifdef CONFIG_SMP
+	smp_ops = xen_smp_ops;
+#endif
 
 	xen_setup_features();
 
arch/i386/xen/events.c: 79 additions & 1 deletion
@@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock);
 /* IRQ <-> VIRQ mapping. */
 static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
 /* Packed IRQ information: binding type, sub-type index, and event channel. */
 struct packed_irq
 {
@@ -58,7 +61,13 @@ struct packed_irq
 static struct packed_irq irq_info[NR_IRQS];
 
 /* Binding types. */
-enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+enum {
+	IRQT_UNBOUND,
+	IRQT_PIRQ,
+	IRQT_VIRQ,
+	IRQT_IPI,
+	IRQT_EVTCHN
+};
 
 /* Convenient shorthand for packed representation of an unbound IRQ. */
 #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
@@ -261,6 +270,45 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
 	return irq;
 }
 
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+	struct evtchn_bind_ipi bind_ipi;
+	int evtchn, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(ipi_to_irq, cpu)[ipi];
+	if (irq == -1) {
+		irq = find_unbound_irq();
+		if (irq < 0)
+			goto out;
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+					      handle_level_irq, "ipi");
+
+		bind_ipi.vcpu = cpu;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+						&bind_ipi) != 0)
+			BUG();
+		evtchn = bind_ipi.port;
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+		per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+
+	irq_bindcount[irq]++;
+
+out:
+	spin_unlock(&irq_mapping_update_lock);
+	return irq;
+}
+
+
 static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 {
 	struct evtchn_bind_virq bind_virq;
@@ -369,13 +417,43 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
 
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+			   unsigned int cpu,
+			   irq_handler_t handler,
+			   unsigned long irqflags,
+			   const char *devname,
+			   void *dev_id)
+{
+	int irq, retval;
+
+	irq = bind_ipi_to_irq(ipi, cpu);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+
 void unbind_from_irqhandler(unsigned int irq, void *dev_id)
 {
 	free_irq(irq, dev_id);
 	unbind_from_irq(irq);
 }
 EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
 
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+	int irq = per_cpu(ipi_to_irq, cpu)[vector];
+	BUG_ON(irq < 0);
+	notify_remote_via_irq(irq);
+}
+
+
 /*
  * Search the CPUs pending events bitmasks. For each one found, map
  * the event number to an irq, and feed it into do_IRQ() for
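
A caller wires an IPI vector to a handler with the new
bind_ipi_to_irqhandler() during CPU bring-up. Schematically (a sketch of
the usage pattern only, with an assumed handler name, since the smp.c
that actually does this is not shown above):

	/* Sketch: bind the reschedule vector for one CPU during bring-up. */
	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
				    cpu,
				    xen_reschedule_interrupt,
				    IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
				    "resched",
				    NULL);
	if (rc < 0)
		return rc;	/* on success, rc is the bound irq */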
[Diffs for the remaining 7 changed files, including arch/i386/xen/smp.c and arch/i386/xen/mmu.c, are not shown.]