Diffstat (limited to 'system/xen/patches/xen-4.10.2-pre.patch')
-rw-r--r--  system/xen/patches/xen-4.10.2-pre.patch | 1631
1 file changed, 1631 insertions(+), 0 deletions(-)
diff --git a/system/xen/patches/xen-4.10.2-pre.patch b/system/xen/patches/xen-4.10.2-pre.patch
new file mode 100644
index 0000000000..42477696e1
--- /dev/null
+++ b/system/xen/patches/xen-4.10.2-pre.patch
@@ -0,0 +1,1631 @@
+diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile
+index a47a658a25..c17f3924cc 100644
+--- a/tools/libacpi/Makefile
++++ b/tools/libacpi/Makefile
+@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC)
+
+ $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl
+ iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $<
+- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@
++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@
+ rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
+
+ $(MK_DSDT): mk_dsdt.c
+@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT)
+
+ $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl
+ iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl
+- sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
++ sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
+ echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX)
+ mv -f $@.$(TMP_SUFFIX) $@
+ rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
+#diff --git a/xen/Makefile b/xen/Makefile
+#index ecec297b9b..580af86931 100644
+#--- a/xen/Makefile
+#+++ b/xen/Makefile
+#@@ -2,7 +2,7 @@
+# # All other places this is stored (eg. compile.h) should be autogenerated.
+# export XEN_VERSION = 4
+# export XEN_SUBVERSION = 10
+#-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION)
+#+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION)
+# export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
+# -include xen-version
+#
+diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
+index 1e4e5680a7..f7085d3c7b 100644
+--- a/xen/arch/x86/acpi/power.c
++++ b/xen/arch/x86/acpi/power.c
+@@ -28,6 +28,7 @@
+ #include <asm/tboot.h>
+ #include <asm/apic.h>
+ #include <asm/io_apic.h>
++#include <asm/spec_ctrl.h>
+ #include <acpi/cpufreq/cpufreq.h>
+
+ uint32_t system_reset_counter = 1;
+@@ -163,6 +164,7 @@ static int enter_state(u32 state)
+ {
+ unsigned long flags;
+ int error;
++ struct cpu_info *ci;
+ unsigned long cr4;
+
+ if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) )
+@@ -203,12 +205,18 @@ static int enter_state(u32 state)
+ printk(XENLOG_ERR "Some devices failed to power down.");
+ system_state = SYS_STATE_resume;
+ device_power_up(error);
++ console_end_sync();
+ error = -EIO;
+ goto done;
+ }
+ else
+ error = 0;
+
++ ci = get_cpu_info();
++ spec_ctrl_enter_idle(ci);
++ /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */
++ ci->bti_ist_info = 0;
++
+ ACPI_FLUSH_CPU_CACHE();
+
+ switch ( state )
+@@ -243,17 +251,23 @@ static int enter_state(u32 state)
+ if ( (state == ACPI_STATE_S3) && error )
+ tboot_s3_error(error);
+
++ console_end_sync();
++
++ microcode_resume_cpu(0);
++
++ /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */
++ ci->bti_ist_info = default_bti_ist_info;
++ spec_ctrl_exit_idle(ci);
++
+ done:
+ spin_debug_enable();
+ local_irq_restore(flags);
+- console_end_sync();
+ acpi_sleep_post(state);
+ if ( hvm_cpu_up() )
+ BUG();
++ cpufreq_add_cpu(0);
+
+ enable_cpu:
+- cpufreq_add_cpu(0);
+- microcode_resume_cpu(0);
+ rcu_barrier();
+ mtrr_aps_sync_begin();
+ enable_nonboot_cpus();
+diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
+index fdb2bf1779..136adadb63 100644
+--- a/xen/arch/x86/cpu/common.c
++++ b/xen/arch/x86/cpu/common.c
+@@ -747,6 +747,7 @@ void load_system_tables(void)
+ [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE,
+ [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE,
+ [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE,
++ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE,
+
+ [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] =
+ 0x8600111111111111ul,
+@@ -774,6 +775,7 @@ void load_system_tables(void)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
+
+ /*
+ * Bottom-of-stack must be 16-byte aligned!
+diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
+index 8229c635e4..f18cbbd55a 100644
+--- a/xen/arch/x86/hpet.c
++++ b/xen/arch/x86/hpet.c
+@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu,
+ static void hpet_detach_channel(unsigned int cpu,
+ struct hpet_event_channel *ch)
+ {
++ unsigned int next;
++
+ spin_lock_irq(&ch->lock);
+
+ ASSERT(ch == per_cpu(cpu_bc_channel, cpu));
+@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu,
+
+ if ( cpu != ch->cpu )
+ spin_unlock_irq(&ch->lock);
+- else if ( cpumask_empty(ch->cpumask) )
++ else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids )
+ {
+ ch->cpu = -1;
+ clear_bit(HPET_EVT_USED_BIT, &ch->flags);
+@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu,
+ }
+ else
+ {
+- ch->cpu = cpumask_first(ch->cpumask);
++ ch->cpu = next;
+ set_channel_irq_affinity(ch);
+ local_irq_enable();
+ }
+diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
+index b282089e03..131480fdd9 100644
+--- a/xen/arch/x86/hvm/emulate.c
++++ b/xen/arch/x86/hvm/emulate.c
+@@ -2113,22 +2113,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
+
+ vio->mmio_retry = 0;
+
+- switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) )
++ rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
++ if ( rc == X86EMUL_OKAY && vio->mmio_retry )
++ rc = X86EMUL_RETRY;
++
++ if ( !hvm_vcpu_io_need_completion(vio) )
+ {
+- case X86EMUL_OKAY:
+- if ( vio->mmio_retry )
+- rc = X86EMUL_RETRY;
+- /* fall through */
+- default:
+ vio->mmio_cache_count = 0;
+ vio->mmio_insn_bytes = 0;
+- break;
+-
+- case X86EMUL_RETRY:
++ }
++ else
++ {
+ BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
+ vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
+ memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
+- break;
+ }
+
+ if ( hvmemul_ctxt->ctxt.retire.singlestep )
+diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c
+index f7aed7f69e..28377091ca 100644
+--- a/xen/arch/x86/hvm/hpet.c
++++ b/xen/arch/x86/hvm/hpet.c
+@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, unsigned int tn,
+ diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
+ ? (uint32_t)diff : 0;
+
++ destroy_periodic_time(&h->pt[tn]);
+ if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
++ {
+ /* if LegacyReplacementRoute bit is set, HPET specification requires
+ timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
+ timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
+ irq = (tn == 0) ? 0 : 8;
++ h->pt[tn].source = PTSRC_isa;
++ }
+ else
++ {
+ irq = timer_int_route(h, tn);
++ h->pt[tn].source = PTSRC_ioapic;
++ }
+
+ /*
+ * diff is the time from now when the timer should fire, for a periodic
+diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
+index d5afe20cc8..25b2445429 100644
+--- a/xen/arch/x86/hvm/ioreq.c
++++ b/xen/arch/x86/hvm/ioreq.c
+@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
+
+ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+ {
++ unsigned int prev_state = STATE_IOREQ_NONE;
++
+ while ( sv->pending )
+ {
+ unsigned int state = p->state;
+
+- rmb();
+- switch ( state )
++ smp_rmb();
++
++ recheck:
++ if ( unlikely(state == STATE_IOREQ_NONE) )
+ {
+- case STATE_IOREQ_NONE:
+ /*
+ * The only reason we should see this case is when an
+ * emulator is dying and it races with an I/O being
+@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+ */
+ hvm_io_assist(sv, ~0ul);
+ break;
++ }
++
++ if ( unlikely(state < prev_state) )
++ {
++ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
++ prev_state, state);
++ sv->pending = false;
++ domain_crash(sv->vcpu->domain);
++ return false; /* bail */
++ }
++
++ switch ( prev_state = state )
++ {
+ case STATE_IORESP_READY: /* IORESP_READY -> NONE */
+ p->state = STATE_IOREQ_NONE;
+ hvm_io_assist(sv, p->data);
+ break;
+ case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
+ case STATE_IOREQ_INPROCESS:
+- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state);
+- break;
++ wait_on_xen_event_channel(sv->ioreq_evtchn,
++ ({ state = p->state;
++ smp_rmb();
++ state != prev_state; }));
++ goto recheck;
+ default:
+ gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
+ sv->pending = false;
+diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
+index f528e2d081..c85d004402 100644
+--- a/xen/arch/x86/hvm/irq.c
++++ b/xen/arch/x86/hvm/irq.c
+@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi)
+ vioapic_irq_positive_edge(d, ioapic_gsi);
+ }
+
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level)
++{
++ struct hvm_irq *hvm_irq = hvm_domain_irq(d);
++ int vector;
++
++ if ( gsi >= hvm_irq->nr_gsis )
++ {
++ ASSERT_UNREACHABLE();
++ return -1;
++ }
++
++ spin_lock(&d->arch.hvm_domain.irq_lock);
++ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 )
++ assert_gsi(d, gsi);
++ vector = vioapic_get_vector(d, gsi);
++ spin_unlock(&d->arch.hvm_domain.irq_lock);
++
++ return vector;
++}
++
+ static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq)
+ {
+ assert_gsi(d, ioapic_gsi);
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index dedec5752d..3b72b4dc2a 100644
+--- a/xen/arch/x86/hvm/svm/svm.c
++++ b/xen/arch/x86/hvm/svm/svm.c
+@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct vcpu *v)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB);
+ }
+
+ static void svm_ctxt_switch_to(struct vcpu *v)
+@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vcpu *v)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
+
+ svm_restore_dr(v);
+
+@@ -1836,6 +1838,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ struct vcpu *v = current;
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
++ switch ( msr )
++ {
++ /*
++ * Sync not needed while the cross-vendor logic is in unilateral effect.
++ case MSR_IA32_SYSENTER_CS:
++ case MSR_IA32_SYSENTER_ESP:
++ case MSR_IA32_SYSENTER_EIP:
++ */
++ case MSR_STAR:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_SYSCALL_MASK:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ svm_sync_vmcb(v);
++ break;
++ }
++
+ switch ( msr )
+ {
+ case MSR_IA32_SYSENTER_CS:
+@@ -1848,6 +1869,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
+ break;
+
++ case MSR_STAR:
++ *msr_content = vmcb->star;
++ break;
++
++ case MSR_LSTAR:
++ *msr_content = vmcb->lstar;
++ break;
++
++ case MSR_CSTAR:
++ *msr_content = vmcb->cstar;
++ break;
++
++ case MSR_SYSCALL_MASK:
++ *msr_content = vmcb->sfmask;
++ break;
++
++ case MSR_FS_BASE:
++ *msr_content = vmcb->fs.base;
++ break;
++
++ case MSR_GS_BASE:
++ *msr_content = vmcb->gs.base;
++ break;
++
++ case MSR_SHADOW_GS_BASE:
++ *msr_content = vmcb->kerngsbase;
++ break;
++
+ case MSR_IA32_MCx_MISC(4): /* Threshold register */
+ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
+ /*
+@@ -1976,32 +2025,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ int ret, result = X86EMUL_OKAY;
+ struct vcpu *v = current;
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+- int sync = 0;
++ bool sync = false;
+
+ switch ( msr )
+ {
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+- sync = 1;
+- break;
+- default:
++ case MSR_STAR:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_SYSCALL_MASK:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ sync = true;
+ break;
+ }
+
+ if ( sync )
+- svm_sync_vmcb(v);
++ svm_sync_vmcb(v);
+
+ switch ( msr )
+ {
++ case MSR_IA32_SYSENTER_ESP:
++ case MSR_IA32_SYSENTER_EIP:
++ case MSR_LSTAR:
++ case MSR_CSTAR:
++ case MSR_FS_BASE:
++ case MSR_GS_BASE:
++ case MSR_SHADOW_GS_BASE:
++ if ( !is_canonical_address(msr_content) )
++ goto gpf;
++
++ switch ( msr )
++ {
++ case MSR_IA32_SYSENTER_ESP:
++ vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
++ break;
++
++ case MSR_IA32_SYSENTER_EIP:
++ vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
++ break;
++
++ case MSR_LSTAR:
++ vmcb->lstar = msr_content;
++ break;
++
++ case MSR_CSTAR:
++ vmcb->cstar = msr_content;
++ break;
++
++ case MSR_FS_BASE:
++ vmcb->fs.base = msr_content;
++ break;
++
++ case MSR_GS_BASE:
++ vmcb->gs.base = msr_content;
++ break;
++
++ case MSR_SHADOW_GS_BASE:
++ vmcb->kerngsbase = msr_content;
++ break;
++ }
++ break;
++
+ case MSR_IA32_SYSENTER_CS:
+ vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
+ break;
+- case MSR_IA32_SYSENTER_ESP:
+- vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
++
++ case MSR_STAR:
++ vmcb->star = msr_content;
+ break;
+- case MSR_IA32_SYSENTER_EIP:
+- vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
++
++ case MSR_SYSCALL_MASK:
++ vmcb->sfmask = msr_content;
+ break;
+
+ case MSR_IA32_DEBUGCTLMSR:
+diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c
+index 89ef2db932..b5b946aa94 100644
+--- a/xen/arch/x86/hvm/svm/svmdebug.c
++++ b/xen/arch/x86/hvm/svm/svmdebug.c
+@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb,
+ PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n",
+ vmcb_get_dr7(vmcb));
+
+- if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME |
+- EFER_LMSLE | EFER_FFXSE) )
+- PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer);
++ if ( efer & ~EFER_KNOWN_MASK )
++ PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer);
+
+ if ( hvm_efer_valid(v, efer, -1) )
+ PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer);
+diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c
+index f0fa59d7d5..b02a70d086 100644
+--- a/xen/arch/x86/hvm/viridian.c
++++ b/xen/arch/x86/hvm/viridian.c
+@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf,
+ };
+ union {
+ HV_PARTITION_PRIVILEGE_MASK mask;
+- uint32_t lo, hi;
++ struct { uint32_t lo, hi; };
+ } u;
+
+ if ( !(viridian_feature_mask(d) & HVMPV_no_freq) )
+@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs)
+ gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n",
+ input.call_code);
+ /* Fallthrough. */
+- case HvGetPartitionId:
+ case HvExtCallQueryCapabilities:
+ /*
+- * These hypercalls seem to be erroneously issued by Windows
+- * despite neither AccessPartitionId nor EnableExtendedHypercalls
+- * being set in CPUID leaf 2.
++ * This hypercall seems to be erroneously issued by Windows
++ * despite EnableExtendedHypercalls not being set in CPUID leaf 2.
+ * Given that return a status of 'invalid code' has not so far
+ * caused any problems it's not worth logging.
+ */
+diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
+index 181f4cb631..04e3c2e15b 100644
+--- a/xen/arch/x86/hvm/vpt.c
++++ b/xen/arch/x86/hvm/vpt.c
+@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src)
+ static int pt_irq_masked(struct periodic_time *pt)
+ {
+ struct vcpu *v = pt->vcpu;
+- unsigned int gsi, isa_irq;
+- int mask;
+- uint8_t pic_imr;
++ unsigned int gsi = pt->irq;
+
+- if ( pt->source == PTSRC_lapic )
++ switch ( pt->source )
++ {
++ case PTSRC_lapic:
+ {
+ struct vlapic *vlapic = vcpu_vlapic(v);
++
+ return (!vlapic_enabled(vlapic) ||
+ (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
+ }
+
+- isa_irq = pt->irq;
+- gsi = hvm_isa_irq_to_gsi(isa_irq);
+- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
+- mask = vioapic_get_mask(v->domain, gsi);
+- if ( mask < 0 )
++ case PTSRC_isa:
+ {
+- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n",
+- v->domain->domain_id, gsi);
+- domain_crash(v->domain);
+- return -1;
++ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr;
++
++ /* Check if the interrupt is unmasked in the PIC. */
++ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) )
++ return 0;
++
++ gsi = hvm_isa_irq_to_gsi(pt->irq);
++ }
++
++ /* Fallthrough to check if the interrupt is masked on the IO APIC. */
++ case PTSRC_ioapic:
++ {
++ int mask = vioapic_get_mask(v->domain, gsi);
++
++ if ( mask < 0 )
++ {
++ dprintk(XENLOG_WARNING,
++ "d%d: invalid GSI (%u) for platform timer\n",
++ v->domain->domain_id, gsi);
++ domain_crash(v->domain);
++ return -1;
++ }
++
++ return mask;
++ }
+ }
+
+- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
+- mask);
++ ASSERT_UNREACHABLE();
++ return 1;
+ }
+
+ static void pt_lock(struct periodic_time *pt)
+@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v)
+ struct list_head *head = &v->arch.hvm_vcpu.tm_list;
+ struct periodic_time *pt, *temp, *earliest_pt;
+ uint64_t max_lag;
+- int irq, is_lapic, pt_vector;
++ int irq, pt_vector = -1;
+
+ spin_lock(&v->arch.hvm_vcpu.tm_lock);
+
+@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v)
+
+ earliest_pt->irq_issued = 1;
+ irq = earliest_pt->irq;
+- is_lapic = (earliest_pt->source == PTSRC_lapic);
+
+ spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+
+- /*
+- * If periodic timer interrut is handled by lapic, its vector in
+- * IRR is returned and used to set eoi_exit_bitmap for virtual
+- * interrupt delivery case. Otherwise return -1 to do nothing.
+- */
+- if ( is_lapic )
++ switch ( earliest_pt->source )
+ {
++ case PTSRC_lapic:
++ /*
++ * If periodic timer interrupt is handled by lapic, its vector in
++ * IRR is returned and used to set eoi_exit_bitmap for virtual
++ * interrupt delivery case. Otherwise return -1 to do nothing.
++ */
+ vlapic_set_irq(vcpu_vlapic(v), irq, 0);
+ pt_vector = irq;
+- }
+- else
+- {
++ break;
++
++ case PTSRC_isa:
+ hvm_isa_irq_deassert(v->domain, irq);
+ if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
+ v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
+- {
+ hvm_isa_irq_assert(v->domain, irq, NULL);
+- pt_vector = -1;
+- }
+ else
+ {
+ pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
+@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v)
+ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
+ pt_vector = -1;
+ }
++ break;
++
++ case PTSRC_ioapic:
++ /*
++ * NB: At the moment IO-APIC routed interrupts generated by vpt devices
++ * (HPET) are edge-triggered.
++ */
++ pt_vector = hvm_ioapic_assert(v->domain, irq, false);
++ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
++ pt_vector = -1;
++ break;
+ }
+
+ return pt_vector;
+@@ -418,7 +444,14 @@ void create_periodic_time(
+ struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+ uint64_t period, uint8_t irq, time_cb *cb, void *data)
+ {
+- ASSERT(pt->source != 0);
++ if ( !pt->source ||
++ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) ||
++ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis &&
++ pt->source == PTSRC_ioapic) )
++ {
++ ASSERT_UNREACHABLE();
++ return;
++ }
+
+ destroy_periodic_time(pt);
+
+@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
+ {
+ int on_list;
+
+- ASSERT(pt->source == PTSRC_isa);
++ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
+
+ if ( pt->vcpu == NULL )
+ return;
+diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
+index 642ca312bf..c281936af0 100644
+--- a/xen/arch/x86/pv/emul-priv-op.c
++++ b/xen/arch/x86/pv/emul-priv-op.c
+@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val,
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+-static int read_dr(unsigned int reg, unsigned long *val,
+- struct x86_emulate_ctxt *ctxt)
+-{
+- unsigned long res = do_get_debugreg(reg);
+-
+- if ( IS_ERR_VALUE(res) )
+- return X86EMUL_UNHANDLEABLE;
+-
+- *val = res;
+-
+- return X86EMUL_OKAY;
+-}
+-
+-static int write_dr(unsigned int reg, unsigned long val,
+- struct x86_emulate_ctxt *ctxt)
+-{
+- return do_set_debugreg(reg, val) == 0
+- ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+-}
+-
+ static inline uint64_t guest_misc_enable(uint64_t val)
+ {
+ val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val,
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+- *val = read_efer();
++ /* Hide unknown bits, and unconditionally hide SVME from guests. */
++ *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
++ /*
++ * Hide the 64-bit features from 32-bit guests. SCE has
++ * vendor-dependent behaviour.
++ */
+ if ( is_pv_32bit_domain(currd) )
+- *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
++ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE |
++ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
++ ? EFER_SCE : 0));
+ return X86EMUL_OKAY;
+
+ case MSR_K7_FID_VID_CTL:
+@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = {
+ .read_segment = read_segment,
+ .read_cr = read_cr,
+ .write_cr = write_cr,
+- .read_dr = read_dr,
+- .write_dr = write_dr,
++ .read_dr = x86emul_read_dr,
++ .write_dr = x86emul_write_dr,
+ .read_msr = read_msr,
+ .write_msr = write_msr,
+ .cpuid = pv_emul_cpuid,
+diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
+index 5862130697..1619be7874 100644
+--- a/xen/arch/x86/pv/misc-hypercalls.c
++++ b/xen/arch/x86/pv/misc-hypercalls.c
+@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value)
+
+ unsigned long do_get_debugreg(int reg)
+ {
+- struct vcpu *curr = current;
++ unsigned long val;
++ int res = x86emul_read_dr(reg, &val, NULL);
+
+- switch ( reg )
+- {
+- case 0 ... 3:
+- case 6:
+- return curr->arch.debugreg[reg];
+- case 7:
+- return (curr->arch.debugreg[7] |
+- curr->arch.debugreg[5]);
+- case 4 ... 5:
+- return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
+- curr->arch.debugreg[reg + 2] : 0);
+- }
+-
+- return -EINVAL;
++ return res == X86EMUL_OKAY ? val : -ENODEV;
+ }
+
+ long do_fpu_taskswitch(int set)
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index e1d023428c..f81fc2ca65 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -968,6 +968,7 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE);
+
+ for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+ i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 3c7447bfe6..fa67a0ffbd 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -97,12 +97,13 @@ static void __init print_details(enum ind_thunk thunk)
+ printk(XENLOG_DEBUG "Speculative mitigation facilities:\n");
+
+ /* Hardware features which pertain to speculative mitigations. */
+- printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s\n",
++ printk(XENLOG_DEBUG " Hardware features:%s%s%s%s%s%s\n",
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "",
+ (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "",
+- (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "");
++ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "",
++ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "");
+
+ /* Compiled-in support which pertains to BTI mitigations. */
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
+@@ -135,6 +136,20 @@ static bool __init retpoline_safe(void)
+ boot_cpu_data.x86 != 6 )
+ return false;
+
++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
++ {
++ uint64_t caps;
++
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++ /*
++ * RBSA may be set by a hypervisor to indicate that we may move to a
++ * processor which isn't retpoline-safe.
++ */
++ if ( caps & ARCH_CAPS_RSBA )
++ return false;
++ }
++
+ switch ( boot_cpu_data.x86_model )
+ {
+ case 0x17: /* Penryn */
+@@ -161,18 +176,40 @@ static bool __init retpoline_safe(void)
+ * versions.
+ */
+ case 0x3d: /* Broadwell */
+- return ucode_rev >= 0x28;
++ return ucode_rev >= 0x2a;
+ case 0x47: /* Broadwell H */
+- return ucode_rev >= 0x1b;
++ return ucode_rev >= 0x1d;
+ case 0x4f: /* Broadwell EP/EX */
+- return ucode_rev >= 0xb000025;
++ return ucode_rev >= 0xb000021;
+ case 0x56: /* Broadwell D */
+- return false; /* TBD. */
++ switch ( boot_cpu_data.x86_mask )
++ {
++ case 2: return ucode_rev >= 0x15;
++ case 3: return ucode_rev >= 0x7000012;
++ case 4: return ucode_rev >= 0xf000011;
++ case 5: return ucode_rev >= 0xe000009;
++ default:
++ printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n",
++ boot_cpu_data.x86_mask);
++ return false;
++ }
++ break;
+
+ /*
+- * Skylake and later processors are not retpoline-safe.
++ * Skylake, Kabylake and Cannonlake processors are not retpoline-safe.
+ */
++ case 0x4e:
++ case 0x55:
++ case 0x5e:
++ case 0x66:
++ case 0x67:
++ case 0x8e:
++ case 0x9e:
++ return false;
++
+ default:
++ printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n",
++ boot_cpu_data.x86_model);
+ return false;
+ }
+ }
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index 906124331b..e217b0d6e2 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
+ /*
+ * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
+ *
+- * Stack pages 0, 1 and 2:
++ * Stack pages 0 - 3:
+ * These are all 1-page IST stacks. Each of these stacks have an exception
+ * frame and saved register state at the top. The interesting bound for a
+ * trace is the word adjacent to this, while the bound for a dump is the
+ * very top, including the exception frame.
+ *
+- * Stack pages 3, 4 and 5:
++ * Stack pages 4 and 5:
+ * None of these are particularly interesting. With MEMORY_GUARD, page 5 is
+ * explicitly not present, so attempting to dump or trace it is
+ * counterproductive. Without MEMORY_GUARD, it is possible for a call chain
+@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp)
+ {
+ switch ( get_stack_page(sp) )
+ {
+- case 0 ... 2:
++ case 0 ... 3:
+ return ROUNDUP(sp, PAGE_SIZE) -
+ offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
+
+ #ifndef MEMORY_GUARD
+- case 3 ... 5:
++ case 4 ... 5:
+ #endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) -
+@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp)
+ {
+ switch ( get_stack_page(sp) )
+ {
+- case 0 ... 2:
++ case 0 ... 3:
+ return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
+
+ #ifndef MEMORY_GUARD
+- case 3 ... 5:
++ case 4 ... 5:
+ #endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
+@@ -1761,11 +1761,36 @@ static void ler_enable(void)
+
+ void do_debug(struct cpu_user_regs *regs)
+ {
++ unsigned long dr6;
+ struct vcpu *v = current;
+
++ /* Stash dr6 as early as possible. */
++ dr6 = read_debugreg(6);
++
+ if ( debugger_trap_entry(TRAP_debug, regs) )
+ return;
+
++ /*
++ * At the time of writing (March 2018), on the subject of %dr6:
++ *
++ * The Intel manual says:
++ * Certain debug exceptions may clear bits 0-3. The remaining contents
++ * of the DR6 register are never cleared by the processor. To avoid
++ * confusion in identifying debug exceptions, debug handlers should
++ * clear the register (except bit 16, which they should set) before
++ * returning to the interrupted task.
++ *
++ * The AMD manual says:
++ * Bits 15:13 of the DR6 register are not cleared by the processor and
++ * must be cleared by software after the contents have been read.
++ *
++ * Some bits are reserved set, some are reserved clear, and some bits
++ * which were previously reserved set are reused and cleared by hardware.
++ * For future compatibility, reset to the default value, which will allow
++ * us to spot any bit being changed by hardware to its non-default value.
++ */
++ write_debugreg(6, X86_DR6_DEFAULT);
++
+ if ( !guest_mode(regs) )
+ {
+ if ( regs->eflags & X86_EFLAGS_TF )
+@@ -1784,21 +1809,50 @@ void do_debug(struct cpu_user_regs *regs)
+ regs->eflags &= ~X86_EFLAGS_TF;
+ }
+ }
+- else
++
++ /*
++ * Check for fault conditions. General Detect, and instruction
++ * breakpoints are faults rather than traps, at which point attempting
++ * to ignore and continue will result in a livelock.
++ */
++ if ( dr6 & DR_GENERAL_DETECT )
+ {
+- /*
+- * We ignore watchpoints when they trigger within Xen. This may
+- * happen when a buffer is passed to us which previously had a
+- * watchpoint set on it. No need to bump EIP; the only faulting
+- * trap is an instruction breakpoint, which can't happen to us.
+- */
+- WARN_ON(!search_exception_table(regs));
++ printk(XENLOG_ERR "Hit General Detect in Xen context\n");
++ fatal_trap(regs, 0);
++ }
++
++ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
++ {
++ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT;
++
++ for ( bp = 0; bp < 4; ++bp )
++ {
++ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
++ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ )
++ {
++ printk(XENLOG_ERR
++ "Hit instruction breakpoint in Xen context\n");
++ fatal_trap(regs, 0);
++ }
++ }
+ }
++
++ /*
++ * Whatever caused this #DB should be a trap. Note it and continue.
++ * Guests can trigger this in certain corner cases, so ensure the
++ * message is ratelimited.
++ */
++ gprintk(XENLOG_WARNING,
++ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
++ regs->cs, _p(regs->rip), _p(regs->rip),
++ regs->ss, _p(regs->rsp), dr6);
++
+ goto out;
+ }
+
+ /* Save debug status register where guest OS can peek at it */
+- v->arch.debugreg[6] = read_debugreg(6);
++ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
++ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
+
+ ler_enable();
+ pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+@@ -1917,6 +1971,7 @@ void __init init_idt_traps(void)
+ set_ist(&idt_table[TRAP_double_fault], IST_DF);
+ set_ist(&idt_table[TRAP_nmi], IST_NMI);
+ set_ist(&idt_table[TRAP_machine_check], IST_MCE);
++ set_ist(&idt_table[TRAP_debug], IST_DB);
+
+ /* CPU0 uses the master IDT. */
+ idt_tables[0] = idt_table;
+@@ -1984,6 +2039,12 @@ void activate_debugregs(const struct vcpu *curr)
+ }
+ }
+
++/*
++ * Used by hypercalls and the emulator.
++ * -ENODEV => #UD
++ * -EINVAL => #GP Invalid bit
++ * -EPERM => #GP Valid bit, but not permitted to use
++ */
+ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ {
+ int i;
+@@ -2015,7 +2076,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ if ( v == curr )
+ write_debugreg(3, value);
+ break;
++
++ case 4:
++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
++ return -ENODEV;
++
++ /* Fallthrough */
+ case 6:
++ /* The upper 32 bits are strictly reserved. */
++ if ( value != (uint32_t)value )
++ return -EINVAL;
++
+ /*
+ * DR6: Bits 4-11,16-31 reserved (set to 1).
+ * Bit 12 reserved (set to 0).
+@@ -2025,7 +2096,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ if ( v == curr )
+ write_debugreg(6, value);
+ break;
++
++ case 5:
++ if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
++ return -ENODEV;
++
++ /* Fallthrough */
+ case 7:
++ /* The upper 32 bits are strictly reserved. */
++ if ( value != (uint32_t)value )
++ return -EINVAL;
++
+ /*
+ * DR7: Bit 10 reserved (set to 1).
+ * Bits 11-12,14-15 reserved (set to 0).
+@@ -2038,6 +2119,10 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ */
+ if ( value & DR_GENERAL_DETECT )
+ return -EPERM;
++
++ /* Zero the IO shadow before recalculating the real %dr7 */
++ v->arch.debugreg[5] = 0;
++
+ /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
+ if ( value & DR7_ACTIVE_MASK )
+ {
+@@ -2070,7 +2155,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ write_debugreg(7, value);
+ break;
+ default:
+- return -EINVAL;
++ return -ENODEV;
+ }
+
+ v->arch.debugreg[reg] = value;
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index 75497bc292..a47cb9dc19 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events)
+ leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
+ cmpl $0,(%rcx,%rax,1)
+ jne compat_process_softirqs
++
++ /* Inject exception if pending. */
++ lea VCPU_trap_bounce(%rbx), %rdx
++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++ jnz .Lcompat_process_trapbounce
++
+ testb $1,VCPU_mce_pending(%rbx)
+ jnz compat_process_mce
+ .Lcompat_test_guest_nmi:
+@@ -68,15 +74,24 @@ compat_process_softirqs:
+ call do_softirq
+ jmp compat_test_all_events
+
++ ALIGN
++/* %rbx: struct vcpu, %rdx: struct trap_bounce */
++.Lcompat_process_trapbounce:
++ sti
++.Lcompat_bounce_exception:
++ call compat_create_bounce_frame
++ movb $0, TRAPBOUNCE_flags(%rdx)
++ jmp compat_test_all_events
++
+ ALIGN
+ /* %rbx: struct vcpu */
+ compat_process_mce:
+ testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx)
+ jnz .Lcompat_test_guest_nmi
+ sti
+- movb $0,VCPU_mce_pending(%rbx)
+- call set_guest_machinecheck_trapbounce
+- testl %eax,%eax
++ movb $0, VCPU_mce_pending(%rbx)
++ call set_guest_machinecheck_trapbounce
++ test %al, %al
+ jz compat_test_all_events
+ movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
+ movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall
+@@ -88,11 +103,11 @@ compat_process_mce:
+ /* %rbx: struct vcpu */
+ compat_process_nmi:
+ testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx)
+- jnz compat_test_guest_events
++ jnz compat_test_guest_events
+ sti
+- movb $0,VCPU_nmi_pending(%rbx)
++ movb $0, VCPU_nmi_pending(%rbx)
+ call set_guest_nmi_trapbounce
+- testl %eax,%eax
++ test %al, %al
+ jz compat_test_all_events
+ movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
+ movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall
+@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore)
+ xor %eax, %eax
+ ret
+
+-/* %rdx: trap_bounce, %rbx: struct vcpu */
+-ENTRY(compat_post_handle_exception)
+- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+- jz compat_test_all_events
+-.Lcompat_bounce_exception:
+- call compat_create_bounce_frame
+- movb $0,TRAPBOUNCE_flags(%rdx)
+- jmp compat_test_all_events
+-
+ .section .text.entry, "ax", @progbits
+
+ /* See lstar_enter for entry register state. */
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index bdd33e727f..41d3ec21a1 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -42,6 +42,12 @@ test_all_events:
+ leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx
+ cmpl $0, (%rcx, %rax, 1)
+ jne process_softirqs
++
++ /* Inject exception if pending. */
++ lea VCPU_trap_bounce(%rbx), %rdx
++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++ jnz .Lprocess_trapbounce
++
+ cmpb $0, VCPU_mce_pending(%rbx)
+ jne process_mce
+ .Ltest_guest_nmi:
+@@ -69,6 +75,15 @@ process_softirqs:
+ call do_softirq
+ jmp test_all_events
+
++ ALIGN
++/* %rbx: struct vcpu, %rdx struct trap_bounce */
++.Lprocess_trapbounce:
++ sti
++.Lbounce_exception:
++ call create_bounce_frame
++ movb $0, TRAPBOUNCE_flags(%rdx)
++ jmp test_all_events
++
+ ALIGN
+ /* %rbx: struct vcpu */
+ process_mce:
+@@ -77,7 +92,7 @@ process_mce:
+ sti
+ movb $0, VCPU_mce_pending(%rbx)
+ call set_guest_machinecheck_trapbounce
+- test %eax, %eax
++ test %al, %al
+ jz test_all_events
+ movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
+ movb %dl, VCPU_mce_old_mask(%rbx) # iret hypercall
+@@ -93,7 +108,7 @@ process_nmi:
+ sti
+ movb $0, VCPU_nmi_pending(%rbx)
+ call set_guest_nmi_trapbounce
+- test %eax, %eax
++ test %al, %al
+ jz test_all_events
+ movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
+ movb %dl, VCPU_nmi_old_mask(%rbx) # iret hypercall
+@@ -667,15 +682,9 @@ handle_exception_saved:
+ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ testb $3,UREGS_cs(%rsp)
+ jz restore_all_xen
+- leaq VCPU_trap_bounce(%rbx),%rdx
+ movq VCPU_domain(%rbx),%rax
+ testb $1,DOMAIN_is_32bit_pv(%rax)
+- jnz compat_post_handle_exception
+- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+- jz test_all_events
+-.Lbounce_exception:
+- call create_bounce_frame
+- movb $0,TRAPBOUNCE_flags(%rdx)
++ jnz compat_test_all_events
+ jmp test_all_events
+
+ /* No special register assumptions. */
+@@ -730,7 +739,7 @@ ENTRY(device_not_available)
+ ENTRY(debug)
+ pushq $0
+ movl $TRAP_debug,4(%rsp)
+- jmp handle_exception
++ jmp handle_ist_exception
+
+ ENTRY(int3)
+ pushq $0
+@@ -783,12 +792,14 @@ ENTRY(double_fault)
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
+ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx
+- test %rbx, %rbx
++ neg %rbx
+ jz .Ldblf_cr3_okay
+ jns .Ldblf_cr3_load
++ mov %rbx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ neg %rbx
+ .Ldblf_cr3_load:
+ mov %rbx, %cr3
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ .Ldblf_cr3_okay:
+
+ movq %rsp,%rdi
+diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
+index c7ba221d11..9125c67c9e 100644
+--- a/xen/arch/x86/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate.c
+@@ -14,6 +14,7 @@
+ #include <asm/processor.h> /* current_cpu_info */
+ #include <asm/xstate.h>
+ #include <asm/amd.h> /* cpu_has_amd_erratum() */
++#include <asm/debugreg.h>
+
+ /* Avoid namespace pollution. */
+ #undef cmpxchg
+@@ -41,3 +42,75 @@
+ })
+
+ #include "x86_emulate/x86_emulate.c"
++
++/* Called with NULL ctxt in hypercall context. */
++int x86emul_read_dr(unsigned int reg, unsigned long *val,
++ struct x86_emulate_ctxt *ctxt)
++{
++ struct vcpu *curr = current;
++
++ /* HVM support requires a bit more plumbing before it will work. */
++ ASSERT(is_pv_vcpu(curr));
++
++ switch ( reg )
++ {
++ case 0 ... 3:
++ case 6:
++ *val = curr->arch.debugreg[reg];
++ break;
++
++ case 7:
++ *val = (curr->arch.debugreg[7] |
++ curr->arch.debugreg[5]);
++ break;
++
++ case 4 ... 5:
++ if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
++ {
++ *val = curr->arch.debugreg[reg + 2];
++ break;
++ }
++
++ /* Fallthrough */
++ default:
++ if ( ctxt )
++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
++
++ return X86EMUL_EXCEPTION;
++ }
++
++ return X86EMUL_OKAY;
++}
++
++int x86emul_write_dr(unsigned int reg, unsigned long val,
++ struct x86_emulate_ctxt *ctxt)
++{
++ struct vcpu *curr = current;
++
++ /* HVM support requires a bit more plumbing before it will work. */
++ ASSERT(is_pv_vcpu(curr));
++
++ switch ( set_debugreg(curr, reg, val) )
++ {
++ case 0:
++ return X86EMUL_OKAY;
++
++ case -ENODEV:
++ x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
++ return X86EMUL_EXCEPTION;
++
++ default:
++ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
++ return X86EMUL_EXCEPTION;
++ }
++}
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
+index 0c8c80ad5a..9c2bb8157c 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.h
++++ b/xen/arch/x86/x86_emulate/x86_emulate.h
+@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
+ void x86_emulate_free_state(struct x86_emulate_state *state);
+ #endif
+
++int x86emul_read_dr(unsigned int reg, unsigned long *val,
++ struct x86_emulate_ctxt *ctxt);
++int x86emul_write_dr(unsigned int reg, unsigned long val,
++ struct x86_emulate_ctxt *ctxt);
++
+ #endif
+
+ static inline void x86_emul_hw_exception(
+diff --git a/xen/common/schedule.c b/xen/common/schedule.c
+index b7884263f2..f21c3e5a64 100644
+--- a/xen/common/schedule.c
++++ b/xen/common/schedule.c
+@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d)
+ cpupool_rm_domain(d);
+ }
+
+-void vcpu_sleep_nosync(struct vcpu *v)
++void vcpu_sleep_nosync_locked(struct vcpu *v)
+ {
+- unsigned long flags;
+- spinlock_t *lock;
+-
+- TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+-
+- lock = vcpu_schedule_lock_irqsave(v, &flags);
++ ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
+
+ if ( likely(!vcpu_runnable(v)) )
+ {
+@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v)
+
+ SCHED_OP(vcpu_scheduler(v), sleep, v);
+ }
++}
++
++void vcpu_sleep_nosync(struct vcpu *v)
++{
++ unsigned long flags;
++ spinlock_t *lock;
++
++ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
++
++ lock = vcpu_schedule_lock_irqsave(v, &flags);
++
++ vcpu_sleep_nosync_locked(v);
+
+ vcpu_schedule_unlock_irqrestore(lock, flags, v);
+ }
+@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
+ sched_move_irqs(v);
+ }
+
+-static void vcpu_migrate(struct vcpu *v)
++/*
++ * Initiating migration
++ *
++ * In order to migrate, we need the vcpu in question to have stopped
++ * running and had SCHED_OP(sleep) called (to take it off any
++ * runqueues, for instance); and if it is currently running, it needs
++ * to be scheduled out. Finally, we need to hold the scheduling locks
++ * for both the processor we're migrating from, and the processor
++ * we're migrating to.
++ *
++ * In order to avoid deadlock while satisfying the final requirement,
++ * we must release any scheduling lock we hold, then try to grab both
++ * locks we want, then double-check to make sure that what we started
++ * to do hasn't been changed in the mean time.
++ *
++ * These steps are encapsulated in the following two functions; they
++ * should be called like this:
++ *
++ * lock = vcpu_schedule_lock_irq(v);
++ * vcpu_migrate_start(v);
++ * vcpu_schedule_unlock_irq(lock, v)
++ * vcpu_migrate_finish(v);
++ *
++ * vcpu_migrate_finish() will do the work now if it can, or simply
++ * return if it can't (because v is still running); in that case
++ * vcpu_migrate_finish() will be called by context_saved().
++ */
++void vcpu_migrate_start(struct vcpu *v)
++{
++ set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_sleep_nosync_locked(v);
++}
++
++static void vcpu_migrate_finish(struct vcpu *v)
+ {
+ unsigned long flags;
+ unsigned int old_cpu, new_cpu;
+ spinlock_t *old_lock, *new_lock;
+ bool_t pick_called = 0;
+
++ /*
++ * If the vcpu is currently running, this will be handled by
++ * context_saved(); and in any case, if the bit is cleared, then
++ * someone else has already done the work so we don't need to.
++ */
++ if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) )
++ return;
++
+ old_cpu = new_cpu = v->processor;
+ for ( ; ; )
+ {
+@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v)
+ spinlock_t *lock = vcpu_schedule_lock_irq(v);
+
+ if ( v->is_running )
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
++
+ vcpu_schedule_unlock_irq(lock, v);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+ }
+
+ void restore_vcpu_affinity(struct domain *d)
+@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu)
+ * * the scheduler will always fine a suitable solution, or
+ * things would have failed before getting in here.
+ */
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
+ vcpu_schedule_unlock_irqrestore(lock, flags, v);
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
++
++ vcpu_migrate_finish(v);
+
+ /*
+ * The only caveat, in this case, is that if a vcpu active in
+@@ -849,18 +894,14 @@ static int vcpu_set_affinity(
+ * Always ask the scheduler to re-evaluate placement
+ * when changing the affinity.
+ */
+- set_bit(_VPF_migrating, &v->pause_flags);
++ vcpu_migrate_start(v);
+ }
+
+ vcpu_schedule_unlock_irq(lock, v);
+
+ domain_update_node_affinity(v->domain);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+
+ return ret;
+ }
+@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
+ {
+ cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
+ v->affinity_broken = 0;
+- set_bit(_VPF_migrating, &v->pause_flags);
+ ret = 0;
+ }
+ }
+@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
+ cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
+ v->affinity_broken = 1;
+ cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
+- set_bit(_VPF_migrating, &v->pause_flags);
+ ret = 0;
+ }
+ }
+
++ if ( ret == 0 )
++ vcpu_migrate_start(v);
++
+ vcpu_schedule_unlock_irq(lock, v);
+
+ domain_update_node_affinity(v->domain);
+
+- if ( v->pause_flags & VPF_migrating )
+- {
+- vcpu_sleep_nosync(v);
+- vcpu_migrate(v);
+- }
++ vcpu_migrate_finish(v);
+
+ return ret;
+ }
+@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev)
+
+ SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
+
+- if ( unlikely(prev->pause_flags & VPF_migrating) )
+- vcpu_migrate(prev);
++ vcpu_migrate_finish(prev);
+ }
+
+ /* The scheduler timer: force a run through the scheduler */
+diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
+index c57914efc6..b3b10eaf40 100644
+--- a/xen/include/asm-x86/debugreg.h
++++ b/xen/include/asm-x86/debugreg.h
+@@ -24,6 +24,8 @@
+ #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
+ #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */
+
++#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */
++
+ /* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
+index f756cb5a0d..1a52ec6045 100644
+--- a/xen/include/asm-x86/hvm/irq.h
++++ b/xen/include/asm-x86/hvm/irq.h
+@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
+
+ int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data);
+
++/* Assert an IO APIC pin. */
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level);
++
+ void hvm_maybe_deassert_evtchn_irq(void);
+ void hvm_assert_evtchn_irq(struct vcpu *v);
+ void hvm_set_callback_via(struct domain *d, uint64_t via);
+diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
+index 21166edd06..0eb5ff632e 100644
+--- a/xen/include/asm-x86/hvm/vpt.h
++++ b/xen/include/asm-x86/hvm/vpt.h
+@@ -44,6 +44,7 @@ struct periodic_time {
+ bool_t warned_timeout_too_short;
+ #define PTSRC_isa 1 /* ISA time source */
+ #define PTSRC_lapic 2 /* LAPIC time source */
++#define PTSRC_ioapic 3 /* IOAPIC time source */
+ u8 source; /* PTSRC_ */
+ u8 irq;
+ struct vcpu *vcpu; /* vcpu timer interrupt delivers to */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index a8ceecf3e2..68fae91567 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -31,6 +31,9 @@
+ #define EFER_LMSLE (1<<_EFER_LMSLE)
+ #define EFER_FFXSE (1<<_EFER_FFXSE)
+
++#define EFER_KNOWN_MASK (EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \
++ EFER_SVME | EFER_LMSLE | EFER_FFXSE)
++
+ /* Speculation Controls. */
+ #define MSR_SPEC_CTRL 0x00000048
+ #define SPEC_CTRL_IBRS (_AC(1, ULL) << 0)
+@@ -42,6 +45,7 @@
+ #define MSR_ARCH_CAPABILITIES 0x0000010a
+ #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0)
+ #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1)
++#define ARCH_CAPS_RSBA (_AC(1, ULL) << 2)
+
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0 0x000000c1
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 80f8411355..a152f1d413 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -445,7 +445,8 @@ struct __packed __cacheline_aligned tss_struct {
+ #define IST_DF 1UL
+ #define IST_NMI 2UL
+ #define IST_MCE 3UL
+-#define IST_MAX 3UL
++#define IST_DB 4UL
++#define IST_MAX 4UL
+
+ /* Set the interrupt stack table used by a particular interrupt
+ * descriptor table entry. */