# system/xen/patches/xen-4.10.2-pre.patch: fixes from the Xen 4.10 stable
# branch, queued ahead of the 4.10.2 release.
diff --git a/tools/libacpi/Makefile b/tools/libacpi/Makefile
index a47a658a25..c17f3924cc 100644
--- a/tools/libacpi/Makefile
+++ b/tools/libacpi/Makefile
@@ -43,7 +43,7 @@ all: $(C_SRC) $(H_SRC)
 
 $(H_SRC): $(ACPI_BUILD_DIR)/%.h: %.asl iasl
 	iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $<
-	sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex >$@
+	sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex >$@
 	rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
  
 $(MK_DSDT): mk_dsdt.c
@@ -76,7 +76,7 @@ $(ACPI_BUILD_DIR)/dsdt_anycpu_arm.asl: $(MK_DSDT)
 
 $(C_SRC): $(ACPI_BUILD_DIR)/%.c: iasl $(ACPI_BUILD_DIR)/%.asl
 	iasl -vs -p $(ACPI_BUILD_DIR)/$*.$(TMP_SUFFIX) -tc $(ACPI_BUILD_DIR)/$*.asl
-	sed -e 's/AmlCode/$*/g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
+	sed -e 's/AmlCode/$*/g' -e 's/_aml_code//g' $(ACPI_BUILD_DIR)/$*.hex > $@.$(TMP_SUFFIX)
 	echo "int $*_len=sizeof($*);" >> $@.$(TMP_SUFFIX)
 	mv -f $@.$(TMP_SUFFIX) $@
 	rm -f $(addprefix $(ACPI_BUILD_DIR)/, $*.aml $*.hex)
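# Note on the two sed hunks above: newer iasl releases name the generated
# AML array "<table>_aml_code" where older ones emitted a bare "AmlCode".
# Stripping the suffix as well as renaming AmlCode keeps the generated C
# stable across iasl versions, so (illustratively) both end up producing:
#
#     unsigned char dsdt_anycpu[] = { 0x44, 0x53, 0x44, 0x54, /* ... */ };
#     int dsdt_anycpu_len = sizeof(dsdt_anycpu);
#
# which is what the rest of libacpi expects to link against.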
#diff --git a/xen/Makefile b/xen/Makefile
#index ecec297b9b..580af86931 100644
#--- a/xen/Makefile
#+++ b/xen/Makefile
#@@ -2,7 +2,7 @@
# # All other places this is stored (eg. compile.h) should be autogenerated.
# export XEN_VERSION       = 4
# export XEN_SUBVERSION    = 10
#-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION)
#+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION)
# export XEN_FULLVERSION   = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
# -include xen-version
# 
diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
index 1e4e5680a7..f7085d3c7b 100644
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -28,6 +28,7 @@
 #include <asm/tboot.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
+#include <asm/spec_ctrl.h>
 #include <acpi/cpufreq/cpufreq.h>
 
 uint32_t system_reset_counter = 1;
@@ -163,6 +164,7 @@ static int enter_state(u32 state)
 {
     unsigned long flags;
     int error;
+    struct cpu_info *ci;
     unsigned long cr4;
 
     if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) )
@@ -203,12 +205,18 @@ static int enter_state(u32 state)
         printk(XENLOG_ERR "Some devices failed to power down.");
         system_state = SYS_STATE_resume;
         device_power_up(error);
+        console_end_sync();
         error = -EIO;
         goto done;
     }
     else
         error = 0;
 
+    ci = get_cpu_info();
+    spec_ctrl_enter_idle(ci);
+    /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */
+    ci->bti_ist_info = 0;
+
     ACPI_FLUSH_CPU_CACHE();
 
     switch ( state )
@@ -243,17 +251,23 @@ static int enter_state(u32 state)
     if ( (state == ACPI_STATE_S3) && error )
         tboot_s3_error(error);
 
+    console_end_sync();
+
+    microcode_resume_cpu(0);
+
+    /* Re-enable default NMI/#MC use of MSR_SPEC_CTRL. */
+    ci->bti_ist_info = default_bti_ist_info;
+    spec_ctrl_exit_idle(ci);
+
  done:
     spin_debug_enable();
     local_irq_restore(flags);
-    console_end_sync();
     acpi_sleep_post(state);
     if ( hvm_cpu_up() )
         BUG();
+    cpufreq_add_cpu(0);
 
  enable_cpu:
-    cpufreq_add_cpu(0);
-    microcode_resume_cpu(0);
     rcu_barrier();
     mtrr_aps_sync_begin();
     enable_nonboot_cpus();
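# The S3 hunks above bracket suspend with the speculation-control idle
# helpers: after wakeup the processor may briefly run with downlevel
# microcode, so nothing (including the NMI/#MC IST paths gated by
# bti_ist_info) may touch MSR_SPEC_CTRL until microcode_resume_cpu() has
# run.  A sketch of the intended ordering, using the names from the hunk:
#
#     spec_ctrl_enter_idle(ci);                /* stop using MSR_SPEC_CTRL */
#     ci->bti_ist_info = 0;                    /* ...even from NMI/#MC     */
#     /* ... ACPI S3 suspend / wakeup ... */
#     microcode_resume_cpu(0);                 /* MSR guaranteed back      */
#     ci->bti_ist_info = default_bti_ist_info;
#     spec_ctrl_exit_idle(ci);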
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index fdb2bf1779..136adadb63 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -747,6 +747,7 @@ void load_system_tables(void)
 			[IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE,
 			[IST_DF  - 1] = stack_top + IST_DF  * PAGE_SIZE,
 			[IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE,
+			[IST_DB  - 1] = stack_top + IST_DB  * PAGE_SIZE,
 
 			[IST_MAX ... ARRAY_SIZE(tss->ist) - 1] =
 				0x8600111111111111ul,
@@ -774,6 +775,7 @@ void load_system_tables(void)
 	set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
 	set_ist(&idt_tables[cpu][TRAP_nmi],	      IST_NMI);
 	set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
+	set_ist(&idt_tables[cpu][TRAP_debug],         IST_DB);
 
 	/*
 	 * Bottom-of-stack must be 16-byte aligned!
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index 8229c635e4..f18cbbd55a 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -509,6 +509,8 @@ static void hpet_attach_channel(unsigned int cpu,
 static void hpet_detach_channel(unsigned int cpu,
                                 struct hpet_event_channel *ch)
 {
+    unsigned int next;
+
     spin_lock_irq(&ch->lock);
 
     ASSERT(ch == per_cpu(cpu_bc_channel, cpu));
@@ -517,7 +519,7 @@ static void hpet_detach_channel(unsigned int cpu,
 
     if ( cpu != ch->cpu )
         spin_unlock_irq(&ch->lock);
-    else if ( cpumask_empty(ch->cpumask) )
+    else if ( (next = cpumask_first(ch->cpumask)) >= nr_cpu_ids )
     {
         ch->cpu = -1;
         clear_bit(HPET_EVT_USED_BIT, &ch->flags);
@@ -525,7 +527,7 @@ static void hpet_detach_channel(unsigned int cpu,
     }
     else
     {
-        ch->cpu = cpumask_first(ch->cpumask);
+        ch->cpu = next;
         set_channel_irq_affinity(ch);
         local_irq_enable();
     }
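# The hpet change replaces a cpumask_empty()/cpumask_first() pair with a
# single cpumask_first() read: two separate traversals of ch->cpumask can
# disagree if bits are cleared in between, whereas one read gives one
# consistent answer to both questions:
#
#     next = cpumask_first(ch->cpumask);   /* single read of the mask      */
#     if ( next >= nr_cpu_ids )            /* mask empty: free the channel */
#         ...
#     else
#         ch->cpu = next;                  /* reuse the value, not re-read */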
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index b282089e03..131480fdd9 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2113,22 +2113,20 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
 
     vio->mmio_retry = 0;
 
-    switch ( rc = x86_emulate(&hvmemul_ctxt->ctxt, ops) )
+    rc = x86_emulate(&hvmemul_ctxt->ctxt, ops);
+    if ( rc == X86EMUL_OKAY && vio->mmio_retry )
+        rc = X86EMUL_RETRY;
+
+    if ( !hvm_vcpu_io_need_completion(vio) )
     {
-    case X86EMUL_OKAY:
-        if ( vio->mmio_retry )
-            rc = X86EMUL_RETRY;
-        /* fall through */
-    default:
         vio->mmio_cache_count = 0;
         vio->mmio_insn_bytes = 0;
-        break;
-
-    case X86EMUL_RETRY:
+    }
+    else
+    {
         BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf));
         vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes;
         memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes);
-        break;
     }
 
     if ( hvmemul_ctxt->ctxt.retire.singlestep )
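# In the emulate.c hunk the condition for keeping the cached instruction
# bytes changes from "rc == X86EMUL_RETRY" to hvm_vcpu_io_need_completion():
# an I/O still awaiting the device model must re-execute the very same
# instruction on re-entry, so the cached bytes (and the mmio cache) have to
# survive exactly as long as a completion is outstanding, and no longer.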
diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c
index f7aed7f69e..28377091ca 100644
--- a/xen/arch/x86/hvm/hpet.c
+++ b/xen/arch/x86/hvm/hpet.c
@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, unsigned int tn,
         diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
             ? (uint32_t)diff : 0;
 
+    destroy_periodic_time(&h->pt[tn]);
     if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
+    {
         /* if LegacyReplacementRoute bit is set, HPET specification requires
            timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
            timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
         irq = (tn == 0) ? 0 : 8;
+        h->pt[tn].source = PTSRC_isa;
+    }
     else
+    {
         irq = timer_int_route(h, tn);
+        h->pt[tn].source = PTSRC_ioapic;
+    }
 
     /*
      * diff is the time from now when the timer should fire, for a periodic
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index d5afe20cc8..25b2445429 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
 
 static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
 {
+    unsigned int prev_state = STATE_IOREQ_NONE;
+
     while ( sv->pending )
     {
         unsigned int state = p->state;
 
-        rmb();
-        switch ( state )
+        smp_rmb();
+
+    recheck:
+        if ( unlikely(state == STATE_IOREQ_NONE) )
         {
-        case STATE_IOREQ_NONE:
             /*
              * The only reason we should see this case is when an
              * emulator is dying and it races with an I/O being
@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
              */
             hvm_io_assist(sv, ~0ul);
             break;
+        }
+
+        if ( unlikely(state < prev_state) )
+        {
+            gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
+                     prev_state, state);
+            sv->pending = false;
+            domain_crash(sv->vcpu->domain);
+            return false; /* bail */
+        }
+
+        switch ( prev_state = state )
+        {
         case STATE_IORESP_READY: /* IORESP_READY -> NONE */
             p->state = STATE_IOREQ_NONE;
             hvm_io_assist(sv, p->data);
             break;
         case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
         case STATE_IOREQ_INPROCESS:
-            wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state);
-            break;
+            wait_on_xen_event_channel(sv->ioreq_evtchn,
+                                      ({ state = p->state;
+                                         smp_rmb();
+                                         state != prev_state; }));
+            goto recheck;
         default:
             gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
             sv->pending = false;
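# XSA-262 hardening: p->state lives in a page shared with the device model,
# and a misbehaving emulator could previously drive this loop indefinitely
# by bouncing the state around.  The legal lifecycle only moves forward,
#
#     STATE_IOREQ_NONE (0) -> STATE_IOREQ_READY (1) ->
#     STATE_IOREQ_INPROCESS (2) -> STATE_IORESP_READY (3) -> NONE
#
# so the loop now latches prev_state and crashes the guest on any
# numerically backwards step; the smp_rmb()s (including the one inside the
# wait_on_xen_event_channel() condition) keep each state read ordered
# against the request data it guards.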
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index f528e2d081..c85d004402 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, unsigned ioapic_gsi)
     vioapic_irq_positive_edge(d, ioapic_gsi);
 }
 
+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level)
+{
+    struct hvm_irq *hvm_irq = hvm_domain_irq(d);
+    int vector;
+
+    if ( gsi >= hvm_irq->nr_gsis )
+    {
+        ASSERT_UNREACHABLE();
+        return -1;
+    }
+
+    spin_lock(&d->arch.hvm_domain.irq_lock);
+    if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 )
+        assert_gsi(d, gsi);
+    vector = vioapic_get_vector(d, gsi);
+    spin_unlock(&d->arch.hvm_domain.irq_lock);
+
+    return vector;
+}
+
 static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq)
 {
     assert_gsi(d, ioapic_gsi);
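# hvm_ioapic_assert() is new plumbing for the vpt changes below (XSA-261):
# it raises a GSI directly at the vIO-APIC, bypassing the ISA/PCI link
# routing of hvm_isa_irq_assert(), and returns the resulting vector.
# Illustrative use, matching the pt_update_irq() hunk later in this patch:
#
#     /* Edge-triggered assertion of an IO-APIC routed timer interrupt. */
#     pt_vector = hvm_ioapic_assert(v->domain, irq, false);
#     if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
#         pt_vector = -1;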
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index dedec5752d..3b72b4dc2a 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct vcpu *v)
     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
+    set_ist(&idt_tables[cpu][TRAP_debug],         IST_DB);
 }
 
 static void svm_ctxt_switch_to(struct vcpu *v)
@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vcpu *v)
     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+    set_ist(&idt_tables[cpu][TRAP_debug],         IST_NONE);
 
     svm_restore_dr(v);
 
@@ -1836,6 +1838,25 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
+    switch ( msr )
+    {
+        /*
+         * Sync not needed while the cross-vendor logic is in unilateral effect.
+    case MSR_IA32_SYSENTER_CS:
+    case MSR_IA32_SYSENTER_ESP:
+    case MSR_IA32_SYSENTER_EIP:
+         */
+    case MSR_STAR:
+    case MSR_LSTAR:
+    case MSR_CSTAR:
+    case MSR_SYSCALL_MASK:
+    case MSR_FS_BASE:
+    case MSR_GS_BASE:
+    case MSR_SHADOW_GS_BASE:
+        svm_sync_vmcb(v);
+        break;
+    }
+
     switch ( msr )
     {
     case MSR_IA32_SYSENTER_CS:
@@ -1848,6 +1869,34 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
         *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
         break;
 
+    case MSR_STAR:
+        *msr_content = vmcb->star;
+        break;
+
+    case MSR_LSTAR:
+        *msr_content = vmcb->lstar;
+        break;
+
+    case MSR_CSTAR:
+        *msr_content = vmcb->cstar;
+        break;
+
+    case MSR_SYSCALL_MASK:
+        *msr_content = vmcb->sfmask;
+        break;
+
+    case MSR_FS_BASE:
+        *msr_content = vmcb->fs.base;
+        break;
+
+    case MSR_GS_BASE:
+        *msr_content = vmcb->gs.base;
+        break;
+
+    case MSR_SHADOW_GS_BASE:
+        *msr_content = vmcb->kerngsbase;
+        break;
+
     case MSR_IA32_MCx_MISC(4): /* Threshold register */
     case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
         /*
@@ -1976,32 +2025,81 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
     int ret, result = X86EMUL_OKAY;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    int sync = 0;
+    bool sync = false;
 
     switch ( msr )
     {
     case MSR_IA32_SYSENTER_CS:
     case MSR_IA32_SYSENTER_ESP:
     case MSR_IA32_SYSENTER_EIP:
-        sync = 1;
-        break;
-    default:
+    case MSR_STAR:
+    case MSR_LSTAR:
+    case MSR_CSTAR:
+    case MSR_SYSCALL_MASK:
+    case MSR_FS_BASE:
+    case MSR_GS_BASE:
+    case MSR_SHADOW_GS_BASE:
+        sync = true;
         break;
     }
 
     if ( sync )
-        svm_sync_vmcb(v);    
+        svm_sync_vmcb(v);
 
     switch ( msr )
     {
+    case MSR_IA32_SYSENTER_ESP:
+    case MSR_IA32_SYSENTER_EIP:
+    case MSR_LSTAR:
+    case MSR_CSTAR:
+    case MSR_FS_BASE:
+    case MSR_GS_BASE:
+    case MSR_SHADOW_GS_BASE:
+        if ( !is_canonical_address(msr_content) )
+            goto gpf;
+
+        switch ( msr )
+        {
+        case MSR_IA32_SYSENTER_ESP:
+            vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
+            break;
+
+        case MSR_IA32_SYSENTER_EIP:
+            vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
+            break;
+
+        case MSR_LSTAR:
+            vmcb->lstar = msr_content;
+            break;
+
+        case MSR_CSTAR:
+            vmcb->cstar = msr_content;
+            break;
+
+        case MSR_FS_BASE:
+            vmcb->fs.base = msr_content;
+            break;
+
+        case MSR_GS_BASE:
+            vmcb->gs.base = msr_content;
+            break;
+
+        case MSR_SHADOW_GS_BASE:
+            vmcb->kerngsbase = msr_content;
+            break;
+        }
+        break;
+
     case MSR_IA32_SYSENTER_CS:
         vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
         break;
-    case MSR_IA32_SYSENTER_ESP:
-        vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
+
+    case MSR_STAR:
+        vmcb->star = msr_content;
         break;
-    case MSR_IA32_SYSENTER_EIP:
-        vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
+
+    case MSR_SYSCALL_MASK:
+        vmcb->sfmask = msr_content;
         break;
 
     case MSR_IA32_DEBUGCTLMSR:
diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c
index 89ef2db932..b5b946aa94 100644
--- a/xen/arch/x86/hvm/svm/svmdebug.c
+++ b/xen/arch/x86/hvm/svm/svmdebug.c
@@ -131,9 +131,8 @@ bool svm_vmcb_isvalid(const char *from, const struct vmcb_struct *vmcb,
         PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n",
                vmcb_get_dr7(vmcb));
 
-    if ( efer & ~(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | EFER_SVME |
-                  EFER_LMSLE | EFER_FFXSE) )
-        PRINTF("EFER: undefined bits are not zero (%#"PRIx64")\n", efer);
+    if ( efer & ~EFER_KNOWN_MASK )
+        PRINTF("EFER: unknown bits are not zero (%#"PRIx64")\n", efer);
 
     if ( hvm_efer_valid(v, efer, -1) )
         PRINTF("EFER: %s (%"PRIx64")\n", hvm_efer_valid(v, efer, -1), efer);
diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c
index f0fa59d7d5..b02a70d086 100644
--- a/xen/arch/x86/hvm/viridian.c
+++ b/xen/arch/x86/hvm/viridian.c
@@ -245,7 +245,7 @@ void cpuid_viridian_leaves(const struct vcpu *v, uint32_t leaf,
         };
         union {
             HV_PARTITION_PRIVILEGE_MASK mask;
-            uint32_t lo, hi;
+            struct { uint32_t lo, hi; };
         } u;
 
         if ( !(viridian_feature_mask(d) & HVMPV_no_freq) )
@@ -966,12 +966,10 @@ int viridian_hypercall(struct cpu_user_regs *regs)
         gprintk(XENLOG_WARNING, "unimplemented hypercall %04x\n",
                 input.call_code);
         /* Fallthrough. */
-    case HvGetPartitionId:
     case HvExtCallQueryCapabilities:
         /*
-         * These hypercalls seem to be erroneously issued by Windows
-         * despite neither AccessPartitionId nor EnableExtendedHypercalls
-         * being set in CPUID leaf 2.
+         * This hypercall seems to be erroneously issued by Windows
+         * despite EnableExtendedHypercalls not being set in CPUID leaf 2.
          * Given that return a status of 'invalid code' has not so far
          * caused any problems it's not worth logging.
          */
diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index 181f4cb631..04e3c2e15b 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src)
 static int pt_irq_masked(struct periodic_time *pt)
 {
     struct vcpu *v = pt->vcpu;
-    unsigned int gsi, isa_irq;
-    int mask;
-    uint8_t pic_imr;
+    unsigned int gsi = pt->irq;
 
-    if ( pt->source == PTSRC_lapic )
+    switch ( pt->source )
+    {
+    case PTSRC_lapic:
     {
         struct vlapic *vlapic = vcpu_vlapic(v);
+
         return (!vlapic_enabled(vlapic) ||
                 (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
     }
 
-    isa_irq = pt->irq;
-    gsi = hvm_isa_irq_to_gsi(isa_irq);
-    pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
-    mask = vioapic_get_mask(v->domain, gsi);
-    if ( mask < 0 )
+    case PTSRC_isa:
     {
-        dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n",
-                v->domain->domain_id, gsi);
-        domain_crash(v->domain);
-        return -1;
+        uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr;
+
+        /* Check if the interrupt is unmasked in the PIC. */
+        if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) )
+            return 0;
+
+        gsi = hvm_isa_irq_to_gsi(pt->irq);
+    }
+
+    /* Fallthrough to check if the interrupt is masked on the IO APIC. */
+    case PTSRC_ioapic:
+    {
+        int mask = vioapic_get_mask(v->domain, gsi);
+
+        if ( mask < 0 )
+        {
+            dprintk(XENLOG_WARNING,
+                    "d%d: invalid GSI (%u) for platform timer\n",
+                    v->domain->domain_id, gsi);
+            domain_crash(v->domain);
+            return -1;
+        }
+
+        return mask;
+    }
     }
 
-    return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
-            mask);
+    ASSERT_UNREACHABLE();
+    return 1;
 }
 
 static void pt_lock(struct periodic_time *pt)
@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v)
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt, *temp, *earliest_pt;
     uint64_t max_lag;
-    int irq, is_lapic, pt_vector;
+    int irq, pt_vector = -1;
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v)
 
     earliest_pt->irq_issued = 1;
     irq = earliest_pt->irq;
-    is_lapic = (earliest_pt->source == PTSRC_lapic);
 
     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 
-    /*
-     * If periodic timer interrut is handled by lapic, its vector in
-     * IRR is returned and used to set eoi_exit_bitmap for virtual
-     * interrupt delivery case. Otherwise return -1 to do nothing.
-     */
-    if ( is_lapic )
+    switch ( earliest_pt->source )
     {
+    case PTSRC_lapic:
+        /*
+         * If periodic timer interrupt is handled by lapic, its vector in
+         * IRR is returned and used to set eoi_exit_bitmap for virtual
+         * interrupt delivery case. Otherwise return -1 to do nothing.
+         */
         vlapic_set_irq(vcpu_vlapic(v), irq, 0);
         pt_vector = irq;
-    }
-    else
-    {
+        break;
+
+    case PTSRC_isa:
         hvm_isa_irq_deassert(v->domain, irq);
         if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
              v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
-        {
             hvm_isa_irq_assert(v->domain, irq, NULL);
-            pt_vector = -1;
-        }
         else
         {
             pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v)
             if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
                 pt_vector = -1;
         }
+        break;
+
+    case PTSRC_ioapic:
+        /*
+         * NB: At the moment IO-APIC routed interrupts generated by vpt devices
+         * (HPET) are edge-triggered.
+         */
+        pt_vector = hvm_ioapic_assert(v->domain, irq, false);
+        if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
+            pt_vector = -1;
+        break;
     }
 
     return pt_vector;
@@ -418,7 +444,14 @@ void create_periodic_time(
     struct vcpu *v, struct periodic_time *pt, uint64_t delta,
     uint64_t period, uint8_t irq, time_cb *cb, void *data)
 {
-    ASSERT(pt->source != 0);
+    if ( !pt->source ||
+         (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) ||
+         (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis &&
+          pt->source == PTSRC_ioapic) )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
 
     destroy_periodic_time(pt);
 
@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
 {
     int on_list;
 
-    ASSERT(pt->source == PTSRC_isa);
+    ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
 
     if ( pt->vcpu == NULL )
         return;
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 642ca312bf..c281936af0 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -813,26 +813,6 @@ static int write_cr(unsigned int reg, unsigned long val,
     return X86EMUL_UNHANDLEABLE;
 }
 
-static int read_dr(unsigned int reg, unsigned long *val,
-                   struct x86_emulate_ctxt *ctxt)
-{
-    unsigned long res = do_get_debugreg(reg);
-
-    if ( IS_ERR_VALUE(res) )
-        return X86EMUL_UNHANDLEABLE;
-
-    *val = res;
-
-    return X86EMUL_OKAY;
-}
-
-static int write_dr(unsigned int reg, unsigned long val,
-                    struct x86_emulate_ctxt *ctxt)
-{
-    return do_set_debugreg(reg, val) == 0
-           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
-}
-
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -906,9 +886,16 @@ static int read_msr(unsigned int reg, uint64_t *val,
         return X86EMUL_OKAY;
 
     case MSR_EFER:
-        *val = read_efer();
+        /* Hide unknown bits, and unconditionally hide SVME from guests. */
+        *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
+        /*
+         * Hide the 64-bit features from 32-bit guests.  SCE has
+         * vendor-dependent behaviour.
+         */
         if ( is_pv_32bit_domain(currd) )
-            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE |
+                      (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
+                       ? EFER_SCE : 0));
         return X86EMUL_OKAY;
 
     case MSR_K7_FID_VID_CTL:
@@ -1326,8 +1313,8 @@ static const struct x86_emulate_ops priv_op_ops = {
     .read_segment        = read_segment,
     .read_cr             = read_cr,
     .write_cr            = write_cr,
-    .read_dr             = read_dr,
-    .write_dr            = write_dr,
+    .read_dr             = x86emul_read_dr,
+    .write_dr            = x86emul_write_dr,
     .read_msr            = read_msr,
     .write_msr           = write_msr,
     .cpuid               = pv_emul_cpuid,
diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
index 5862130697..1619be7874 100644
--- a/xen/arch/x86/pv/misc-hypercalls.c
+++ b/xen/arch/x86/pv/misc-hypercalls.c
@@ -30,22 +30,10 @@ long do_set_debugreg(int reg, unsigned long value)
 
 unsigned long do_get_debugreg(int reg)
 {
-    struct vcpu *curr = current;
+    unsigned long val;
+    int res = x86emul_read_dr(reg, &val, NULL);
 
-    switch ( reg )
-    {
-    case 0 ... 3:
-    case 6:
-        return curr->arch.debugreg[reg];
-    case 7:
-        return (curr->arch.debugreg[7] |
-                curr->arch.debugreg[5]);
-    case 4 ... 5:
-        return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
-                curr->arch.debugreg[reg + 2] : 0);
-    }
-
-    return -EINVAL;
+    return res == X86EMUL_OKAY ? val : -ENODEV;
 }
 
 long do_fpu_taskswitch(int set)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index e1d023428c..f81fc2ca65 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -968,6 +968,7 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+    set_ist(&idt_tables[cpu][TRAP_debug],         IST_NONE);
 
     for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
           i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 3c7447bfe6..fa67a0ffbd 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -97,12 +97,13 @@ static void __init print_details(enum ind_thunk thunk)
     printk(XENLOG_DEBUG "Speculative mitigation facilities:\n");
 
     /* Hardware features which pertain to speculative mitigations. */
-    printk(XENLOG_DEBUG "  Hardware features:%s%s%s%s%s\n",
+    printk(XENLOG_DEBUG "  Hardware features:%s%s%s%s%s%s\n",
            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP"     : "",
            (e8b  & cpufeat_mask(X86_FEATURE_IBPB))  ? " IBPB"      : "",
            (caps & ARCH_CAPABILITIES_IBRS_ALL)      ? " IBRS_ALL"  : "",
-           (caps & ARCH_CAPABILITIES_RDCL_NO)       ? " RDCL_NO"   : "");
+           (caps & ARCH_CAPABILITIES_RDCL_NO)       ? " RDCL_NO"   : "",
+           (caps & ARCH_CAPS_RSBA)                  ? " RSBA"      : "");
 
     /* Compiled-in support which pertains to BTI mitigations. */
     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
@@ -135,6 +136,20 @@ static bool __init retpoline_safe(void)
          boot_cpu_data.x86 != 6 )
         return false;
 
+    if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+    {
+        uint64_t caps;
+
+        rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+        /*
+         * RSBA may be set by a hypervisor to indicate that we may move to a
+         * processor which isn't retpoline-safe.
+         */
+        if ( caps & ARCH_CAPS_RSBA )
+            return false;
+    }
+
     switch ( boot_cpu_data.x86_model )
     {
     case 0x17: /* Penryn */
@@ -161,18 +176,40 @@ static bool __init retpoline_safe(void)
          * versions.
          */
     case 0x3d: /* Broadwell */
-        return ucode_rev >= 0x28;
+        return ucode_rev >= 0x2a;
     case 0x47: /* Broadwell H */
-        return ucode_rev >= 0x1b;
+        return ucode_rev >= 0x1d;
     case 0x4f: /* Broadwell EP/EX */
-        return ucode_rev >= 0xb000025;
+        return ucode_rev >= 0xb000021;
     case 0x56: /* Broadwell D */
-        return false; /* TBD. */
+        switch ( boot_cpu_data.x86_mask )
+        {
+        case 2:  return ucode_rev >= 0x15;
+        case 3:  return ucode_rev >= 0x7000012;
+        case 4:  return ucode_rev >= 0xf000011;
+        case 5:  return ucode_rev >= 0xe000009;
+        default:
+            printk("Unrecognised CPU stepping %#x - assuming not reptpoline safe\n",
+                   boot_cpu_data.x86_mask);
+            return false;
+        }
+        break;
 
         /*
-         * Skylake and later processors are not retpoline-safe.
+         * Skylake, Kabylake and Cannonlake processors are not retpoline-safe.
          */
+    case 0x4e:
+    case 0x55:
+    case 0x5e:
+    case 0x66:
+    case 0x67:
+    case 0x8e:
+    case 0x9e:
+        return false;
+
     default:
+        printk("Unrecognised CPU model %#x - assuming not reptpoline safe\n",
+               boot_cpu_data.x86_model);
         return false;
     }
 }
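# Two refinements to retpoline_safe(): seeing ARCH_CAPS_RSBA (which, per
# the comment in the hunk, a hypervisor may set to warn that the VM can be
# migrated to a processor that is not retpoline-safe) now forces a negative
# verdict before any model checks, and the Broadwell microcode revision
# thresholds are brought up to date, with Broadwell D keyed on stepping.
# Unrecognised models and steppings now default to "not safe" rather than
# falling through silently.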
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 906124331b..e217b0d6e2 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
 /*
  * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
  *
- * Stack pages 0, 1 and 2:
+ * Stack pages 0 - 3:
  *   These are all 1-page IST stacks.  Each of these stacks have an exception
  *   frame and saved register state at the top.  The interesting bound for a
  *   trace is the word adjacent to this, while the bound for a dump is the
  *   very top, including the exception frame.
  *
- * Stack pages 3, 4 and 5:
+ * Stack pages 4 and 5:
  *   None of these are particularly interesting.  With MEMORY_GUARD, page 5 is
  *   explicitly not present, so attempting to dump or trace it is
  *   counterproductive.  Without MEMORY_GUARD, it is possible for a call chain
@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(unsigned long sp)
 {
     switch ( get_stack_page(sp) )
     {
-    case 0 ... 2:
+    case 0 ... 3:
         return ROUNDUP(sp, PAGE_SIZE) -
             offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
 
 #ifndef MEMORY_GUARD
-    case 3 ... 5:
+    case 4 ... 5:
 #endif
     case 6 ... 7:
         return ROUNDUP(sp, STACK_SIZE) -
@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsigned long sp)
 {
     switch ( get_stack_page(sp) )
     {
-    case 0 ... 2:
+    case 0 ... 3:
         return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
 
 #ifndef MEMORY_GUARD
-    case 3 ... 5:
+    case 4 ... 5:
 #endif
     case 6 ... 7:
         return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
@@ -1761,11 +1761,36 @@ static void ler_enable(void)
 
 void do_debug(struct cpu_user_regs *regs)
 {
+    unsigned long dr6;
     struct vcpu *v = current;
 
+    /* Stash dr6 as early as possible. */
+    dr6 = read_debugreg(6);
+
     if ( debugger_trap_entry(TRAP_debug, regs) )
         return;
 
+    /*
+     * At the time of writing (March 2018), on the subject of %dr6:
+     *
+     * The Intel manual says:
+     *   Certain debug exceptions may clear bits 0-3. The remaining contents
+     *   of the DR6 register are never cleared by the processor. To avoid
+     *   confusion in identifying debug exceptions, debug handlers should
+     *   clear the register (except bit 16, which they should set) before
+     *   returning to the interrupted task.
+     *
+     * The AMD manual says:
+     *   Bits 15:13 of the DR6 register are not cleared by the processor and
+     *   must be cleared by software after the contents have been read.
+     *
+     * Some bits are reserved set, some are reserved clear, and some bits
+     * which were previously reserved set are reused and cleared by hardware.
+     * For future compatibility, reset to the default value, which will allow
+     * us to spot any bit being changed by hardware to its non-default value.
+     */
+    write_debugreg(6, X86_DR6_DEFAULT);
+
     if ( !guest_mode(regs) )
     {
         if ( regs->eflags & X86_EFLAGS_TF )
@@ -1784,21 +1809,50 @@ void do_debug(struct cpu_user_regs *regs)
                 regs->eflags &= ~X86_EFLAGS_TF;
             }
         }
-        else
+
+        /*
+         * Check for fault conditions.  General Detect, and instruction
+         * breakpoints are faults rather than traps, at which point attempting
+         * to ignore and continue will result in a livelock.
+         */
+        if ( dr6 & DR_GENERAL_DETECT )
         {
-            /*
-             * We ignore watchpoints when they trigger within Xen. This may
-             * happen when a buffer is passed to us which previously had a
-             * watchpoint set on it. No need to bump EIP; the only faulting
-             * trap is an instruction breakpoint, which can't happen to us.
-             */
-            WARN_ON(!search_exception_table(regs));
+            printk(XENLOG_ERR "Hit General Detect in Xen context\n");
+            fatal_trap(regs, 0);
+        }
+
+        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
+        {
+            unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT;
+
+            for ( bp = 0; bp < 4; ++bp )
+            {
+                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
+                     ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ )
+                {
+                    printk(XENLOG_ERR
+                           "Hit instruction breakpoint in Xen context\n");
+                    fatal_trap(regs, 0);
+                }
+            }
         }
+
+        /*
+         * Whatever caused this #DB should be a trap.  Note it and continue.
+         * Guests can trigger this in certain corner cases, so ensure the
+         * message is ratelimited.
+         */
+        gprintk(XENLOG_WARNING,
+                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
+                regs->cs, _p(regs->rip), _p(regs->rip),
+                regs->ss, _p(regs->rsp), dr6);
+
         goto out;
     }
 
     /* Save debug status register where guest OS can peek at it */
-    v->arch.debugreg[6] = read_debugreg(6);
+    v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
+    v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
 
     ler_enable();
     pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
@@ -1917,6 +1971,7 @@ void __init init_idt_traps(void)
     set_ist(&idt_table[TRAP_double_fault],  IST_DF);
     set_ist(&idt_table[TRAP_nmi],           IST_NMI);
     set_ist(&idt_table[TRAP_machine_check], IST_MCE);
+    set_ist(&idt_table[TRAP_debug],         IST_DB);
 
     /* CPU0 uses the master IDT. */
     idt_tables[0] = idt_table;
@@ -1984,6 +2039,12 @@ void activate_debugregs(const struct vcpu *curr)
     }
 }
 
+/*
+ * Used by hypercalls and the emulator.
+ *  -ENODEV => #UD
+ *  -EINVAL => #GP Invalid bit
+ *  -EPERM  => #GP Valid bit, but not permitted to use
+ */
 long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
 {
     int i;
@@ -2015,7 +2076,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
         if ( v == curr )
             write_debugreg(3, value);
         break;
+
+    case 4:
+        if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
+            return -ENODEV;
+
+        /* Fallthrough */
     case 6:
+        /* The upper 32 bits are strictly reserved. */
+        if ( value != (uint32_t)value )
+            return -EINVAL;
+
         /*
          * DR6: Bits 4-11,16-31 reserved (set to 1).
          *      Bit 12 reserved (set to 0).
@@ -2025,7 +2096,17 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
         if ( v == curr )
             write_debugreg(6, value);
         break;
+
+    case 5:
+        if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE )
+            return -ENODEV;
+
+        /* Fallthrough */
     case 7:
+        /* The upper 32 bits are strictly reserved. */
+        if ( value != (uint32_t)value )
+            return -EINVAL;
+
         /*
          * DR7: Bit 10 reserved (set to 1).
          *      Bits 11-12,14-15 reserved (set to 0).
@@ -2038,6 +2119,10 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
          */
         if ( value & DR_GENERAL_DETECT )
             return -EPERM;
+
+        /* Zero the IO shadow before recalculating the real %dr7 */
+        v->arch.debugreg[5] = 0;
+
         /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
         if ( value & DR7_ACTIVE_MASK )
         {
@@ -2070,7 +2155,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
             write_debugreg(7, value);
         break;
     default:
-        return -EINVAL;
+        return -ENODEV;
     }
 
     v->arch.debugreg[reg] = value;
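# These traps.c changes are the XSA-260 (CVE-2018-8897) work: #DB gets its
# own IST stack, %dr6 is stashed first thing and reset to X86_DR6_DEFAULT,
# and fault-type conditions (General Detect, instruction breakpoints) in
# Xen context are now fatal rather than livelocking.  The guest-visible
# %dr6 merge keeps whichever bits hardware moved away from their default,
# in either direction:
#
#     v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);  /* default-0 bits set   */
#     v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);  /* default-1 bits clear */
#
# e.g. a hit on breakpoint 1 yields dr6 = 0xffff0ff2, ORing bit 1 into the
# guest view while leaving previously accumulated status bits intact.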
diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
index 75497bc292..a47cb9dc19 100644
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events)
         leaq  irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
         cmpl  $0,(%rcx,%rax,1)
         jne   compat_process_softirqs
+
+        /* Inject exception if pending. */
+        lea   VCPU_trap_bounce(%rbx), %rdx
+        testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
+        jnz   .Lcompat_process_trapbounce
+
         testb $1,VCPU_mce_pending(%rbx)
         jnz   compat_process_mce
 .Lcompat_test_guest_nmi:
@@ -68,15 +74,24 @@ compat_process_softirqs:
         call  do_softirq
         jmp   compat_test_all_events
 
+        ALIGN
+/* %rbx: struct vcpu, %rdx: struct trap_bounce */
+.Lcompat_process_trapbounce:
+        sti
+.Lcompat_bounce_exception:
+        call  compat_create_bounce_frame
+        movb  $0, TRAPBOUNCE_flags(%rdx)
+        jmp   compat_test_all_events
+
 	ALIGN
 /* %rbx: struct vcpu */
 compat_process_mce:
         testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx)
         jnz   .Lcompat_test_guest_nmi
         sti
-        movb $0,VCPU_mce_pending(%rbx)
-        call set_guest_machinecheck_trapbounce
-        testl %eax,%eax
+        movb  $0, VCPU_mce_pending(%rbx)
+        call  set_guest_machinecheck_trapbounce
+        test  %al, %al
         jz    compat_test_all_events
         movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
         movb %dl,VCPU_mce_old_mask(%rbx)            # iret hypercall
@@ -88,11 +103,11 @@ compat_process_mce:
 /* %rbx: struct vcpu */
 compat_process_nmi:
         testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx)
-        jnz  compat_test_guest_events
+        jnz   compat_test_guest_events
         sti
-        movb  $0,VCPU_nmi_pending(%rbx)
+        movb  $0, VCPU_nmi_pending(%rbx)
         call  set_guest_nmi_trapbounce
-        testl %eax,%eax
+        test  %al, %al
         jz    compat_test_all_events
         movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the
         movb %dl,VCPU_nmi_old_mask(%rbx)            # iret hypercall
@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore)
         xor   %eax, %eax
         ret
 
-/* %rdx: trap_bounce, %rbx: struct vcpu */
-ENTRY(compat_post_handle_exception)
-        testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
-        jz    compat_test_all_events
-.Lcompat_bounce_exception:
-        call  compat_create_bounce_frame
-        movb  $0,TRAPBOUNCE_flags(%rdx)
-        jmp   compat_test_all_events
-
         .section .text.entry, "ax", @progbits
 
 /* See lstar_enter for entry register state. */
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index bdd33e727f..41d3ec21a1 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -42,6 +42,12 @@ test_all_events:
         leaq  irq_stat+IRQSTAT_softirq_pending(%rip), %rcx
         cmpl  $0, (%rcx, %rax, 1)
         jne   process_softirqs
+
+        /* Inject exception if pending. */
+        lea   VCPU_trap_bounce(%rbx), %rdx
+        testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
+        jnz   .Lprocess_trapbounce
+
         cmpb  $0, VCPU_mce_pending(%rbx)
         jne   process_mce
 .Ltest_guest_nmi:
@@ -69,6 +75,15 @@ process_softirqs:
         call do_softirq
         jmp  test_all_events
 
+        ALIGN
+/* %rbx: struct vcpu, %rdx: struct trap_bounce */
+.Lprocess_trapbounce:
+        sti
+.Lbounce_exception:
+        call  create_bounce_frame
+        movb  $0, TRAPBOUNCE_flags(%rdx)
+        jmp   test_all_events
+
         ALIGN
 /* %rbx: struct vcpu */
 process_mce:
@@ -77,7 +92,7 @@ process_mce:
         sti
         movb $0, VCPU_mce_pending(%rbx)
         call set_guest_machinecheck_trapbounce
-        test %eax, %eax
+        test %al, %al
         jz   test_all_events
         movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
         movb %dl, VCPU_mce_old_mask(%rbx)            # iret hypercall
@@ -93,7 +108,7 @@ process_nmi:
         sti
         movb $0, VCPU_nmi_pending(%rbx)
         call set_guest_nmi_trapbounce
-        test %eax, %eax
+        test %al, %al
         jz   test_all_events
         movzbl VCPU_async_exception_mask(%rbx), %edx # save mask for the
         movb %dl, VCPU_nmi_old_mask(%rbx)            # iret hypercall
@@ -667,15 +682,9 @@ handle_exception_saved:
         mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
-        leaq  VCPU_trap_bounce(%rbx),%rdx
         movq  VCPU_domain(%rbx),%rax
         testb $1,DOMAIN_is_32bit_pv(%rax)
-        jnz   compat_post_handle_exception
-        testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
-        jz    test_all_events
-.Lbounce_exception:
-        call  create_bounce_frame
-        movb  $0,TRAPBOUNCE_flags(%rdx)
+        jnz   compat_test_all_events
         jmp   test_all_events
 
 /* No special register assumptions. */
@@ -730,7 +739,7 @@ ENTRY(device_not_available)
 ENTRY(debug)
         pushq $0
         movl  $TRAP_debug,4(%rsp)
-        jmp   handle_exception
+        jmp   handle_ist_exception
 
 ENTRY(int3)
         pushq $0
@@ -783,12 +792,14 @@ ENTRY(double_fault)
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx
-        test  %rbx, %rbx
+        neg   %rbx
         jz    .Ldblf_cr3_okay
         jns   .Ldblf_cr3_load
+        mov   %rbx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         neg   %rbx
 .Ldblf_cr3_load:
         mov   %rbx, %cr3
+        movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
 .Ldblf_cr3_okay:
 
         movq  %rsp,%rdi
diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
index c7ba221d11..9125c67c9e 100644
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -14,6 +14,7 @@
 #include <asm/processor.h> /* current_cpu_info */
 #include <asm/xstate.h>
 #include <asm/amd.h> /* cpu_has_amd_erratum() */
+#include <asm/debugreg.h>
 
 /* Avoid namespace pollution. */
 #undef cmpxchg
@@ -41,3 +42,75 @@
 })
 
 #include "x86_emulate/x86_emulate.c"
+
+/* Called with NULL ctxt in hypercall context. */
+int x86emul_read_dr(unsigned int reg, unsigned long *val,
+                    struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    /* HVM support requires a bit more plumbing before it will work. */
+    ASSERT(is_pv_vcpu(curr));
+
+    switch ( reg )
+    {
+    case 0 ... 3:
+    case 6:
+        *val = curr->arch.debugreg[reg];
+        break;
+
+    case 7:
+        *val = (curr->arch.debugreg[7] |
+                curr->arch.debugreg[5]);
+        break;
+
+    case 4 ... 5:
+        if ( !(curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
+        {
+            *val = curr->arch.debugreg[reg + 2];
+            break;
+        }
+
+        /* Fallthrough */
+    default:
+        if ( ctxt )
+            x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
+
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+int x86emul_write_dr(unsigned int reg, unsigned long val,
+                     struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    /* HVM support requires a bit more plumbing before it will work. */
+    ASSERT(is_pv_vcpu(curr));
+
+    switch ( set_debugreg(curr, reg, val) )
+    {
+    case 0:
+        return X86EMUL_OKAY;
+
+    case -ENODEV:
+        x86_emul_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC, ctxt);
+        return X86EMUL_EXCEPTION;
+
+    default:
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index 0c8c80ad5a..9c2bb8157c 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -662,6 +662,11 @@ static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
 void x86_emulate_free_state(struct x86_emulate_state *state);
 #endif
 
+int x86emul_read_dr(unsigned int reg, unsigned long *val,
+                    struct x86_emulate_ctxt *ctxt);
+int x86emul_write_dr(unsigned int reg, unsigned long val,
+                     struct x86_emulate_ctxt *ctxt);
+
 #endif
 
 static inline void x86_emul_hw_exception(
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index b7884263f2..f21c3e5a64 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -436,14 +436,9 @@ void sched_destroy_domain(struct domain *d)
     cpupool_rm_domain(d);
 }
 
-void vcpu_sleep_nosync(struct vcpu *v)
+void vcpu_sleep_nosync_locked(struct vcpu *v)
 {
-    unsigned long flags;
-    spinlock_t *lock;
-
-    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
-
-    lock = vcpu_schedule_lock_irqsave(v, &flags);
+    ASSERT(spin_is_locked(per_cpu(schedule_data, v->processor).schedule_lock));
 
     if ( likely(!vcpu_runnable(v)) )
     {
@@ -452,6 +447,18 @@ void vcpu_sleep_nosync(struct vcpu *v)
 
         SCHED_OP(vcpu_scheduler(v), sleep, v);
     }
+}
+
+void vcpu_sleep_nosync(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *lock;
+
+    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+
+    lock = vcpu_schedule_lock_irqsave(v, &flags);
+
+    vcpu_sleep_nosync_locked(v);
 
     vcpu_schedule_unlock_irqrestore(lock, flags, v);
 }
@@ -567,13 +574,54 @@ static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
     sched_move_irqs(v);
 }
 
-static void vcpu_migrate(struct vcpu *v)
+/*
+ * Initiating migration
+ *
+ * In order to migrate, we need the vcpu in question to have stopped
+ * running and had SCHED_OP(sleep) called (to take it off any
+ * runqueues, for instance); and if it is currently running, it needs
+ * to be scheduled out.  Finally, we need to hold the scheduling locks
+ * for both the processor we're migrating from, and the processor
+ * we're migrating to.
+ *
+ * In order to avoid deadlock while satisfying the final requirement,
+ * we must release any scheduling lock we hold, then try to grab both
+ * locks we want, then double-check to make sure that what we started
+ * to do hasn't been changed in the mean time.
+ *
+ * These steps are encapsulated in the following two functions; they
+ * should be called like this:
+ *
+ *     lock = vcpu_schedule_lock_irq(v);
+ *     vcpu_migrate_start(v);
+ *     vcpu_schedule_unlock_irq(lock, v)
+ *     vcpu_migrate_finish(v);
+ *
+ * vcpu_migrate_finish() will do the work now if it can, or simply
+ * return if it can't (because v is still running); in that case
+ * vcpu_migrate_finish() will be called by context_saved().
+ */
+void vcpu_migrate_start(struct vcpu *v)
+{
+    set_bit(_VPF_migrating, &v->pause_flags);
+    vcpu_sleep_nosync_locked(v);
+}
+
+static void vcpu_migrate_finish(struct vcpu *v)
 {
     unsigned long flags;
     unsigned int old_cpu, new_cpu;
     spinlock_t *old_lock, *new_lock;
     bool_t pick_called = 0;
 
+    /*
+     * If the vcpu is currently running, this will be handled by
+     * context_saved(); and in any case, if the bit is cleared, then
+     * someone else has already done the work so we don't need to.
+     */
+    if ( v->is_running || !test_bit(_VPF_migrating, &v->pause_flags) )
+        return;
+
     old_cpu = new_cpu = v->processor;
     for ( ; ; )
     {
@@ -653,14 +701,11 @@ void vcpu_force_reschedule(struct vcpu *v)
     spinlock_t *lock = vcpu_schedule_lock_irq(v);
 
     if ( v->is_running )
-        set_bit(_VPF_migrating, &v->pause_flags);
+        vcpu_migrate_start(v);
+
     vcpu_schedule_unlock_irq(lock, v);
 
-    if ( v->pause_flags & VPF_migrating )
-    {
-        vcpu_sleep_nosync(v);
-        vcpu_migrate(v);
-    }
+    vcpu_migrate_finish(v);
 }
 
 void restore_vcpu_affinity(struct domain *d)
@@ -812,10 +857,10 @@ int cpu_disable_scheduler(unsigned int cpu)
                  *  * the scheduler will always fine a suitable solution, or
                  *    things would have failed before getting in here.
                  */
-                set_bit(_VPF_migrating, &v->pause_flags);
+                vcpu_migrate_start(v);
                 vcpu_schedule_unlock_irqrestore(lock, flags, v);
-                vcpu_sleep_nosync(v);
-                vcpu_migrate(v);
+
+                vcpu_migrate_finish(v);
 
                 /*
                  * The only caveat, in this case, is that if a vcpu active in
@@ -849,18 +894,14 @@ static int vcpu_set_affinity(
          * Always ask the scheduler to re-evaluate placement
          * when changing the affinity.
          */
-        set_bit(_VPF_migrating, &v->pause_flags);
+        vcpu_migrate_start(v);
     }
 
     vcpu_schedule_unlock_irq(lock, v);
 
     domain_update_node_affinity(v->domain);
 
-    if ( v->pause_flags & VPF_migrating )
-    {
-        vcpu_sleep_nosync(v);
-        vcpu_migrate(v);
-    }
+    vcpu_migrate_finish(v);
 
     return ret;
 }
@@ -1088,7 +1129,6 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
         {
             cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
             v->affinity_broken = 0;
-            set_bit(_VPF_migrating, &v->pause_flags);
             ret = 0;
         }
     }
@@ -1101,20 +1141,18 @@ int vcpu_pin_override(struct vcpu *v, int cpu)
             cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
             v->affinity_broken = 1;
             cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
-            set_bit(_VPF_migrating, &v->pause_flags);
             ret = 0;
         }
     }
 
+    if ( ret == 0 )
+        vcpu_migrate_start(v);
+
     vcpu_schedule_unlock_irq(lock, v);
 
     domain_update_node_affinity(v->domain);
 
-    if ( v->pause_flags & VPF_migrating )
-    {
-        vcpu_sleep_nosync(v);
-        vcpu_migrate(v);
-    }
+    vcpu_migrate_finish(v);
 
     return ret;
 }
@@ -1501,8 +1539,7 @@ void context_saved(struct vcpu *prev)
 
     SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
 
-    if ( unlikely(prev->pause_flags & VPF_migrating) )
-        vcpu_migrate(prev);
+    vcpu_migrate_finish(prev);
 }
 
 /* The scheduler timer: force a run through the scheduler */
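# The scheduler refactor funnels every "flag, sleep, migrate" sequence
# through one helper pair so VPF_migrating is always set and the vcpu put
# to sleep under the scheduling lock, closing races between concurrent
# migration attempts.  Per the comment block in the hunk, callers collapse
# to the same shape:
#
#     lock = vcpu_schedule_lock_irq(v);
#     vcpu_migrate_start(v);            /* set flag + sleep, lock held  */
#     vcpu_schedule_unlock_irq(lock, v);
#     vcpu_migrate_finish(v);           /* defers if v is still running */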
diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
index c57914efc6..b3b10eaf40 100644
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -24,6 +24,8 @@
 #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
 #define DR_STATUS_RESERVED_ONE  0xffff0ff0ul /* Reserved, read as one */
 
+#define X86_DR6_DEFAULT 0xffff0ff0ul    /* Default %dr6 value. */
+
 /* Now define a bunch of things for manipulating the control register.
    The top two bytes of the control register consist of 4 fields of 4
    bits - each field corresponds to one of the four debug registers,
diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
index f756cb5a0d..1a52ec6045 100644
--- a/xen/include/asm-x86/hvm/irq.h
+++ b/xen/include/asm-x86/hvm/irq.h
@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
 
 int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data);
 
+/* Assert an IO APIC pin. */
+int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level);
+
 void hvm_maybe_deassert_evtchn_irq(void);
 void hvm_assert_evtchn_irq(struct vcpu *v);
 void hvm_set_callback_via(struct domain *d, uint64_t via);
diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
index 21166edd06..0eb5ff632e 100644
--- a/xen/include/asm-x86/hvm/vpt.h
+++ b/xen/include/asm-x86/hvm/vpt.h
@@ -44,6 +44,7 @@ struct periodic_time {
     bool_t warned_timeout_too_short;
 #define PTSRC_isa    1 /* ISA time source */
 #define PTSRC_lapic  2 /* LAPIC time source */
+#define PTSRC_ioapic 3 /* IOAPIC time source */
     u8 source;                  /* PTSRC_ */
     u8 irq;
     struct vcpu *vcpu;          /* vcpu timer interrupt delivers to */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index a8ceecf3e2..68fae91567 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -31,6 +31,9 @@
 #define EFER_LMSLE		(1<<_EFER_LMSLE)
 #define EFER_FFXSE		(1<<_EFER_FFXSE)
 
+#define EFER_KNOWN_MASK		(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX | \
+				 EFER_SVME | EFER_LMSLE | EFER_FFXSE)
+
 /* Speculation Controls. */
 #define MSR_SPEC_CTRL			0x00000048
 #define SPEC_CTRL_IBRS			(_AC(1, ULL) << 0)
@@ -42,6 +45,7 @@
 #define MSR_ARCH_CAPABILITIES		0x0000010a
 #define ARCH_CAPABILITIES_RDCL_NO	(_AC(1, ULL) << 0)
 #define ARCH_CAPABILITIES_IBRS_ALL	(_AC(1, ULL) << 1)
+#define ARCH_CAPS_RSBA			(_AC(1, ULL) << 2)
 
 /* Intel MSRs. Some also available on other CPUs */
 #define MSR_IA32_PERFCTR0		0x000000c1
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 80f8411355..a152f1d413 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -445,7 +445,8 @@ struct __packed __cacheline_aligned tss_struct {
 #define IST_DF   1UL
 #define IST_NMI  2UL
 #define IST_MCE  3UL
-#define IST_MAX  3UL
+#define IST_DB   4UL
+#define IST_MAX  4UL
 
 /* Set the interrupt stack table used by a particular interrupt
  * descriptor table entry. */