Diffstat (limited to 'system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch')
-rw-r--r--  system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch  249
1 file changed, 249 insertions, 0 deletions
diff --git a/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch
new file mode 100644
index 0000000000..db407416b9
--- /dev/null
+++ b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch
@@ -0,0 +1,249 @@
+From 0ff9a8453dc47cd47eee9659d5916afb5094e871 Mon Sep 17 00:00:00 2001
+From: Hongyan Xia <hongyxia@amazon.com>
+Date: Sat, 11 Jan 2020 21:57:43 +0000
+Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates
+
+map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB
+superpages if possible, to maximize TLB efficiency. This means both
+replacing superpage entries with smaller entries, and replacing
+smaller entries with superpages.
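+
+For context, the interface involved is roughly the following (prototype
+reproduced for illustration only; see the hunks below for the actual
+definition in xen/arch/x86/mm.c):
+
+    int map_pages_to_xen(unsigned long virt, mfn_t mfn,
+                         unsigned long nr_mfns, unsigned int flags);
+
+Depending on the alignment of virt/mfn and on nr_mfns, it may install
+4KiB, 2MiB or 1GiB entries, merging or splitting existing entries as
+needed.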
+
+Unfortunately, while some potential races are handled correctly,
+others are not. These include:
+
+1. When one processor modifies a sub-superpage mapping while another
+processor replaces the entire range with a superpage.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and
+B.
+
+* A walks the pagetables, gets a pointer to L2.
+* B replaces L3[N] with a 1GiB mapping.
+* B frees L2.
+* A writes L2[M] #
+
+This race is exacerbated by the fact that virt_to_xen_l[21]e doesn't
+handle higher-level superpages properly: if you call virt_to_xen_l2e
+on a virtual address within an L3 superpage, you'll either hit a BUG()
+(most likely), or get a pointer into the middle of a data page; same
+with virt_to_xen_l1e on a virtual address within either an L3 or L2
+superpage.
+
+So take the following example:
+
+* A reads pl3e and sees that it points to an L2.
+* B replaces L3[N] with a 1GiB mapping.
+* A calls virt_to_xen_l2e() and hits the BUG_ON() #
+
+2. When two processors simultaneously try to replace a sub-superpage
+mapping with a superpage mapping.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and B,
+both trying to replace L3[N] with a superpage.
+
+* A walks the pagetables, gets a pointer to pl3e, and takes a copy, ol3e, pointing to L2.
+* B walks the pagetables, gets a pointer to pl3e, and takes a copy, ol3e, pointing to L2.
+* A writes the new value into L3[N]
+* B writes the new value into L3[N]
+* A recursively frees all the L1's under L2, then frees L2
+* B recursively double-frees all the L1's under L2, then double-frees L2 #
+
+Fix this by grabbing a lock for the entirety of the mapping update
+operation.
+
+Rather than grabbing map_pgdir_lock for the entire operation, however,
+repurpose the PGT_locked bit from L3's page->type_info as a lock.
+This means that rather than locking the entire address space, we
+"only" lock a single 512GiB chunk of hypervisor address space at a
+time.
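+
+In code terms, the pattern in map_pages_to_xen() and
+modify_xen_mappings() becomes roughly the following (a simplified
+sketch of the hunks below, not the literal code):
+
+    L3T_INIT(current_l3page);            /* current_l3page == "no L3 held" */
+
+    while ( nr_mfns != 0 )
+    {
+        L3T_UNLOCK(current_l3page);      /* release the previously held L3 */
+
+        pl3e = virt_to_xen_l3e(virt);    /* may allocate an L3 table */
+        if ( !pl3e )
+            goto out;
+
+        current_l3page = virt_to_page(pl3e); /* page backing this L3 table */
+        L3T_LOCK(current_l3page);        /* spins on PGT_locked */
+
+        /* ... read *pl3e and update the mappings under this L3 ... */
+    }
+    rc = 0;
+
+ out:
+    L3T_UNLOCK(current_l3page);
+    return rc;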
+
+There was a proposal for a lock-and-reverify approach, where we walk
+the pagetables to the point where we decide what to do; then grab the
+map_pgdir_lock, re-verify the information we collected without the
+lock, and finally make the change (starting over again if anything had
+changed). Without being able to guarantee that the L2 table wasn't
+freed, however, every read would need to be considered potentially
+unsafe. Thinking carefully about that is probably something that
+wants to be done in public, not under time pressure.
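+
+For comparison, such a lock-and-reverify scheme would have looked
+something like the following hypothetical sketch (never implemented;
+nl3e stands for whatever new entry was decided on):
+
+    for ( ;; )
+    {
+        ol3e = *pl3e;          /* unlocked read; the L2 it points to
+                                  could be freed at any moment */
+        /* ... decide on the new entry nl3e based on ol3e ... */
+
+        spin_lock(&map_pgdir_lock);
+        if ( l3e_get_intpte(*pl3e) == l3e_get_intpte(ol3e) )
+        {
+            /* Nothing changed under us: commit and stop. */
+            l3e_write_atomic(pl3e, nl3e);
+            spin_unlock(&map_pgdir_lock);
+            break;
+        }
+        spin_unlock(&map_pgdir_lock);
+        /* Something changed: start over. */
+    }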
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 89 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index af726d3274..d6a0761f43 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2167,6 +2167,50 @@ void page_unlock(struct page_info *page)
+ current_locked_page_set(NULL);
+ }
+
++/*
++ * L3 table locks:
++ *
++ * Used for serialization in map_pages_to_xen() and modify_xen_mappings().
++ *
++ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to
++ * reuse the PGT_locked flag. This lock is taken only when we move down to L3
++ * tables and below, since L4 (and above, for 5-level paging) is still globally
++ * protected by map_pgdir_lock.
++ *
++ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock().
++ * This has two implications:
++ * - We cannot reuse current_locked_page_* for debugging
++ * - To avoid the chance of deadlock, even for different pages, we
++ * must never grab page_lock() after grabbing l3t_lock(). This
++ * includes any page_lock()-based locks, such as
++ * mem_sharing_page_lock().
++ *
++ * Also note that we grab the map_pgdir_lock while holding the
++ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
++ * reverse order.
++ */
++static void l3t_lock(struct page_info *page)
++{
++    unsigned long x, nx;
++
++    do {
++        while ( (x = page->u.inuse.type_info) & PGT_locked )
++            cpu_relax();
++        nx = x | PGT_locked;
++    } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
++}
++
++static void l3t_unlock(struct page_info *page)
++{
++    unsigned long x, nx, y = page->u.inuse.type_info;
++
++    do {
++        x = y;
++        BUG_ON(!(x & PGT_locked));
++        nx = x & ~PGT_locked;
++    } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
++}
++
+ #ifdef CONFIG_PV
+ /*
+ * PTE flags that a guest may change without re-validating the PTE.
+@@ -5177,6 +5221,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+ flush_area_local((const void *)v, f) : \
+ flush_area_all((const void *)v, f))
+
++#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
++
++#define L3T_LOCK(page)              \
++    do {                            \
++        if ( locking )              \
++            l3t_lock(page);         \
++    } while ( false )
++
++#define L3T_UNLOCK(page)                           \
++    do {                                           \
++        if ( locking && (page) != ZERO_BLOCK_PTR ) \
++        {                                          \
++            l3t_unlock(page);                      \
++            (page) = ZERO_BLOCK_PTR;               \
++        }                                          \
++    } while ( false )
++
+ int map_pages_to_xen(
+ unsigned long virt,
+ mfn_t mfn,
+@@ -5188,6 +5249,7 @@ int map_pages_to_xen(
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5203,13 +5265,20 @@ int map_pages_to_xen(
+ } \
+ } while (0)
+
++ L3T_INIT(current_l3page);
++
+ while ( nr_mfns != 0 )
+ {
+- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
++ l3_pgentry_t *pl3e, ol3e;
+
++ L3T_UNLOCK(current_l3page);
++
++ pl3e = virt_to_xen_l3e(virt);
+ if ( !pl3e )
+ goto out;
+
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5543,6 +5612,7 @@ int map_pages_to_xen(
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
+@@ -5571,6 +5641,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ unsigned int i;
+ unsigned long v = s;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5579,11 +5650,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ ASSERT(IS_ALIGNED(s, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(e, PAGE_SIZE));
+
++ L3T_INIT(current_l3page);
++
+ while ( v < e )
+ {
+- l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
++ l3_pgentry_t *pl3e;
++
++ L3T_UNLOCK(current_l3page);
+
+- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
++ pl3e = virt_to_xen_l3e(v);
++ if ( !pl3e )
++ goto out;
++
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
++
++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+ {
+ /* Confirm the caller isn't trying to create new mappings. */
+ ASSERT(!(nf & _PAGE_PRESENT));
+@@ -5801,9 +5883,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
++#undef L3T_LOCK
++#undef L3T_UNLOCK
++
+ #undef flush_area
+
+ int destroy_xen_mappings(unsigned long s, unsigned long e)
+--
+2.25.1
+