Diffstat (limited to 'system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch')
-rw-r--r--  system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch  249
1 file changed, 249 insertions, 0 deletions
diff --git a/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch
new file mode 100644
index 0000000000..db407416b9
--- /dev/null
+++ b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch
@@ -0,0 +1,249 @@
+From 0ff9a8453dc47cd47eee9659d5916afb5094e871 Mon Sep 17 00:00:00 2001
+From: Hongyan Xia <hongyxia@amazon.com>
+Date: Sat, 11 Jan 2020 21:57:43 +0000
+Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates
+
+map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB
+superpages if possible, to maximize TLB efficiency. This means both
+replacing superpage entries with smaller entries, and replacing
+smaller entries with superpages.
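+
+For context, the interface involved is roughly the following (prototype
+reproduced for illustration only; see the hunks below for the actual
+definition in xen/arch/x86/mm.c):
+
+    int map_pages_to_xen(unsigned long virt, mfn_t mfn,
+                         unsigned long nr_mfns, unsigned int flags);
+
+Depending on the alignment of virt/mfn and on nr_mfns, it may install
+4KiB, 2MiB or 1GiB entries, merging or splitting existing entries as
+needed.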
+
+Unfortunately, while some potential races are handled correctly,
+others are not. These include:
+
+1. When one processor modifies a sub-superpage mapping while another
+processor replaces the entire range with a superpage.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and
+B.
+
+* A walks the pagetables, gets a pointer to L2.
+* B replaces L3[N] with a 1GiB mapping.
+* B frees L2.
+* A writes L2[M] #
+
+This race is exacerbated by the fact that virt_to_xen_l[21]e doesn't
+handle higher-level superpages properly: if you call virt_to_xen_l2e
+on a virtual address within an L3 superpage, you'll either hit a BUG()
+(most likely), or get a pointer into the middle of a data page; same
+with virt_to_xen_l1e on a virtual address within either an L3 or L2
+superpage.
+
+So take the following example:
+
+* A reads pl3e and sees that it points to an L2.
+* B replaces L3[N] with a 1GiB mapping.
+* A calls virt_to_xen_l2e() and hits the BUG_ON() #
+
+2. When two processors simultaneously try to replace a sub-superpage
+mapping with a superpage mapping.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and B,
+both trying to replace L3[N] with a superpage.
+
+* A walks the pagetables, gets a pointer to pl3e, and takes a copy, ol3e, pointing to L2.
+* B walks the pagetables, gets a pointer to pl3e, and takes a copy, ol3e, pointing to L2.
+* A writes the new value into L3[N]
+* B writes the new value into L3[N]
+* A recursively frees all the L1's under L2, then frees L2
+* B recursively double-frees all the L1's under L2, then double-frees L2 #
+
+Fix this by grabbing a lock for the entirety of the mapping update
+operation.
+
+Rather than grabbing map_pgdir_lock for the entire operation, however,
+repurpose the PGT_locked bit from L3's page->type_info as a lock.
+This means that rather than locking the entire address space, we
+"only" lock a single 512GiB chunk of hypervisor address space at a
+time.
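+
+In code terms, the pattern in map_pages_to_xen() and
+modify_xen_mappings() becomes roughly the following (a simplified
+sketch of the hunks below, not the literal code):
+
+    L3T_INIT(current_l3page);            /* current_l3page == "no L3 held" */
+
+    while ( nr_mfns != 0 )
+    {
+        L3T_UNLOCK(current_l3page);      /* release the previously held L3 */
+
+        pl3e = virt_to_xen_l3e(virt);    /* may allocate an L3 table */
+        if ( !pl3e )
+            goto out;
+
+        current_l3page = virt_to_page(pl3e); /* page backing this L3 table */
+        L3T_LOCK(current_l3page);        /* spins on PGT_locked */
+
+        /* ... read *pl3e and update the mappings under this L3 ... */
+    }
+    rc = 0;
+
+ out:
+    L3T_UNLOCK(current_l3page);
+    return rc;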
+
+There was a proposal for a lock-and-reverify approach, where we walk
+the pagetables to the point where we decide what to do; then grab the
+map_pgdir_lock, re-verify the information we collected without the
+lock, and finally make the change (starting over again if anything had
+changed). Without being able to guarantee that the L2 table wasn't
+freed, however, every read would need to be considered potentially
+unsafe. Thinking carefully about that is probably something that
+wants to be done in public, not under time pressure.
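+
+For comparison, such a lock-and-reverify scheme would have looked
+something like the following hypothetical sketch (never implemented;
+nl3e stands for whatever new entry was decided on):
+
+    for ( ;; )
+    {
+        ol3e = *pl3e;          /* unlocked read; the L2 it points to
+                                  could be freed at any moment */
+        /* ... decide on the new entry nl3e based on ol3e ... */
+
+        spin_lock(&map_pgdir_lock);
+        if ( l3e_get_intpte(*pl3e) == l3e_get_intpte(ol3e) )
+        {
+            /* Nothing changed under us: commit and stop. */
+            l3e_write_atomic(pl3e, nl3e);
+            spin_unlock(&map_pgdir_lock);
+            break;
+        }
+        spin_unlock(&map_pgdir_lock);
+        /* Something changed: start over. */
+    }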
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 89 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index af726d3274..d6a0761f43 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2167,6 +2167,50 @@ void page_unlock(struct page_info *page)
+ current_locked_page_set(NULL);
+ }
+
++/*
++ * L3 table locks:
++ *
++ * Used for serialization in map_pages_to_xen() and modify_xen_mappings().
++ *
++ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to
++ * reuse the PGT_locked flag. This lock is taken only when we move down to L3
++ * tables and below, since L4 (and above, for 5-level paging) is still globally
++ * protected by map_pgdir_lock.
++ *
++ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock().
++ * This has two implications:
++ * - We cannot reuse current_locked_page_* for debugging
++ * - To avoid the chance of deadlock, even for different pages, we
++ * must never grab page_lock() after grabbing l3t_lock(). This
++ * includes any page_lock()-based locks, such as
++ * mem_sharing_page_lock().
++ *
++ * Also note that we grab the map_pgdir_lock while holding the
++ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
++ * reverse order.
++ */
++static void l3t_lock(struct page_info *page)
++{
++    unsigned long x, nx;
++
++    do {
++        while ( (x = page->u.inuse.type_info) & PGT_locked )
++            cpu_relax();
++        nx = x | PGT_locked;
++    } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
++}
++
++static void l3t_unlock(struct page_info *page)
++{
++    unsigned long x, nx, y = page->u.inuse.type_info;
++
++    do {
++        x = y;
++        BUG_ON(!(x & PGT_locked));
++        nx = x & ~PGT_locked;
++    } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
++}
++
+ #ifdef CONFIG_PV
+ /*
+ * PTE flags that a guest may change without re-validating the PTE.
+@@ -5177,6 +5221,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+ flush_area_local((const void *)v, f) : \
+ flush_area_all((const void *)v, f))
+
++#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
++
++#define L3T_LOCK(page)              \
++    do {                            \
++        if ( locking )              \
++            l3t_lock(page);         \
++    } while ( false )
++
++#define L3T_UNLOCK(page)                           \
++    do {                                           \
++        if ( locking && (page) != ZERO_BLOCK_PTR ) \
++        {                                          \
++            l3t_unlock(page);                      \
++            (page) = ZERO_BLOCK_PTR;               \
++        }                                          \
++    } while ( false )
++
+ int map_pages_to_xen(
+ unsigned long virt,
+ mfn_t mfn,
+@@ -5188,6 +5249,7 @@ int map_pages_to_xen(
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5203,13 +5265,20 @@ int map_pages_to_xen(
+ } \
+ } while (0)
+
++ L3T_INIT(current_l3page);
++
+ while ( nr_mfns != 0 )
+ {
+- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
++ l3_pgentry_t *pl3e, ol3e;
+
++ L3T_UNLOCK(current_l3page);
++
++ pl3e = virt_to_xen_l3e(virt);
+ if ( !pl3e )
+ goto out;
+
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5543,6 +5612,7 @@ int map_pages_to_xen(
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
+@@ -5571,6 +5641,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ unsigned int i;
+ unsigned long v = s;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5579,11 +5650,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ ASSERT(IS_ALIGNED(s, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(e, PAGE_SIZE));
+
++ L3T_INIT(current_l3page);
++
+ while ( v < e )
+ {
+- l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
++ l3_pgentry_t *pl3e;
++
++ L3T_UNLOCK(current_l3page);
+
+- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
++ pl3e = virt_to_xen_l3e(v);
++ if ( !pl3e )
++ goto out;
++
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
++
++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+ {
+ /* Confirm the caller isn't trying to create new mappings. */
+ ASSERT(!(nf & _PAGE_PRESENT));
+@@ -5801,9 +5883,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
++#undef L3T_LOCK
++#undef L3T_UNLOCK
++
+ #undef flush_area
+
+ int destroy_xen_mappings(unsigned long s, unsigned long e)
+--
+2.25.1
+