summaryrefslogtreecommitdiffstats
path: root/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
diff options
context:
space:
mode:
author Mario Preksavec2019-11-19 13:17:56 +0100
committer Willy Sudiarto Raharjo2019-11-23 10:02:01 +0100
commit903c02712d4cf39ae8218eb47149258dfa8c7d8a (patch)
tree61c869ed4904270e41898b9bcf74090822e28418 /system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
parent604be6a3da8dc95e2d89a426877c7f4021eb91df (diff)
downloadslackbuilds-903c02712d4cf39ae8218eb47149258dfa8c7d8a.tar.gz
system/xen: Updated for version 4.12.1.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Diffstat (limited to 'system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch')
-rw-r--r--system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch413
1 files changed, 413 insertions, 0 deletions
diff --git a/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
new file mode 100644
index 0000000000..ad7e6fee1b
--- /dev/null
+++ b/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
@@ -0,0 +1,413 @@
+From ea3dc624c5e6325a9c2f079e52a85965d4ab6ce8 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:50 +0100
+Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to
+ begin with
+
+Validation and de-validation of pagetable trees may take arbitrarily
+large amounts of time, and so must be preemptible. This is indicated
+by setting the PGT_partial bit in the type_info, and setting
+nr_validated_entries and partial_flags appropriately. Specifically,
+if the entry at [nr_validated_entries] is partially validated,
+partial_flags should have the PGT_partial_set bit set, and the entry
+should hold a general reference count. During de-validation,
+put_page_type() is called on partially validated entries.
+
+Unfortunately, there are a number of issues with the current algorithm.
+
+First, doing a "normal" put_page_type() is not safe when no type ref
+is held: there is nothing to stop another vcpu from coming along and
+picking up validation again: at which point the put_page_type may drop
+the only page ref on an in-use page. Some examples are listed in the
+appendix.
+
+The core issue is that put_page_type() is being called both to clean
+up PGT_partial, and to drop a type count; and has no way of knowing
+which is which; and so if in between, PGT_partial is cleared,
+put_page_type() will drop the type ref erroneously.
+
+What is needed is to distinguish between two states:
+- Dropping a type ref which is held
+- Cleaning up a page which has been partially de/validated
+
+Fix this by telling put_page_type() which of the two activities you
+intend.
+
+When cleaning up a partial de/validation, take no action unless you
+find a page partially validated.
+
+If put_page_type() is called without PTF_partial_set, and finds the
+page in a PGT_partial state anyway, then there's certainly been a
+misaccounting somewhere, and carrying on would almost certainly cause
+a security issue, so crash the host instead.
+
+In put_page_from_lNe, pass partial_flags on to _put_page_type().
+
+old_guest_table may be set either with a fully validated page (when
+using the "deferred put" pattern), or with a partially validated page
+(when a normal "de-validation" is interrupted, or when a validation
+fails part-way through due to invalid entries). Add a flag,
+old_guest_table_partial, to indicate which of these it is, and use
+that to pass the appropriate flag to _put_page_type().
+
+While here, delete stray trailing whitespace.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+-----
+Appendix:
+
+Suppose page A, when interpreted as an l3 pagetable, contains all
+valid entries; and suppose A[x] points to page B, which when
+interpreted as an l2 pagetable, contains all valid entries.
+
+P1: PIN_L3_TABLE
+ A -> PGT_l3_table | 1 | valid
+ B -> PGT_l2_table | 1 | valid
+
+P1: UNPIN_TABLE
+ > Arrange to interrupt after B has been de-validated
+ B:
+ type_info -> PGT_l2_table | 0
+ A:
+ type_info -> PGT_l3_table | 1 | partial
+ nr_validated_enties -> (less than x)
+
+P2: mod_l4_entry to point to A
+ > Arrange for this to be interrupted while B is being validated
+ B:
+ type_info -> PGT_l2_table | 1 | partial
+ (nr_validated_entires &c set as appropriate)
+ A:
+ type_info -> PGT_l3_table | 1 | partial
+ nr_validated_entries -> x
+ partial_pte = 1
+
+P3: mod_l3_entry some other unrelated l3 to point to B:
+ B:
+ type_info -> PGT_l2_table | 1
+
+P1: Restart UNPIN_TABLE
+
+At this point, since A.nr_validate_entries == x and A.partial_pte !=
+0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
+its type count to 0 while it's still being pointed to by some other l3
+
+A similar issue arises with old_guest_table. Consider the following
+scenario:
+
+Suppose A is a page which, when interpreted as an l2, has valid entries
+until entry x, which is invalid.
+
+V1: PIN_L2_TABLE(A)
+ <Validate until we try to validate [x], get -EINVAL>
+ A -> PGT_l2_table | 1 | PGT_partial
+ V1 -> old_guest_table = A
+ <delayed>
+
+V2: PIN_L2_TABLE(A)
+ <Pick up where V1 left off, try to re-validate [x], get -EINVAL>
+ A -> PGT_l2_table | 1 | PGT_partial
+ V2 -> old_guest_table = A
+ <restart>
+ put_old_guest_table()
+ _put_page_type(A)
+ A -> PGT_l2_table | 0
+
+V1: <restart>
+ put_old_guest_table()
+ _put_page_type(A) # UNDERFLOW
+
+Indeed, it is possible to engineer for old_guest_table for every vcpu
+a guest has to point to the same page.
+---
+ xen/arch/x86/domain.c | 6 +++
+ xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++-----
+ xen/include/asm-x86/domain.h | 4 +-
+ 3 files changed, 95 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 59df8a6d8d..f1ae5f89f5 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -1104,9 +1104,15 @@ int arch_set_info_guest(
+ rc = -ERESTART;
+ /* Fallthrough */
+ case -ERESTART:
++ /*
++ * NB that we're putting the kernel-mode table
++ * here, which we've already successfully
++ * validated above; hence partial = false;
++ */
+ v->arch.old_guest_ptpg = NULL;
+ v->arch.old_guest_table =
+ pagetable_get_page(v->arch.guest_table);
++ v->arch.old_guest_table_partial = false;
+ v->arch.guest_table = pagetable_null();
+ break;
+ default:
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index a432e69c74..81774368a0 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1359,10 +1359,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ {
+ current->arch.old_guest_ptpg = ptpg;
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ }
+ else
+ {
+- rc = _put_page_type(pg, PTF_preemptible, ptpg);
++ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -1385,6 +1386,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ unsigned long mfn = l3e_get_pfn(l3e);
+ bool writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
++ ASSERT(!(flags & PTF_partial_set));
+ ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+ do {
+ put_data_page(mfn_to_page(_mfn(mfn)), writeable);
+@@ -1397,12 +1399,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+
+ if ( flags & PTF_defer )
+ {
++ ASSERT(!(flags & PTF_partial_set));
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ return 0;
+ }
+
+- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+
+@@ -1421,12 +1425,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+
+ if ( flags & PTF_defer )
+ {
++ ASSERT(!(flags & PTF_partial_set));
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ return 0;
+ }
+
+- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, flags | PTF_preemptible,
++ mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -1535,6 +1542,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+
+ pl2e = map_domain_page(_mfn(pfn));
+
++ /*
++ * NB that alloc_l2_table will never set partial_pte on an l2; but
++ * free_l2_table might if a linear_pagetable entry is interrupted
++ * partway through de-validation. In that circumstance,
++ * get_page_from_l2e() will always return -EINVAL; and we must
++ * retain the type ref by doing the normal partial_flags tracking.
++ */
++
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
+ i++, partial_flags = 0 )
+ {
+@@ -1598,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ page->partial_flags = partial_flags;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ }
+ if ( rc < 0 )
+@@ -1704,12 +1720,16 @@ static int alloc_l3_table(struct page_info *page)
+ * builds.
+ */
+ if ( current->arch.old_guest_table == l3e_get_page(l3e) )
++ {
++ ASSERT(current->arch.old_guest_table_partial);
+ page->partial_flags = PTF_partial_set;
++ }
+ else
+ ASSERT_UNREACHABLE();
+ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ while ( i-- > 0 )
+ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+@@ -1897,12 +1917,16 @@ static int alloc_l4_table(struct page_info *page)
+ * builds.
+ */
+ if ( current->arch.old_guest_table == l4e_get_page(l4e) )
++ {
++ ASSERT(current->arch.old_guest_table_partial);
+ page->partial_flags = PTF_partial_set;
++ }
+ else
+ ASSERT_UNREACHABLE();
+ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ }
+ }
+@@ -2831,6 +2855,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
+ x = y;
+ nx = x - 1;
+
++ /*
++ * Is this expected to do a full reference drop, or only
++ * cleanup partial validation / devalidation?
++ *
++ * If the former, the caller must hold a "full" type ref;
++ * which means the page must be validated. If the page is
++ * *not* fully validated, continuing would almost certainly
++ * open up a security hole. An exception to this is during
++ * domain destruction, where PGT_validated can be dropped
++ * without dropping a type ref.
++ *
++ * If the latter, do nothing unless type PGT_partial is set.
++ * If it is set, the type count must be 1.
++ */
++ if ( !(flags & PTF_partial_set) )
++ BUG_ON((x & PGT_partial) ||
++ !((x & PGT_validated) || page_get_owner(page)->is_dying));
++ else if ( !(x & PGT_partial) )
++ return 0;
++ else
++ BUG_ON((x & PGT_count_mask) != 1);
++
+ ASSERT((x & PGT_count_mask) != 0);
+
+ switch ( nx & (PGT_locked | PGT_count_mask) )
+@@ -3092,17 +3138,34 @@ int put_old_guest_table(struct vcpu *v)
+ if ( !v->arch.old_guest_table )
+ return 0;
+
+- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
+- v->arch.old_guest_ptpg) )
++ rc = _put_page_type(v->arch.old_guest_table,
++ PTF_preemptible |
++ ( v->arch.old_guest_table_partial ?
++ PTF_partial_set : 0 ),
++ v->arch.old_guest_ptpg);
++
++ if ( rc == -ERESTART || rc == -EINTR )
+ {
+- case -EINTR:
+- case -ERESTART:
++ v->arch.old_guest_table_partial = (rc == -ERESTART);
+ return -ERESTART;
+- case 0:
+- put_page(v->arch.old_guest_table);
+ }
+
++ /*
++ * It shouldn't be possible for _put_page_type() to return
++ * anything else at the moment; but if it does happen in
++ * production, leaking the type ref is probably the best thing to
++ * do. Either way, drop the general ref held by old_guest_table.
++ */
++ ASSERT(rc == 0);
++
++ put_page(v->arch.old_guest_table);
+ v->arch.old_guest_table = NULL;
++ v->arch.old_guest_ptpg = NULL;
++ /*
++ * Safest default if someone sets old_guest_table without
++ * explicitly setting old_guest_table_partial.
++ */
++ v->arch.old_guest_table_partial = true;
+
+ return rc;
+ }
+@@ -3253,11 +3316,11 @@ int new_guest_cr3(mfn_t mfn)
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+- rc = -ERESTART;
+- /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
++ rc = -ERESTART;
+ break;
+ default:
+ BUG_ON(rc);
+@@ -3494,6 +3557,7 @@ long do_mmuext_op(
+ {
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ curr->arch.old_guest_table_partial = false;
+ }
+ }
+ }
+@@ -3528,6 +3592,11 @@ long do_mmuext_op(
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ /*
++ * EINTR means we still hold the type ref; ERESTART
++ * means PGT_partial holds the type ref
++ */
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
+ rc = 0;
+ break;
+ default:
+@@ -3596,11 +3665,15 @@ long do_mmuext_op(
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+- rc = -ERESTART;
+- /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ /*
++ * EINTR means we still hold the type ref;
++ * ERESTART means PGT_partial holds the ref
++ */
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
++ rc = -ERESTART;
+ break;
+ default:
+ BUG_ON(rc);
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index 214e44ce1c..2cfce7b36b 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -307,7 +307,7 @@ struct arch_domain
+
+ struct paging_domain paging;
+ struct p2m_domain *p2m;
+- /* To enforce lock ordering in the pod code wrt the
++ /* To enforce lock ordering in the pod code wrt the
+ * page_alloc lock */
+ int page_alloc_unlock_level;
+
+@@ -581,6 +581,8 @@ struct arch_vcpu
+ struct page_info *old_guest_table; /* partially destructed pagetable */
+ struct page_info *old_guest_ptpg; /* containing page table of the */
+ /* former, if any */
++ bool old_guest_table_partial; /* Are we dropping a type ref, or just
++ * finishing up a partial de-validation? */
+ /* guest_table holds a ref to the page, and also a type-count unless
+ * shadow refcounts are in use */
+ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
+--
+2.23.0
+