List:       oss-security
Subject:    [oss-security] Xen Security Advisory 286 v4 - x86 PV guest INVLPG-like flushes may leave stale TLB entries
From:       Xen.org security team <security () xen ! org>
Date:       2020-10-20 12:00:29
Message-ID: E1kUqJZ-0001xT-Qb () xenbits ! xenproject ! org

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA256

                    Xen Security Advisory XSA-286
                              version 4

     x86 PV guest INVLPG-like flushes may leave stale TLB entries

UPDATES IN VERSION 4
====================

Warn about performance impact.

Public release.

ISSUE DESCRIPTION
=================

x86 PV guest kernels may use hypercalls with INVLPG-like behavior to
invalidate TLB entries even after changes to non-leaf page tables.
Such changes to non-leaf page tables do, however, also render stale any
TLB entries created by Xen's internal use of linear page tables to
process guest requests like update-va-mapping.  Invalidation of these
TLB entries has been missing, so a subsequent guest request changing
the address mappings of one process could potentially modify memory
that is meanwhile in use elsewhere.
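
For context, the flushes in question are requested through the PV MMU
interfaces.  The sketch below is purely illustrative (it is not part of
the attached patches); it assumes Linux-style HYPERVISOR_* hypercall
wrappers and Linux header paths, and the helper names are made up for
illustration.  It shows how a PV guest kernel typically issues such
INVLPG-like flushes.  The vulnerability is not in this guest-side
interface, but in Xen's handling of the TLB entries for its own linear
page tables while serving requests like these.

    #include <xen/interface/xen.h>    /* struct mmuext_op, MMUEXT_*, UVMF_* */
    #include <asm/xen/hypercall.h>    /* HYPERVISOR_* wrappers (Linux) */

    /* Flush a single linear address on the local vCPU after a PTE change. */
    static void pv_invlpg_local(unsigned long linear_addr)
    {
        struct mmuext_op op = {
            .cmd = MMUEXT_INVLPG_LOCAL,
            .arg1.linear_addr = linear_addr,
        };

        HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }

    /* Update one PTE and have Xen perform the INVLPG-like flush itself. */
    static void pv_update_and_flush(unsigned long va, pte_t new_pte)
    {
        HYPERVISOR_update_va_mapping(va, new_pte, UVMF_INVLPG | UVMF_LOCAL);
    }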

IMPACT
======

A malicious x86 PV guest user-mode process may be able to escalate its
privilege to that of the guest kernel.

VULNERABLE SYSTEMS
==================

All versions of Xen expose the vulnerability.

The vulnerability is exposed to x86 PV guests only.  x86 HVM/PVH
guests, as well as ARM guests, are not vulnerable.

MITIGATION
==========

There is no known mitigation.

CREDITS
=======

This issue was discovered by Jann Horn of Google Project Zero.

RESOLUTION
==========

Applying the appropriate set of attached patches resolves this issue.

Note that these patches are known to produce serious performance
problems for at least some workloads.  Work is ongoing to improve the
performance, and this XSA will be updated when new patches are
available.

xsa286/*.patch           xen-unstable
xsa286-4.14/*.patch      Xen 4.14.x
xsa286-4.13/*.patch      Xen 4.13.x
xsa286-4.12/*.patch      Xen 4.12.x
xsa286-4.11/*.patch      Xen 4.11.x
xsa286-4.10/*.patch      Xen 4.10.x

$ sha256sum xsa286* xsa286*/*
e67a0828be2157c54282a4cc6956234581d32b793021e12ee61676bad4d3b740  xsa286.meta
95c1650a7e0496577929fd5d3240b14ab69e4086b613a52117fcfd879c9aea0d  \
xsa286-4.10/0001-x86-shadow-drop-further-32-bit-relics.patch \
18218556f8f9218a57dc9afeffddeaa2133fbe2788871082d8b040ab67abb68c  \
xsa286-4.10/0002-x86-shadow-don-t-pass-wrong-L4-MFN-to-guest_walk_tab.patch \
e8c89338a74b5fce9ee2c82e360889123afd7efe72536c236cf521c28e48ecc9  \
xsa286-4.10/0003-x86-shadow-don-t-use-map_domain_page_global-on-paths.patch \
a677ddb308359450087c21085536d64a15d9339cd02a88eecd8d694c9d26837c  \
xsa286-4.10/0004-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch \
927b68d2ffb4afb677b2f3d820f2e12ae61a37a2b4797fdc8316fe65adc0e46f  \
xsa286-4.10/0005-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
9c6dfc0a2bf7408c1852de2410fecbaab48a4b885f8cf4836781d31727f0c69d  \
xsa286-4.10/0006-x86-mm-check-page-types-in-do_page_walk.patch \
e4c0fcbfd558a95d52b5312902b10aced0dd8d23d1d1af5b3f7d39c3641010c4  \
xsa286-4.10/0007-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
c0e481010cc801c1455008f52e5f2799240223dbfda2f252c872381d30f5af74  \
xsa286-4.10/0008-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
89cf1192938027eaa7bb38dcbe268f54771bac0e107fa33f4f625c1fef3397c1  \
xsa286-4.10/0009-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
6c01ba250ae9ffcd894f603d519bf3a170c92c9e0bc3bb7a79c3e67412ffcf35  \
xsa286-4.10/0010-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch \
21d8f1f05a537bae19088ed28cfb8990c2c19f3f93fbe894f40b354ea7702d3a  \
xsa286-4.11/0001-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch \
6e4023e7d366e53ccda8f80a4b36bd77f194b84ef0a30d1af6f18d56c11c5256  \
xsa286-4.11/0002-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
aabccc3a6c267e41c3ac47916592ce645f6e6788cad900845dbf254801f9dd23  \
xsa286-4.11/0003-x86-mm-check-page-types-in-do_page_walk.patch \
9bbc53a6533209f85edb005e8a517d5a80fda9db8af39b9378eba74857b9fa6b  \
xsa286-4.11/0004-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
19f09374a4d3383eb1e5b9f0b465ff33e17f71fe7131e6cd61599ac9f79e8b00  \
xsa286-4.11/0005-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
7b17dd013bdd9520a7c0cec2a6b56da677d980b982b6869865c99f374b3c1560  \
xsa286-4.11/0006-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
c4c52a34efd14745a0a80f15a73daf7d7e72cfdf2925045a5ec1d22c798baaf2  \
xsa286-4.11/0007-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch \
e831e18b528d845e3aef329df98244ebed1ade0388f1bfb083d8de626e148388  \
xsa286-4.12/0001-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch \
9db102623f51bb5757fff3a2364a675893ca52e09f47868dad15ce1cb44c40a8  \
xsa286-4.12/0002-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
3b0768c05123681db5256a96af3a8abe18e0a488b767183be728cb0a06969333  \
xsa286-4.12/0003-x86-mm-check-page-types-in-do_page_walk.patch \
30342a0c11e0a48b50cd461b6388dce8640f6380cde1d738a3ddd8b95a8c7b1b  \
xsa286-4.12/0004-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
3c1a687243e2df7b64a49279cd99ceabdcc00d9f48476faacd8915a9cc61e775  \
xsa286-4.12/0005-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
c5aacc47b696b74e80e368c25f0deba20aa69e1de6823f334fddd6a40c6f6a22  \
xsa286-4.12/0006-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
07e3f5bcbfbdec070e28fc1bd727b0eb7adc2a75c26636e081800e9939c9bcc2  \
xsa286-4.12/0007-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch \
089d284b29b7179b5c9c04beaf90f24b4b79d81eb3bf55b607405755f3f8b6d8  \
xsa286-4.13/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
96a6391f2a027a034f0e3308119b2cc9b3543db985f9975067db42eb553d2ff9  \
xsa286-4.13/0002-x86-mm-check-page-types-in-do_page_walk.patch \
4060ed8ef41314ddf413e307d39254a899a8e65bd28d5dcb6da73a0d922b5cfb  \
xsa286-4.13/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
38ed58aceaeb3ab4d9991f12ec11b1eaa0a6d4853023e0e59c0d92589ed112b6  \
xsa286-4.13/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
99a0afbc30acf6df26e152752bb65a1de33d9a0a45a5e7ad7693c01c1d1f53d6  \
xsa286-4.13/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
4ce310f802a3fcc55d704af755ef4f6d04e029cddc7df03827f7518d3a8faaf9  \
xsa286-4.13/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch \
36f1631c0880a615de48006fc5318c1b305be2ba5ed9c3cd0dd4ed82bd481bed  \
xsa286-4.14/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
4c10f102b71f26e86b2f5ef9d7bbb31000e389b64185cc23491a113311d70983  \
xsa286-4.14/0002-x86-mm-check-page-types-in-do_page_walk.patch \
ee227e37cb6de2bed50395a72c6ce9493bdfc0018ed2dc70c81334c98751564d  \
xsa286-4.14/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
4892411356967838e49a5063e80f6169446e460bc3bee8765fe8f4852b801851  \
xsa286-4.14/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
e840cb758e21be76cfcbd85be82a23bb91e2766d435d314047a62e3b333b7dcb  \
xsa286-4.14/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
16763d9a407ae30d79e1f849d1ae1e03e9a1d5dca15181f5d53f49e4eb4708a2  \
xsa286-4.14/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch \
34c00849d8cf56897d5c17c65d83b9ef2f0ce849766c9d7ca07b56de4a4c1307  \
xsa286/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch \
b82ff481239b72f7b364cc4a66363a812a9205b3e605b215b2267cd6f13f9a06  \
xsa286/0002-x86-mm-check-page-types-in-do_page_walk.patch \
527f7a74c0bee0f137b86b3b6475ff4648266f9dcaa5368df2f06b91efd9dbb8  \
xsa286/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch \
fd11aaacef7fd090fbc6409069b57d76a06a7e835b9a9eb5de8e15ace3ffb3e5  \
xsa286/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch \
dbfce6e2975d191b6568392593a835b5fc87d3869a5fad1917aecc11d1f2a62a  \
xsa286/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch \
5ad8a48a603b3fd195168d62d481516999422f86e5bbc65b01cde1ff1d262fb2  \
xsa286/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch
$

DEPLOYMENT DURING EMBARGO
=========================

Deployment of the patches and/or mitigations described above (or
others which are substantially similar) is permitted during the
embargo, even on public-facing systems with untrusted guest users and
administrators.

But: Distribution of updated software is prohibited (except to other
members of the predisclosure list).

Predisclosure list members who wish to deploy significantly different
patches and/or mitigations should contact the Xen Project Security
Team.

(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-----BEGIN PGP SIGNATURE-----

iQFABAEBCAAqFiEEI+MiLBRfRHX6gGCng/4UyVfoK9kFAl+OzqAMHHBncEB4ZW4u
b3JnAAoJEIP+FMlX6CvZf44IAITjC6CiRoB4BdmqcMwpQ2bJbC/XqNN/xV/DXTsg
p0sv0w3lXQQIOIzR7UG70IlA2vjW9LNX6k6qjqDGpOJQn5d2Pbj4jFkd11kq24IK
PWuoxpswQRaVu0CU5aPvvtAIkOu9v0wZ6//M3cpe81h1Pl+Mg413SSArP6qRFjhY
tVdzlBzOwqXYMH5prlvWG+td43D6e5UeMPZM4o4Rkovdjk3QPkpsBrElnlZInmqH
ntbFSTCCUcIpaLRY88yPOksTHXxPtdrDh2l9okNYhLhf7Ywk0z1SWkiueazT15t4
ytDw6OdmYjajqFhI9+FvyYLRiQK+twl6iPUXKarqEQyBbEU=
=ori5
-----END PGP SIGNATURE-----


["xsa286.meta" (application/octet-stream)]
["xsa286-4.10/0001-x86-shadow-drop-further-32-bit-relics.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: drop further 32-bit relics

PV guests don't ever get shadowed in other than 4-level mode anymore;
commit 5a3ce8f85e ("x86/shadow: drop stray name tags from
sh_{guest_get,map}_eff_l1e()") didn't go quite fare enough (and there's
a good chance that further cleanup opportunity exists, which I simply
didn't notice).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>

diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 778f7907f1..35e08f9097 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3804,8 +3804,7 @@ sh_update_linear_entries(struct vcpu *v)
 
 #elif SHADOW_PAGING_LEVELS == 3
 
-    /* PV: XXX
-     *
+    /*
      * HVM: To give ourselves a linear map of the  shadows, we need to
      * extend a PAE shadow to 4 levels.  We do this by  having a monitor
      * l3 in slot 0 of the monitor l4 table, and  copying the PAE l3
@@ -3814,7 +3813,7 @@ sh_update_linear_entries(struct vcpu *v)
      * the shadows.
      */
 
-    if ( shadow_mode_external(d) )
+    ASSERT(shadow_mode_external(d));
     {
         /* Install copies of the shadow l3es into the monitor l2 table
          * that maps SH_LINEAR_PT_VIRT_START. */
@@ -3860,8 +3859,6 @@ sh_update_linear_entries(struct vcpu *v)
         if ( v != current )
             unmap_domain_page(ml2e);
     }
-    else
-        domain_crash(d); /* XXX */
 
 #else
 #error this should not happen
@@ -4090,12 +4087,9 @@ sh_update_cr3(struct vcpu *v, int do_locking)
       * until the next CR3 write makes us refresh our cache. */
      ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
 
-     if ( shadow_mode_external(d) )
-         /* Find where in the page the l3 table is */
-         guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
-     else
-         /* PV guest: l3 is at the start of a page */
-         guest_idx = 0;
+     ASSERT(shadow_mode_external(d));
+     /* Find where in the page the l3 table is */
+     guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
 
      // Ignore the low 2 bits of guest_idx -- they are really just
      // cache control.
@@ -4106,17 +4100,13 @@ sh_update_cr3(struct vcpu *v, int do_locking)
          v->arch.paging.shadow.gl3e[i] = gl3e[i];
      unmap_domain_page(gl3e);
 #elif GUEST_PAGING_LEVELS == 2
-    if ( shadow_mode_external(d) || shadow_mode_translate(d) )
-    {
-        if ( v->arch.paging.shadow.guest_vtable )
-            unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
-        v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
-        /* Does this really need map_domain_page_global?  Handle the
-         * error properly if so. */
-        BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
-    }
-    else
-        v->arch.paging.shadow.guest_vtable = __linear_l2_table;
+    ASSERT(shadow_mode_external(d));
+    if ( v->arch.paging.shadow.guest_vtable )
+        unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
+    v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
+    /* Does this really need map_domain_page_global?  Handle the
+     * error properly if so. */
+    BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
 #else
 #error this should never happen
 #endif
@@ -4225,21 +4215,15 @@ sh_update_cr3(struct vcpu *v, int do_locking)
     {
         make_cr3(v, pagetable_get_mfn(v->arch.monitor_table));
     }
+#if SHADOW_PAGING_LEVELS == 4
     else // not shadow_mode_external...
     {
         /* We don't support PV except guest == shadow == config levels */
-        BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
-#if SHADOW_PAGING_LEVELS == 3
-        /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
-         * Don't use make_cr3 because (a) we know it's below 4GB, and
-         * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
-        ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
-        v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
-#else
-        /* 4-on-4: Just use the shadow top-level directly */
+        BUILD_BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+        /* Just use the shadow top-level directly */
         make_cr3(v, pagetable_get_mfn(v->arch.shadow_table[0]));
-#endif
     }
+#endif
 
 
     ///

["xsa286-4.10/0002-x86-shadow-don-t-pass-wrong-L4-MFN-to-guest_walk_tab.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: don't pass wrong L4 MFN to guest_walk_tables()

64-bit PV guest user mode runs on a different L4 table. Make sure
- the accessed bit gets set in the correct table (and in log-dirty
  mode the correct page gets marked dirty) during guest walks,
- the correct table gets audited by sh_audit_gw(),
- correct info gets logged by print_gw().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
master commit: db2af23d15077605f286d8ef86c8f5d9c1b8302a
master date: 2019-02-20 17:07:17 +0100

diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 35e08f9097..e9e4ded427 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,10 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                              INVALID_MFN,
                              v->arch.paging.shadow.gl3e
 #else /* 32 or 64 */
-                             pagetable_get_mfn(v->arch.guest_table),
+                             (((v->arch.flags & TF_kernel_mode) ||
+                               is_pv_32bit_vcpu(v))
+                              ? pagetable_get_mfn(v->arch.guest_table)
+                              : pagetable_get_mfn(v->arch.guest_table_user)),
                              v->arch.paging.shadow.guest_vtable
 #endif
                              );

["xsa286-4.10/0003-x86-shadow-don-t-use-map_domain_page_global-on-paths.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: don't use map_domain_page_global() on paths that may not
 fail

The assumption (according to one comment) and hope (according to
another) that map_domain_page_global() can't fail are both wrong on
large enough systems. Do away with the guest_vtable field altogether,
and establish / tear down the desired mapping as necessary.

The alternatives, discarded as being undesirable, would have been to
either crash the guest in sh_update_cr3() when the mapping fails, or to
bubble up an error indicator, which upper layers would have a hard time
to deal with (other than again by crashing the guest).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Tim Deegan <tim@xen.org>

diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index e9e4ded427..4f8cfd6212 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -175,18 +175,22 @@ static inline bool
 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                      uint32_t pfec)
 {
-    return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
 #if GUEST_PAGING_LEVELS == 3 /* PAE */
-                             INVALID_MFN,
-                             v->arch.paging.shadow.gl3e
+    return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
+                             INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
-                             (((v->arch.flags & TF_kernel_mode) ||
-                               is_pv_32bit_vcpu(v))
-                              ? pagetable_get_mfn(v->arch.guest_table)
-                              : pagetable_get_mfn(v->arch.guest_table_user)),
-                             v->arch.paging.shadow.guest_vtable
+    const struct domain *d = v->domain;
+    mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+                      ? pagetable_get_mfn(v->arch.guest_table)
+                      : pagetable_get_mfn(v->arch.guest_table_user));
+    void *root_map = map_domain_page(root_mfn);
+    bool ok = guest_walk_tables(v, p2m_get_hostp2m(d), va, gw, pfec,
+                                root_mfn, root_map);
+
+    unmap_domain_page(root_map);
+
+    return ok;
 #endif
-                             );
 }
 
 /* This validation is called with lock held, and after write permission
@@ -226,8 +230,9 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
     perfc_incr(shadow_check_gwalk);
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
+    l4p = map_domain_page(gw->l4mfn);
     mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
+    unmap_domain_page(l4p);
     l3p = map_domain_page(gw->l3mfn);
     mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
     unmap_domain_page(l3p);
@@ -235,13 +240,11 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
     mismatch |= (gw->l3e.l3 !=
                  v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
 #endif
+#endif
     l2p = map_domain_page(gw->l2mfn);
     mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
     unmap_domain_page(l2p);
-#else
-    l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
-    mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
-#endif
+
     if ( !(guest_can_use_l2_superpages(v) &&
            (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
     {
@@ -3884,7 +3887,8 @@ sh_update_linear_entries(struct vcpu *v)
 }
 
 
-/* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
+/*
+ * Removes vcpu->arch.shadow_table[].
  * Does all appropriate management/bookkeeping/refcounting/etc...
  */
 static void
@@ -3895,23 +3899,6 @@ sh_detach_old_tables(struct vcpu *v)
     int i = 0;
 
     ////
-    //// vcpu->arch.paging.shadow.guest_vtable
-    ////
-
-#if GUEST_PAGING_LEVELS == 3
-    /* PAE guests don't have a mapping of the guest top-level table */
-    ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
-#else
-    if ( v->arch.paging.shadow.guest_vtable )
-    {
-        if ( shadow_mode_external(d) || shadow_mode_translate(d) )
-            unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
-        v->arch.paging.shadow.guest_vtable = NULL;
-    }
-#endif // !NDEBUG
-
-
-    ////
     //// vcpu->arch.shadow_table[]
     ////
 
@@ -4068,28 +4055,10 @@ sh_update_cr3(struct vcpu *v, int do_locking)
 #endif
         gmfn = pagetable_get_mfn(v->arch.guest_table);
 
-
-    ////
-    //// vcpu->arch.paging.shadow.guest_vtable
-    ////
-#if GUEST_PAGING_LEVELS == 4
-    if ( shadow_mode_external(d) || shadow_mode_translate(d) )
-    {
-        if ( v->arch.paging.shadow.guest_vtable )
-            unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
-        v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
-        /* PAGING_LEVELS==4 implies 64-bit, which means that
-         * map_domain_page_global can't fail */
-        BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
-    }
-    else
-        v->arch.paging.shadow.guest_vtable = __linear_l4_table;
-#elif GUEST_PAGING_LEVELS == 3
+#if GUEST_PAGING_LEVELS == 3
      /* On PAE guests we don't use a mapping of the guest's own top-level
       * table.  We cache the current state of that table and shadow that,
       * until the next CR3 write makes us refresh our cache. */
-     ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
-
      ASSERT(shadow_mode_external(d));
      /* Find where in the page the l3 table is */
      guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
@@ -4102,16 +4071,6 @@ sh_update_cr3(struct vcpu *v, int do_locking)
      for ( i = 0; i < 4 ; i++ )
          v->arch.paging.shadow.gl3e[i] = gl3e[i];
      unmap_domain_page(gl3e);
-#elif GUEST_PAGING_LEVELS == 2
-    ASSERT(shadow_mode_external(d));
-    if ( v->arch.paging.shadow.guest_vtable )
-        unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
-    v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
-    /* Does this really need map_domain_page_global?  Handle the
-     * error properly if so. */
-    BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
-#else
-#error this should never happen
 #endif
 
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 5afaf6b9de..af03ebd772 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -135,8 +135,6 @@ struct shadow_vcpu {
     l3_pgentry_t l3table[4] __attribute__((__aligned__(32)));
     /* PAE guests: per-vcpu cache of the top-level *guest* entries */
     l3_pgentry_t gl3e[4] __attribute__((__aligned__(32)));
-    /* Non-PAE guests: pointer to guest top-level pagetable */
-    void *guest_vtable;
     /* Last MFN that we emulated a write to as unshadow heuristics. */
     unsigned long last_emulated_mfn_for_unshadow;
     /* MFN of the last shadow that we shot a writeable mapping in */

["xsa286-4.10/0004-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV

The flag is really only meant for those, both HVM and 32-bit PV tell
kernel from user mode based on CPL/RPL. Remove the all-question-marks
comment and let's be on the safe side here and also suppress clearing
for 32-bit PV (this isn't a fast path after all).

Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
assumes the new behavior.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 7ae7266c5f..341569397f 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -808,9 +808,15 @@ int arch_set_info_guest(
 
     v->fpu_initialised = !!(flags & VGCF_I387_VALID);
 
-    v->arch.flags &= ~TF_kernel_mode;
-    if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
-        v->arch.flags |= TF_kernel_mode;
+    v->arch.flags |= TF_kernel_mode;
+    if ( unlikely(!(flags & VGCF_in_kernel)) &&
+         /*
+          * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+          * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+          * shadow_one_bit_disable() for why that is.
+          */
+         !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+        v->arch.flags &= ~TF_kernel_mode;
 
     v->arch.vgc_flags = flags;
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 4f8cfd6212..734dc14322 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                              INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
-    mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+    mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
                       ? pagetable_get_mfn(v->arch.guest_table)
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
@@ -4049,7 +4049,7 @@ sh_update_cr3(struct vcpu *v, int do_locking)
                   v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 
 #if GUEST_PAGING_LEVELS == 4
-    if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+    if ( !(v->arch.flags & TF_kernel_mode) )
         gmfn = pagetable_get_mfn(v->arch.guest_table_user);
     else
 #endif

["xsa286-4.10/0005-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one at least is going to be re-used by a subsequent patch, and
splitting the L4 one then as well seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 34cd8457cf..c008999f96 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286-4.10/0006-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entries read to be guaranteed valid, transiently locking
the pages and validating their types is necessary. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c008999f96..7bcd6ebe31 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

["xsa286-4.10/0007-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 57490e18b0..ecd9dc462e 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -46,11 +46,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 7bcd6ebe31..763f641f61 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 704345335c..5c210275f7 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -593,7 +593,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.10/0008-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index ecd9dc462e..4ecd707061 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -65,27 +65,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index 622bb7dff0..ab6fff7008 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -345,7 +345,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 763f641f61..dcf5cc1586 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn_x(mfn));
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 5c210275f7..f4caf4f6b2 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -596,6 +596,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.10/0009-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails avoiding
to free the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 9a52276866..dc322b51e1 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     if ( !is_pv_domain(d) || is_idle_domain(d) )
         return 0;
@@ -244,8 +246,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -254,9 +258,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -347,7 +367,7 @@ unsigned long domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_pfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index fd734ff947..981c8447e9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5952,6 +5952,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);

["xsa286-4.10/0010-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While with the removal of all uses the vulnerability is actually fixed,
removing the creation of the linear mapping adds an extra layer of
protection. Similarly reducing visibility of the accessors mostly
eliminates the risk of undue re-introduction of uses of the linear
mappings.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 981c8447e9..16d50e81c3 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1786,9 +1786,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index b2dae7a473..8000a3e754 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -137,6 +137,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index dcf5cc1586..0cb2073eb5 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -828,9 +828,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 9ef9d03ca7..4670ab99f6 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 45ca742678..8d42b14f94 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -276,19 +276,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

["xsa286-4.11/0001-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV

The flag is really only meant for those, both HVM and 32-bit PV tell
kernel from user mode based on CPL/RPL. Remove the all-question-marks
comment and let's be on the safe side here and also suppress clearing
for 32-bit PV (this isn't a fast path after all).

Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
assumes the new behavior.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 35857dbe86..1d0ac81c5b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -804,9 +804,15 @@ int arch_set_info_guest(
 
     v->fpu_initialised = !!(flags & VGCF_I387_VALID);
 
-    v->arch.flags &= ~TF_kernel_mode;
-    if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
-        v->arch.flags |= TF_kernel_mode;
+    v->arch.flags |= TF_kernel_mode;
+    if ( unlikely(!(flags & VGCF_in_kernel)) &&
+         /*
+          * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+          * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+          * shadow_one_bit_disable() for why that is.
+          */
+         !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+        v->arch.flags &= ~TF_kernel_mode;
 
     v->arch.vgc_flags = flags;
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 8ab343d16e..a2ebb4943f 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                              INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
-    mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+    mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
                       ? pagetable_get_mfn(v->arch.guest_table)
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
@@ -4018,7 +4018,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
                   v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 
 #if GUEST_PAGING_LEVELS == 4
-    if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+    if ( !(v->arch.flags & TF_kernel_mode) )
         gmfn = pagetable_get_mfn(v->arch.guest_table_user);
     else
 #endif

["xsa286-4.11/0002-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one at least is going to be re-used by a subsequent patch, and
splitting the L4 one then as well seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 3bd157967a..e73daa55e4 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286-4.11/0003-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entry reads to be guaranteed valid, it is necessary to
transiently lock the pages and validate their types. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
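
The pattern applied at each level is: transiently lock the page, check
that it really has the page table type expected for that level, and only
then read the entry.  A minimal standalone sketch of that pattern, using
a made-up page_info and a plain test-and-set lock in place of Xen's
page_lock()/page_unlock() and PGT_* type handling:

#include <stdatomic.h>
#include <stdint.h>

enum toy_type { TOY_NOT_A_TABLE, TOY_L1_TABLE, TOY_L2_TABLE,
                TOY_L3_TABLE, TOY_L4_TABLE };

struct toy_page_info {
    atomic_flag lock;        /* initialise with ATOMIC_FLAG_INIT; stands in
                                for page_lock()/page_unlock() */
    enum toy_type type;      /* stands in for type_info & PGT_type_mask */
    uint64_t entries[512];   /* the page's contents, if it is a table */
};

/* Return entry @idx of @pg, or 0 ("empty") unless @pg really is a table
 * of type @expected. */
static uint64_t read_entry_checked(struct toy_page_info *pg,
                                   unsigned int idx, enum toy_type expected)
{
    uint64_t e = 0;

    if ( atomic_flag_test_and_set(&pg->lock) )
        return 0;            /* lock contended - caller sees an empty entry */

    if ( pg->type == expected )   /* only trust a validated page table */
        e = pg->entries[idx];

    atomic_flag_clear(&pg->lock);

    return e;
}

As in the patch, a contended lock or an unexpected page type simply
yields an empty entry to the caller.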

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index e73daa55e4..1ca9547d68 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

["xsa286-4.11/0004-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 80bf280fb2..ee08c13881 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 1ca9547d68..dfa33ba894 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7825691d06..afafe87fe7 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -585,7 +585,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.11/0005-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
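
The subtle part is the accessed-bit update in the new page_walk_get_l1e()
added below: the L1 entry is rewritten with _PAGE_ACCESSED set, but the
CPU may set _PAGE_DIRTY concurrently, so the cmpxchg tolerates exactly
that one change and retries at most once.  A minimal standalone sketch of
the same loop, using C11 atomics and made-up flag values in place of
Xen's cmpxchg() and l1e accessors:

#include <stdatomic.h>
#include <stdint.h>

#define PTE_PRESENT   (1u << 0)
#define PTE_ACCESSED  (1u << 5)
#define PTE_DIRTY     (1u << 6)

/* Read *ptep and, if present but not yet accessed, try to set the A bit. */
static uint64_t read_and_mark_accessed(_Atomic uint64_t *ptep)
{
    uint64_t old = atomic_load(ptep);
    uint64_t exp, new;

    if ( (old & (PTE_PRESENT | PTE_ACCESSED)) != PTE_PRESENT )
        return old;

    exp = old;
    new = old | PTE_ACCESSED;

    /*
     * Best effort, as in the patch: the only concurrent change tolerated
     * is the dirty bit becoming set, and the update is retried at most once.
     */
    for ( ; ; )
    {
        uint64_t cur = exp;          /* scratch copy - the CAS overwrites it */

        if ( atomic_compare_exchange_strong(ptep, &cur, new) )
            break;                   /* PTE updated */
        if ( new & PTE_DIRTY )
            break;                   /* something else changed - give up */
        exp |= PTE_DIRTY;
        new |= PTE_DIRTY;
    }

    return new;
}

Any other concurrent change simply makes the update fail, which is fine
here since setting the accessed bit is best effort only.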

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index ee08c13881..c70785d0cf 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index a3c0c2dd19..c9ee5156f8 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index dfa33ba894..cca7ea6e9d 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index afafe87fe7..423313ae3a 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.11/0006-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
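
The arithmetic behind the reworked MAPCACHE_L1ENT() below may be easier
to see in isolation: with the L2 table installed in its own last slot,
the L1 tables backing the mapcache appear as one contiguous array of
PTEs in the last 2MiB covered by that L2 table, so entry @idx is reached
by plain array indexing.  A standalone sketch of just that arithmetic,
using x86-64's usual constants and a made-up, suitably aligned base
address in place of MAPCACHE_VIRT_START:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define L2_SHIFT     21
#define ENTRIES      512                  /* PTEs per page-table page */
#define PTE_SIZE     8

int main(void)
{
    uint64_t base = 0xffff820000000000ULL; /* assumed, L3-slot-aligned start */
    unsigned int idx = 1234;               /* some mapcache slot */

    /* The VA which the reworked MAPCACHE_L1ENT(idx) dereferences. */
    uint64_t va = (base | ((uint64_t)(ENTRIES - 1) << L2_SHIFT)) +
                  (uint64_t)idx * PTE_SIZE;

    /* What the hardware page walk of that VA decodes to. */
    unsigned int l2_idx   = (va >> L2_SHIFT) & (ENTRIES - 1);
    unsigned int l1_idx   = (va >> PAGE_SHIFT) & (ENTRIES - 1);
    unsigned int byte_off = va & ((1u << PAGE_SHIFT) - 1);

    assert(l2_idx == ENTRIES - 1);        /* goes through the recursive slot, */
    assert(l1_idx == idx / ENTRIES);      /* so the data page is L1 table idx/512, */
    assert(byte_off == idx % ENTRIES * PTE_SIZE); /* and the offset picks PTE idx%512 */

    printf("MAPCACHE_L1ENT(%u) -> L1 table %u, entry %u\n",
           idx, l1_idx, idx % ENTRIES);

    return 0;
}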

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 0c24530ed9..d89fa27f8e 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 626768a950..8f975a747d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -6038,6 +6038,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);

["xsa286-4.11/0007-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While the vulnerability is actually fixed once all uses are removed, no
longer creating the linear mapping adds an extra layer of protection.
Similarly, reducing the visibility of the accessors mostly eliminates
the risk of uses of the linear mappings being unduly re-introduced.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
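
For background: the "linear page tables" being restricted here are the
trick of pointing an L4 slot back at the L4 table itself, which makes
every level of the current page tables visible at fixed virtual
addresses - exactly what the __linear_lN_table accessors encode.  A
standalone sketch of that address arithmetic, using slot 258
(LINEAR_PT_VIRT_START) purely as an example value:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define PTE_SIZE     8
#define SLOT_SHIFT   39                   /* VA bits covered by one L4 slot */
#define VADDR_BITS   48

/* Sign-extend a 48-bit virtual address into canonical 64-bit form. */
static uint64_t canonical(uint64_t va)
{
    return va & (1ULL << (VADDR_BITS - 1))
           ? va | ~((1ULL << VADDR_BITS) - 1) : va;
}

/* VA at which the L1 entry mapping @va appears, given recursive slot @slot. */
static uint64_t linear_l1e_va(unsigned int slot, uint64_t va)
{
    uint64_t pfn = (va & ((1ULL << VADDR_BITS) - 1)) >> PAGE_SHIFT;

    return canonical(((uint64_t)slot << SLOT_SHIFT) + pfn * PTE_SIZE);
}

int main(void)
{
    uint64_t va = 0x7f0012345678ULL;            /* arbitrary guest VA */
    uint64_t l1e_va = linear_l1e_va(258, va);   /* slot 258 as in Xen */
    uint64_t l2e_va = linear_l1e_va(258, l1e_va); /* applied twice: L2 entry */

    printf("L1 entry for %#llx is mapped at %#llx\n",
           (unsigned long long)va, (unsigned long long)l1e_va);
    printf("L2 entry for %#llx is mapped at %#llx\n",
           (unsigned long long)va, (unsigned long long)l2e_va);

    return 0;
}

It is this convenience which the preceding patches stopped relying on
outside the shadow code.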

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8f975a747d..10175764e8 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1755,9 +1755,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index c7fa18925b..1933a6a2a2 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -137,6 +137,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index cca7ea6e9d..d7551e594a 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 9ef9d03ca7..4670ab99f6 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

["xsa286-4.12/0001-x86-don-t-allow-clearing-of-TF_kernel_mode-for-other.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV

The flag is really only meant for those; both HVM and 32-bit PV tell
kernel mode from user mode based on CPL/RPL. Remove the
all-question-marks comment and, to be on the safe side, also suppress
clearing for 32-bit PV (this isn't a fast path after all).

Remove the no longer necessary is_pv_32bit_*() checks from
sh_update_cr3() and sh_walk_guest_tables(). Note that
shadow_one_bit_disable() already assumes the new behavior.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 019dc57459..607b346737 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -840,9 +840,15 @@ int arch_set_info_guest(
             return -EINVAL;
     }
 
-    v->arch.flags &= ~TF_kernel_mode;
-    if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
-        v->arch.flags |= TF_kernel_mode;
+    v->arch.flags |= TF_kernel_mode;
+    if ( unlikely(!(flags & VGCF_in_kernel)) &&
+         /*
+          * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+          * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+          * shadow_one_bit_disable() for why that is.
+          */
+         !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+        v->arch.flags &= ~TF_kernel_mode;
 
     v->arch.vgc_flags = flags;
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 3e5651d029..4ae8e05ec8 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                              INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
-    mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+    mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
                       ? pagetable_get_mfn(v->arch.guest_table)
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
@@ -4025,7 +4025,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
                   v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 
 #if GUEST_PAGING_LEVELS == 4
-    if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+    if ( !(v->arch.flags & TF_kernel_mode) )
         gmfn = pagetable_get_mfn(v->arch.guest_table_user);
     else
 #endif

["xsa286-4.12/0002-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 part at least is going to be re-used by a subsequent patch, and
splitting out the L4 part at the same time seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 7246ee50ef..9c89c7d076 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286-4.12/0003-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entry reads to be guaranteed valid, it is necessary to
transiently lock the pages and validate their types. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 9c89c7d076..c764237651 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

["xsa286-4.12/0004-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 2b0dadc8da..acebf9e957 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c764237651..062069a908 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 845556eb6d..504f3fdc14 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -585,7 +585,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.12/0005-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index acebf9e957..7624447246 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index e7a7179dda..ebac3f476a 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -360,7 +360,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 062069a908..7dbfabf267 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 504f3fdc14..0aed4c3b77 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

["xsa286-4.12/0006-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 4a07cfb18e..660bd06aaf 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index b4c90bd054..d234e191a6 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -6071,6 +6071,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);

["xsa286-4.12/0007-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While the vulnerability is actually fixed once all uses are removed, no
longer creating the linear mapping adds an extra layer of protection.
Similarly, reducing the visibility of the accessors mostly eliminates
the risk of uses of the linear mappings being unduly re-introduced.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d234e191a6..51826ecb14 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1786,9 +1786,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 580ef3e29e..f082f02670 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -135,6 +135,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 7dbfabf267..577938bc17 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index feeb456bee..01a56dcab0 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

["xsa286-4.13/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 part at least is going to be re-used by a subsequent patch, and
splitting out the L4 part at the same time seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index db4f035d8d..b1582b56fb 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286-4.13/0002-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entry reads to be guaranteed valid, it is necessary to
transiently lock the pages and validate their types. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index b1582b56fb..7d439639b7 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

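The hunks above apply the same discipline at every level of the walk: lock
the page, confirm it is still validated as a page table of the expected
level, and only then map and read the entry. The following stand-alone
sketch (not Xen code; all toy_* names are hypothetical stand-ins for
page_lock()/page_unlock() and the type_info check) shows the idea:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum toy_type { TOY_PGT_NONE, TOY_PGT_L1, TOY_PGT_L2 };

struct toy_page {
    enum toy_type type;     /* what the page is currently validated as */
    bool locked;            /* stand-in for page_lock()/page_unlock()  */
    uint64_t entries[512];  /* the page table contents                 */
};

static bool toy_page_lock(struct toy_page *pg)
{
    if (pg->locked)         /* lock held elsewhere: give up */
        return false;
    pg->locked = true;
    return true;
}

static void toy_page_unlock(struct toy_page *pg)
{
    pg->locked = false;
}

/* Read an L1 entry; yields 0 ("empty") unless the page really is an L1 table. */
static uint64_t toy_read_l1e(struct toy_page *pg, unsigned int idx)
{
    uint64_t e = 0;

    if (!toy_page_lock(pg))
        return 0;
    if (pg->type == TOY_PGT_L1)
        e = pg->entries[idx];
    toy_page_unlock(pg);

    return e;
}

int main(void)
{
    struct toy_page pg = { .type = TOY_PGT_L1 };

    pg.entries[3] = 0xabc;
    printf("while an L1 table: %#llx\n",
           (unsigned long long)toy_read_l1e(&pg, 3));

    pg.type = TOY_PGT_L2;   /* page got re-validated as something else */
    printf("after type change: %#llx\n",
           (unsigned long long)toy_read_l1e(&pg, 3));

    return 0;
}

Once the page's type no longer matches, the read degrades to an empty entry
instead of returning whatever now lives in the re-used page.
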
["xsa286-4.13/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 2b0dadc8da..acebf9e957 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 7d439639b7..670aa3f892 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 320c6cd196..cd3e7ec501 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -577,7 +577,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

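Note also that page_walk_get_l2e() deliberately refuses to descend through
superpages: an L3 entry that is present but has _PAGE_PSE set maps a 1GiB
page rather than an L2 table, so the walk yields an empty entry, mirroring
the "safe to read the l1e" flags check in map_guest_l1e(). A minimal
stand-alone model of that predicate (flag names and values are illustrative
x86 PTE bits, not taken from Xen's headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_PRESENT 0x001u   /* bit 0 of an x86 PTE        */
#define TOY_PAGE_PSE     0x080u   /* bit 7: superpage mapping   */

/* True only if the entry references a next-level page table. */
static bool points_to_table(uint32_t flags)
{
    return (flags & (TOY_PAGE_PRESENT | TOY_PAGE_PSE)) == TOY_PAGE_PRESENT;
}

int main(void)
{
    printf("not present:       %d\n", points_to_table(0));
    printf("present table:     %d\n", points_to_table(TOY_PAGE_PRESENT));
    printf("present superpage: %d\n",
           points_to_table(TOY_PAGE_PRESENT | TOY_PAGE_PSE));
    return 0;
}
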
["xsa286-4.13/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index acebf9e957..7624447246 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index a1bd473b29..43d33a1fd1 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index a920fb5e15..2bf4497a16 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 670aa3f892..c5686e0d25 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index cd3e7ec501..865db999c1 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -580,6 +580,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 

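The subtle part of page_walk_get_l1e() is the best-effort accessed-bit
update: the cmpxchg may fail because the CPU set the dirty bit on the r/o
PTE in the meantime, in which case the loop folds the dirty bit into both
the expected and the new value and retries exactly once. A stand-alone
model of that retry, using the GCC/Clang __atomic builtins and illustrative
bit values (not Xen code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PRESENT  0x001ull
#define TOY_ACCESSED 0x020ull
#define TOY_DIRTY    0x040ull

/* Best effort: try to mark the PTE accessed, tolerating a concurrently
 * appearing dirty bit; give up after that single concession. */
static void toy_set_accessed(uint64_t *slot, uint64_t seen)
{
    uint64_t expected = seen;
    uint64_t desired  = seen | TOY_ACCESSED;

    while (!__atomic_compare_exchange_n(slot, &expected, desired, false,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) &&
           !(desired & TOY_DIRTY))
    {
        expected = seen | TOY_DIRTY;   /* assume only the dirty bit appeared */
        desired |= TOY_DIRTY;
    }
}

int main(void)
{
    uint64_t pte  = TOY_PRESENT;
    uint64_t seen = pte;               /* value read while holding the lock */

    pte |= TOY_DIRTY;                  /* "hardware" sets D behind our back */
    toy_set_accessed(&pte, seen);
    printf("PTE is now %#llx (present, accessed and dirty)\n",
           (unsigned long long)pte);
    return 0;
}
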
["xsa286-4.13/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 4a07cfb18e..660bd06aaf 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 30dffb68e8..279664a83e 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -6031,6 +6031,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);

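The effect of the recursive entry installed by mapcache_domain_init() is
that, through the 2MiB window covered by the L2 table's last slot, the L1
tables referenced by that L2 appear back to back as one flat array of L1
entries, which is exactly what the reworked MAPCACHE_L1ENT() indexes. A
stand-alone sketch of the address arithmetic, with an assumed (not Xen's
actual) mapcache base address:

#include <stdint.h>
#include <stdio.h>

#define TOY_L2_SHIFT      21                      /* 2MiB per L2 entry */
#define TOY_L2_ENTRIES    512
#define TOY_PAGE_SHIFT    12
#define TOY_MC_VIRT_START 0xffff820000000000ull   /* hypothetical base */

/* Virtual address of the L1 entry for mapcache slot idx, as seen
 * through the self-referencing last L2 slot. */
static uint64_t toy_mapcache_l1ent(unsigned int idx)
{
    uint64_t window = TOY_MC_VIRT_START |
                      ((uint64_t)(TOY_L2_ENTRIES - 1) << TOY_L2_SHIFT);

    return window + (uint64_t)idx * sizeof(uint64_t);   /* 8-byte PTEs */
}

int main(void)
{
    unsigned int idx = 1000;
    uint64_t va = toy_mapcache_l1ent(idx);

    /* The MMU resolves this through L2 slot 511 (the recursive entry),
     * then "L1" slot idx/512 (really an L2 slot, selecting an L1 table
     * page), and finally byte offset (idx%512)*8 within that page. */
    printf("&MAPCACHE_L1ENT(%u) = %#llx (page offset %#llx)\n",
           idx, (unsigned long long)va,
           (unsigned long long)(va & ((1u << TOY_PAGE_SHIFT) - 1)));
    return 0;
}
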
["xsa286-4.13/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While the vulnerability is actually fixed by the removal of all uses,
removing the creation of the linear mapping adds an extra layer of
protection. Similarly, reducing the visibility of the accessors mostly
eliminates the risk of uses of the linear mappings being unduly
re-introduced.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 279664a83e..fa0f813d29 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1757,9 +1757,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 3217777921..b214087194 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -135,6 +135,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c5686e0d25..dcb20d1d9d 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 8d79a71398..9d587a076a 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

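For reference, the __linear_l*_table accessors now confined to
shadow/private.h rely on L4 slot 258 mapping the L4 table onto itself;
composing that self-reference once, twice, three or four times exposes the
L1, L2, L3 and L4 page-table levels at fixed virtual addresses. A
stand-alone model of the L1 case, using the standard 4-level x86-64 layout
purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PML4_SHIFT 39
#define TOY_SELF_SLOT  258ull                         /* slot 258 self-map  */
#define TOY_CANON(x)   ((x) | 0xffff000000000000ull)  /* sign-extend bit 47 */

#define TOY_LINEAR_PT_VIRT_START TOY_CANON(TOY_SELF_SLOT << TOY_PML4_SHIFT)

/* Address of the L1 entry mapping va, when walked through the self-map. */
static uint64_t toy_linear_l1e(uint64_t va)
{
    uint64_t idx = (va & 0x0000ffffffffffffull) >> TOY_PAGE_SHIFT;

    return TOY_LINEAR_PT_VIRT_START + idx * 8;        /* 8 bytes per PTE */
}

int main(void)
{
    printf("__linear_l1_table        = %#llx\n",
           (unsigned long long)TOY_LINEAR_PT_VIRT_START);
    printf("L1 entry for va 0x400000 = %#llx\n",
           (unsigned long long)toy_linear_l1e(0x400000));
    return 0;
}
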
["xsa286-4.14/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one, at least, is going to be re-used by a subsequent patch, and
splitting out the L4 one at the same time seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 48fd60a876..c25eb01e41 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286-4.14/0002-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entry reads to be guaranteed valid, the pages need to be
transiently locked and their types validated. Note that guest use of
linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c25eb01e41..6305cf6033 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

["xsa286-4.14/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 5d4cd00941..7be098f5ef 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 6305cf6033..71a8bfc024 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7e74996053..12ea812381 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -579,7 +579,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 /* Allocator functions for Xen pagetables. */
 void *alloc_xen_pagetable(void);

["xsa286-4.14/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 7be098f5ef..5e4081aecd 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index a1bd473b29..43d33a1fd1 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index 0eedb70002..ce31dd401d 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -349,7 +349,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 71a8bfc024..9e87a55174 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 12ea812381..da1a6f5712 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -582,6 +582,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 /* Allocator functions for Xen pagetables. */
 void *alloc_xen_pagetable(void);

["xsa286-4.14/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index b03728e18e..ed6a2bf081 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 82bc676553..582ea09725 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5953,6 +5953,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);

["xsa286-4.14/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While the vulnerability is actually fixed by the removal of all uses,
removing the creation of the linear mapping adds an extra layer of
protection. Similarly, reducing the visibility of the accessors mostly
eliminates the risk of uses of the linear mappings being unduly
re-introduced.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 582ea09725..57333bb120 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1682,9 +1682,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 3fd3f0617a..bb2f50cb6e 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -139,6 +139,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 9e87a55174..ce03f83f52 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -808,9 +808,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 665e9cc31d..17b8ea0cfd 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -197,7 +197,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index f632affaef..fd2574267c 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -294,19 +294,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

["xsa286/0001-x86-mm-split-L4-and-L3-parts-of-the-walk-out-of-do_p.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one, at least, is going to be re-used by a subsequent patch, and
splitting out the L4 one at the same time seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index bce1561e1a..c75bac0712 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -59,26 +59,47 @@ extern unsigned int compat_machine_to_phys_mapping[];
 
 #endif /* CONFIG_PV32 */
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;

["xsa286/0002-x86-mm-check-page-types-in-do_page_walk.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entry reads to be guaranteed valid, the pages need to be
transiently locked and their types validated. Note that guest use of
linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c75bac0712..c4a52a5c4c 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -61,15 +61,29 @@ extern unsigned int compat_machine_to_phys_mapping[];
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -77,14 +91,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -92,44 +118,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*

["xsa286/0003-x86-mm-avoid-using-linear-page-tables-in-map_guest_l.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 14cb0f2d4e..61bf736ed9 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index c4a52a5c4c..17c30565da 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -115,6 +115,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index deeba75a1c..101d402d62 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -569,7 +569,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 /* Allocator functions for Xen pagetables. */
 void *alloc_xen_pagetable(void);

["xsa286/0004-x86-mm-avoid-using-linear-page-tables-in-guest_get_e.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 61bf736ed9..ed10986d11 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index b1b66e46c8..837743d712 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index 7f6fbc92fb..8d0007ede5 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -342,7 +342,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 17c30565da..e1efef5c4c 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -143,6 +143,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 101d402d62..7017a193b8 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -572,6 +572,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 /* Allocator functions for Xen pagetables. */
 void *alloc_xen_pagetable(void);
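
[Editorial sketch, not part of the patch.]  The subtle part of the new
page_walk_get_l1e() is the best-effort accessed-bit update, which has to
tolerate CPUs that may set the dirty bit on read-only PTEs behind Xen's
back.  The following stand-alone C11 sketch models that cmpxchg pattern;
the names (pte_set_accessed, PRESENT/ACCESSED/DIRTY) are illustrative and
are not Xen's l1e_* accessors.

/*
 * Illustrative only -- not Xen code.  One cmpxchg attempt, plus one retry
 * that tolerates exactly one concurrent change: the dirty bit becoming
 * set by hardware on what Xen treats as a r/o mapping.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PRESENT  0x01u
#define ACCESSED 0x20u
#define DIRTY    0x40u

static void pte_set_accessed(_Atomic uint64_t *pte)
{
    uint64_t seen = atomic_load(pte);

    if ( (seen & (PRESENT | ACCESSED)) != PRESENT )
        return;                         /* not present, or nothing to do */

    uint64_t expect = seen;
    uint64_t want = seen | ACCESSED;

    /*
     * If the CAS fails, assume the only legitimate concurrent change is
     * DIRTY becoming set; fold it into both values and try exactly once
     * more (a second failure leaves the loop, as want now has DIRTY).
     */
    while ( !atomic_compare_exchange_strong(pte, &expect, want) &&
            !(want & DIRTY) )
    {
        expect = seen | DIRTY;
        want |= DIRTY;
    }
}

int main(void)
{
    _Atomic uint64_t pte = 0x1000 | PRESENT;

    pte_set_accessed(&pte);
    printf("pte = %#llx\n", (unsigned long long)atomic_load(&pte));
    return 0;
}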

["xsa286/0005-x86-mm-avoid-using-top-level-linear-page-tables-in-u.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in {,un}map_domain_page()

Move the page table recursion two levels down. This entails not freeing
the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index b03728e18e..ed6a2bf081 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8c8f054186..f188c0f67f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5944,6 +5944,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);
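
[Editorial sketch, not part of the patch.]  The recursive slot installed
by mapcache_domain_init() is what the reworked MAPCACHE_L1ENT() relies
on: every mapcache L1 entry becomes addressable at a fixed virtual
address inside the last 2MiB window of the region.  A small arithmetic
sketch follows; the MAPCACHE_VIRT_START value and the helper name are
placeholders, not Xen's actual layout.

/*
 * Illustrative only -- not Xen code.  With the mapcache L2 table mapped
 * into its own last slot, a virtual address whose L2 index is 511 makes
 * the hardware treat the L2 table as an L1 table, so the region's L1
 * page table entries show up at fixed, computable addresses.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT           12
#define PAGETABLE_ORDER      9
#define L2_PAGETABLE_SHIFT   (PAGE_SHIFT + PAGETABLE_ORDER)     /* 21  */
#define L2_PAGETABLE_ENTRIES (1 << PAGETABLE_ORDER)             /* 512 */

#define MAPCACHE_VIRT_START  0xffff820000000000ull   /* placeholder */

/* Virtual address of the idx-th L1 entry, via the recursive last L2 slot. */
static uint64_t mapcache_l1ent_addr(unsigned int idx)
{
    return (MAPCACHE_VIRT_START |
            ((uint64_t)(L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)) +
           idx * sizeof(uint64_t);
}

int main(void)
{
    for ( unsigned int idx = 0; idx < 3; idx++ )
        printf("MAPCACHE_L1ENT(%u) lives at %#llx\n", idx,
               (unsigned long long)mapcache_l1ent_addr(idx));
    return 0;
}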

["xsa286/0006-x86-mm-restrict-use-of-linear-page-tables-to-shadow-.patch" (application/octet-stream)]

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While the removal of all uses already fixes the vulnerability, no longer
creating the linear mapping adds an extra layer of protection. Similarly,
reducing the visibility of the accessors largely eliminates the risk of
uses of the linear mappings being unduly re-introduced.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index f188c0f67f..8c51c2913d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1677,9 +1677,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 992461d4bc..a7ae99f086 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -139,6 +139,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index e1efef5c4c..eaeb827099 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -832,9 +832,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index eb25fc4758..c6f8b18942 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -197,7 +197,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index f632affaef..fd2574267c 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -294,19 +294,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;
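
[Editorial sketch, not part of the patch.]  For reference, the
__linear_l{1,2,3,4}_table macros moved above encode repeated
self-application of the slot-258 recursive mapping: each level's tables
appear at the previous base plus the linear offset of
LINEAR_PT_VIRT_START at the next-finer granularity, times 8 bytes per
entry.  A stand-alone sketch of that address arithmetic follows;
CANONICAL, LINEAR_OFFSET and the simplified PML4_ADDR are made up here
and only valid for slots at or above 256.

/* Illustrative only -- not Xen code. */
#include <stdint.h>
#include <stdio.h>

#define L1_SHIFT 12
#define L2_SHIFT 21
#define L3_SHIFT 30
#define L4_SHIFT 39

#define CANONICAL            0xffff000000000000ull
#define PML4_ADDR(slot)      (CANONICAL | ((uint64_t)(slot) << L4_SHIFT))
#define LINEAR_PT_VIRT_START PML4_ADDR(258)

/* Offset of va counted in level-N entries, over the low 48 address bits. */
#define LINEAR_OFFSET(va, shift) (((va) & ~CANONICAL) >> (shift))

int main(void)
{
    uint64_t l1 = LINEAR_PT_VIRT_START;
    uint64_t l2 = l1 + LINEAR_OFFSET(LINEAR_PT_VIRT_START, L1_SHIFT) * 8;
    uint64_t l3 = l2 + LINEAR_OFFSET(LINEAR_PT_VIRT_START, L2_SHIFT) * 8;
    uint64_t l4 = l3 + LINEAR_OFFSET(LINEAR_PT_VIRT_START, L3_SHIFT) * 8;

    printf("__linear_l1_table at %#llx\n", (unsigned long long)l1);
    printf("__linear_l2_table at %#llx\n", (unsigned long long)l2);
    printf("__linear_l3_table at %#llx\n", (unsigned long long)l3);
    printf("__linear_l4_table at %#llx\n", (unsigned long long)l4);
    return 0;
}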

