author     Roger Pau Monné <royger@FreeBSD.org>	2019-03-12 15:02:35 +0000
committer  Roger Pau Monné <royger@FreeBSD.org>	2019-03-12 15:02:35 +0000
commit     803787f39fe9a2034d7df6796e7ad7c921002277 (patch)
tree       bf4c021f17f5179dcf36c3e7094b1358fc03cf02 /emulators/xen-kernel
parent     textproc/re-flex: update to 1.1.3 (diff)
emulators/xen-kernel: backport fixes and apply XSAs
Backport a couple of fixes critical for PVH dom0 and fixes for XSA-{284,287,290,292-294}.

Sponsored by:		Citrix Systems R&D
Reviewed by:		bapt
Differential revision:	https://reviews.freebsd.org/D19413

Notes:
    svn path=/head/; revision=495458
Diffstat (limited to 'emulators/xen-kernel')
-rw-r--r--  emulators/xen-kernel/Makefile | 25
-rw-r--r--  emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch | 115
-rw-r--r--  emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch | 39
-rw-r--r--  emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch | 101
-rw-r--r--  emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch | 210
-rw-r--r--  emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch | 319
-rw-r--r--  emulators/xen-kernel/files/xsa284.patch | 31
-rw-r--r--  emulators/xen-kernel/files/xsa287-4.11.patch | 328
-rw-r--r--  emulators/xen-kernel/files/xsa290-4.11-1.patch | 237
-rw-r--r--  emulators/xen-kernel/files/xsa290-4.11-2.patch | 71
-rw-r--r--  emulators/xen-kernel/files/xsa292.patch | 95
-rw-r--r--  emulators/xen-kernel/files/xsa293-4.11-1.patch | 317
-rw-r--r--  emulators/xen-kernel/files/xsa293-4.11-2.patch | 260
-rw-r--r--  emulators/xen-kernel/files/xsa294-4.11.patch | 71
14 files changed, 2218 insertions(+), 1 deletion(-)
diff --git a/emulators/xen-kernel/Makefile b/emulators/xen-kernel/Makefile
index 06455d4df721..ff6b4de3b3e1 100644
--- a/emulators/xen-kernel/Makefile
+++ b/emulators/xen-kernel/Makefile
@@ -2,7 +2,7 @@
PORTNAME= xen
PORTVERSION= 4.11.1
-PORTREVISION= 0
+PORTREVISION= 1
CATEGORIES= emulators
MASTER_SITES= http://downloads.xenproject.org/release/xen/${PORTVERSION}/
PKGNAMESUFFIX= -kernel
@@ -45,6 +45,29 @@ EXTRA_PATCHES+= ${FILESDIR}/0001-x86-mtrr-introduce-mask-to-get-VCNT-from-MTRRca
EXTRA_PATCHES+= ${FILESDIR}/0001-x86-replace-usage-in-the-linker-script.patch:-p1
# Fix PVH Dom0 build with shadow paging
EXTRA_PATCHES+= ${FILESDIR}/0001-x86-pvh-change-the-order-of-the-iommu-initialization.patch:-p1
+# Forward dom0 lapic EOIs to underlying hardware
+EXTRA_PATCHES+= ${FILESDIR}/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch:-p1
+# Fix deadlock in IO-APIC gsi mapping
+EXTRA_PATCHES+= ${FILESDIR}/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch:-p1
+# Fix for migration/save
+EXTRA_PATCHES+= ${FILESDIR}/0001-x86-mm-locks-remove-trailing-whitespace.patch:-p1 \
+ ${FILESDIR}/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch:-p1 \
+ ${FILESDIR}/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch:-p1
+
+# XSA-284
+EXTRA_PATCHES+= ${FILESDIR}/xsa284.patch:-p1
+# XSA-287
+EXTRA_PATCHES+= ${FILESDIR}/xsa287-4.11.patch:-p1
+# XSA-290
+EXTRA_PATCHES+= ${FILESDIR}/xsa290-4.11-1.patch:-p1 \
+ ${FILESDIR}/xsa290-4.11-2.patch:-p1
+# XSA-292
+EXTRA_PATCHES+= ${FILESDIR}/xsa292.patch:-p1
+# XSA-293
+EXTRA_PATCHES+= ${FILESDIR}/xsa293-4.11-1.patch:-p1 \
+ ${FILESDIR}/xsa293-4.11-2.patch:-p1
+# XSA-294
+EXTRA_PATCHES+= ${FILESDIR}/xsa294-4.11.patch:-p1
.include <bsd.port.options.mk>
diff --git a/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch b/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch
new file mode 100644
index 000000000000..e2258d28d369
--- /dev/null
+++ b/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch
@@ -0,0 +1,115 @@
+From 603ad88fe8a681a2c5408c3f432d7083dd1c41b1 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau@citrix.com>
+Date: Mon, 28 Jan 2019 15:22:45 +0100
+Subject: [PATCH] pvh/dom0: fix deadlock in GSI mapping
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current GSI mapping code can cause the following deadlock:
+
+(XEN) *** Dumping CPU0 host state: ***
+(XEN) ----[ Xen-4.12.0-rc x86_64 debug=y Tainted: C ]----
+[...]
+(XEN) Xen call trace:
+(XEN) [<ffff82d080239852>] vmac.c#_spin_lock_cb+0x32/0x70
+(XEN) [<ffff82d0802ed40f>] vmac.c#hvm_gsi_assert+0x2f/0x60 <- pick hvm.irq_lock
+(XEN) [<ffff82d080255cc9>] io.c#hvm_dirq_assist+0xd9/0x130 <- pick event_lock
+(XEN) [<ffff82d080255b4b>] io.c#dpci_softirq+0xdb/0x120
+(XEN) [<ffff82d080238ce6>] softirq.c#__do_softirq+0x46/0xa0
+(XEN) [<ffff82d08026f955>] domain.c#idle_loop+0x35/0x90
+(XEN)
+[...]
+(XEN) *** Dumping CPU3 host state: ***
+(XEN) ----[ Xen-4.12.0-rc x86_64 debug=y Tainted: C ]----
+[...]
+(XEN) Xen call trace:
+(XEN) [<ffff82d08023985d>] vmac.c#_spin_lock_cb+0x3d/0x70
+(XEN) [<ffff82d080281fc8>] vmac.c#allocate_and_map_gsi_pirq+0xc8/0x130 <- pick event_lock
+(XEN) [<ffff82d0802f44c0>] vioapic.c#vioapic_hwdom_map_gsi+0x80/0x130
+(XEN) [<ffff82d0802f4399>] vioapic.c#vioapic_write_redirent+0x119/0x1c0 <- pick hvm.irq_lock
+(XEN) [<ffff82d0802f4075>] vioapic.c#vioapic_write+0x35/0x40
+(XEN) [<ffff82d0802e96a2>] vmac.c#hvm_process_io_intercept+0xd2/0x230
+(XEN) [<ffff82d0802e9842>] vmac.c#hvm_io_intercept+0x22/0x50
+(XEN) [<ffff82d0802dbe9b>] emulate.c#hvmemul_do_io+0x21b/0x3c0
+(XEN) [<ffff82d0802db302>] emulate.c#hvmemul_do_io_buffer+0x32/0x70
+(XEN) [<ffff82d0802dcd29>] emulate.c#hvmemul_do_mmio_buffer+0x29/0x30
+(XEN) [<ffff82d0802dcc19>] emulate.c#hvmemul_phys_mmio_access+0xf9/0x1b0
+(XEN) [<ffff82d0802dc6d0>] emulate.c#hvmemul_linear_mmio_access+0xf0/0x180
+(XEN) [<ffff82d0802de971>] emulate.c#hvmemul_linear_mmio_write+0x21/0x30
+(XEN) [<ffff82d0802de742>] emulate.c#linear_write+0xa2/0x100
+(XEN) [<ffff82d0802dce15>] emulate.c#hvmemul_write+0xb5/0x120
+(XEN) [<ffff82d0802babba>] vmac.c#x86_emulate+0x132aa/0x149a0
+(XEN) [<ffff82d0802c04f9>] vmac.c#x86_emulate_wrapper+0x29/0x70
+(XEN) [<ffff82d0802db570>] emulate.c#_hvm_emulate_one+0x50/0x140
+(XEN) [<ffff82d0802e9e31>] vmac.c#hvm_emulate_one_insn+0x41/0x100
+(XEN) [<ffff82d080345066>] guest_4.o#sh_page_fault__guest_4+0x976/0xd30
+(XEN) [<ffff82d08030cc69>] vmac.c#vmx_vmexit_handler+0x949/0xea0
+(XEN) [<ffff82d08031411a>] vmac.c#vmx_asm_vmexit_handler+0xfa/0x270
+
+In order to solve it move the vioapic_hwdom_map_gsi outside of the
+locked region in vioapic_write_redirent. vioapic_hwdom_map_gsi will
+not access any of the vioapic fields, so there's no need to call the
+function holding the hvm.irq_lock.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Release-acked-by: Juergen Gross <jgross@suse.com>
+---
+ xen/arch/x86/hvm/vioapic.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
+index 2b74f92d51..2d71c33c1c 100644
+--- a/xen/arch/x86/hvm/vioapic.c
++++ b/xen/arch/x86/hvm/vioapic.c
+@@ -236,20 +236,6 @@ static void vioapic_write_redirent(
+
+ *pent = ent;
+
+- if ( is_hardware_domain(d) && unmasked )
+- {
+- int ret;
+-
+- ret = vioapic_hwdom_map_gsi(gsi, ent.fields.trig_mode,
+- ent.fields.polarity);
+- if ( ret )
+- {
+- /* Mask the entry again. */
+- pent->fields.mask = 1;
+- unmasked = 0;
+- }
+- }
+-
+ if ( gsi == 0 )
+ {
+ vlapic_adjust_i8259_target(d);
+@@ -266,6 +252,24 @@ static void vioapic_write_redirent(
+
+ spin_unlock(&d->arch.hvm.irq_lock);
+
++ if ( is_hardware_domain(d) && unmasked )
++ {
++ /*
++ * NB: don't call vioapic_hwdom_map_gsi while holding hvm.irq_lock
++ * since it can cause deadlocks as event_lock is taken by
++ * allocate_and_map_gsi_pirq, and that will invert the locking order
++ * used by other parts of the code.
++ */
++ int ret = vioapic_hwdom_map_gsi(gsi, ent.fields.trig_mode,
++ ent.fields.polarity);
++ if ( ret )
++ {
++ gprintk(XENLOG_ERR,
++ "unable to bind gsi %u to hardware domain: %d\n", gsi, ret);
++ unmasked = 0;
++ }
++ }
++
+ if ( gsi == 0 || unmasked )
+ pt_may_unmask_irq(d, NULL);
+ }
+--
+2.17.2 (Apple Git-113)
+
diff --git a/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch b/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch
new file mode 100644
index 000000000000..aaf3a2da22c7
--- /dev/null
+++ b/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch
@@ -0,0 +1,39 @@
+From 19d2bce1c3cbfdc636c142cdf0ae38795f2202dd Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau@citrix.com>
+Date: Thu, 14 Feb 2019 14:41:03 +0100
+Subject: [PATCH for-4.12] x86/dom0: propagate PVH vlapic EOIs to hardware
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Current check for MSI EIO is missing a special case for PVH Dom0,
+which doesn't have a hvm_irq_dpci struct but requires EIOs to be
+forwarded to the physical lapic for passed-through devices.
+
+Add a short-circuit to allow EOIs from PVH Dom0 to be propagated.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+---
+Cc: Jan Beulich <jbeulich@suse.com>
+Cc: Juergen Gross <jgross@suse.com>
+---
+ xen/drivers/passthrough/io.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
+index a6eb8a4336..4290c7c710 100644
+--- a/xen/drivers/passthrough/io.c
++++ b/xen/drivers/passthrough/io.c
+@@ -869,7 +869,8 @@ static int _hvm_dpci_msi_eoi(struct domain *d,
+
+ void hvm_dpci_msi_eoi(struct domain *d, int vector)
+ {
+- if ( !iommu_enabled || !hvm_domain_irq(d)->dpci )
++ if ( !iommu_enabled ||
++ (!hvm_domain_irq(d)->dpci && !is_hardware_domain(d)) )
+ return;
+
+ spin_lock(&d->event_lock);
+--
+2.17.2 (Apple Git-113)
+
diff --git a/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch b/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch
new file mode 100644
index 000000000000..6f6210f9ad03
--- /dev/null
+++ b/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch
@@ -0,0 +1,101 @@
+From 468937da985661e5cd1d6b2df6d6ab2d1fb1e5e4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 12 Mar 2019 12:21:03 +0100
+Subject: [PATCH 1/3] x86/mm-locks: remove trailing whitespace
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+No functional change.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+---
+ xen/arch/x86/mm/mm-locks.h | 24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
+index e5fceb2d2e..6c15b9a4cc 100644
+--- a/xen/arch/x86/mm/mm-locks.h
++++ b/xen/arch/x86/mm/mm-locks.h
+@@ -3,11 +3,11 @@
+ *
+ * Spinlocks used by the code in arch/x86/mm.
+ *
+- * Copyright (c) 2011 Citrix Systems, inc.
++ * Copyright (c) 2011 Citrix Systems, inc.
+ * Copyright (c) 2007 Advanced Micro Devices (Wei Huang)
+ * Copyright (c) 2006-2007 XenSource Inc.
+ * Copyright (c) 2006 Michael A Fetterman
+- *
++ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+@@ -41,7 +41,7 @@ static inline void mm_lock_init(mm_lock_t *l)
+ l->unlock_level = 0;
+ }
+
+-static inline int mm_locked_by_me(mm_lock_t *l)
++static inline int mm_locked_by_me(mm_lock_t *l)
+ {
+ return (l->lock.recurse_cpu == current->processor);
+ }
+@@ -67,7 +67,7 @@ do { \
+
+ static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec)
+ {
+- if ( !((mm_locked_by_me(l)) && rec) )
++ if ( !((mm_locked_by_me(l)) && rec) )
+ __check_lock_level(level);
+ spin_lock_recursive(&l->lock);
+ if ( l->lock.recurse_cnt == 1 )
+@@ -186,7 +186,7 @@ static inline void mm_unlock(mm_lock_t *l)
+ spin_unlock_recursive(&l->lock);
+ }
+
+-static inline void mm_enforce_order_unlock(int unlock_level,
++static inline void mm_enforce_order_unlock(int unlock_level,
+ unsigned short *recurse_count)
+ {
+ if ( recurse_count )
+@@ -310,7 +310,7 @@ declare_mm_rwlock(altp2m);
+ #define gfn_locked_by_me(p,g) p2m_locked_by_me(p)
+
+ /* PoD lock (per-p2m-table)
+- *
++ *
+ * Protects private PoD data structs: entry and cache
+ * counts, page lists, sweep parameters. */
+
+@@ -322,7 +322,7 @@ declare_mm_lock(pod)
+
+ /* Page alloc lock (per-domain)
+ *
+- * This is an external lock, not represented by an mm_lock_t. However,
++ * This is an external lock, not represented by an mm_lock_t. However,
+ * pod code uses it in conjunction with the p2m lock, and expecting
+ * the ordering which we enforce here.
+ * The lock is not recursive. */
+@@ -338,13 +338,13 @@ declare_mm_order_constraint(page_alloc)
+ * For shadow pagetables, this lock protects
+ * - all changes to shadow page table pages
+ * - the shadow hash table
+- * - the shadow page allocator
++ * - the shadow page allocator
+ * - all changes to guest page table pages
+ * - all changes to the page_info->tlbflush_timestamp
+- * - the page_info->count fields on shadow pages
+- *
+- * For HAP, it protects the NPT/EPT tables and mode changes.
+- *
++ * - the page_info->count fields on shadow pages
++ *
++ * For HAP, it protects the NPT/EPT tables and mode changes.
++ *
+ * It also protects the log-dirty bitmap from concurrent accesses (and
+ * teardowns, etc). */
+
+--
+2.17.2 (Apple Git-113)
+
diff --git a/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch b/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch
new file mode 100644
index 000000000000..86750bb16292
--- /dev/null
+++ b/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch
@@ -0,0 +1,210 @@
+From 45e260afe7ee0e6b18a7e64173a081eec6e056aa Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau@citrix.com>
+Date: Tue, 12 Mar 2019 12:24:37 +0100
+Subject: [PATCH 2/3] x86/mm-locks: convert some macros to inline functions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+And rename to have only one prefix underscore where applicable.
+
+No functional change.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+---
+ xen/arch/x86/mm/mm-locks.h | 98 ++++++++++++++++++++------------------
+ 1 file changed, 52 insertions(+), 46 deletions(-)
+
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
+index 6c15b9a4cc..d3497713e9 100644
+--- a/xen/arch/x86/mm/mm-locks.h
++++ b/xen/arch/x86/mm/mm-locks.h
+@@ -29,7 +29,6 @@
+
+ /* Per-CPU variable for enforcing the lock ordering */
+ DECLARE_PER_CPU(int, mm_lock_level);
+-#define __get_lock_level() (this_cpu(mm_lock_level))
+
+ DECLARE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
+
+@@ -46,43 +45,47 @@ static inline int mm_locked_by_me(mm_lock_t *l)
+ return (l->lock.recurse_cpu == current->processor);
+ }
+
++static inline int _get_lock_level(void)
++{
++ return this_cpu(mm_lock_level);
++}
++
+ /*
+ * If you see this crash, the numbers printed are order levels defined
+ * in this file.
+ */
+-#define __check_lock_level(l) \
+-do { \
+- if ( unlikely(__get_lock_level() > (l)) ) \
+- { \
+- printk("mm locking order violation: %i > %i\n", \
+- __get_lock_level(), (l)); \
+- BUG(); \
+- } \
+-} while(0)
+-
+-#define __set_lock_level(l) \
+-do { \
+- __get_lock_level() = (l); \
+-} while(0)
++static inline void _check_lock_level(int l)
++{
++ if ( unlikely(_get_lock_level() > l) )
++ {
++ printk("mm locking order violation: %i > %i\n", _get_lock_level(), l);
++ BUG();
++ }
++}
++
++static inline void _set_lock_level(int l)
++{
++ this_cpu(mm_lock_level) = l;
++}
+
+ static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec)
+ {
+ if ( !((mm_locked_by_me(l)) && rec) )
+- __check_lock_level(level);
++ _check_lock_level(level);
+ spin_lock_recursive(&l->lock);
+ if ( l->lock.recurse_cnt == 1 )
+ {
+ l->locker_function = func;
+- l->unlock_level = __get_lock_level();
++ l->unlock_level = _get_lock_level();
+ }
+ else if ( (unlikely(!rec)) )
+- panic("mm lock already held by %s", l->locker_function);
+- __set_lock_level(level);
++ panic("mm lock already held by %s\n", l->locker_function);
++ _set_lock_level(level);
+ }
+
+ static inline void _mm_enforce_order_lock_pre(int level)
+ {
+- __check_lock_level(level);
++ _check_lock_level(level);
+ }
+
+ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
+@@ -92,12 +95,12 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
+ {
+ if ( (*recurse_count)++ == 0 )
+ {
+- *unlock_level = __get_lock_level();
++ *unlock_level = _get_lock_level();
+ }
+ } else {
+- *unlock_level = __get_lock_level();
++ *unlock_level = _get_lock_level();
+ }
+- __set_lock_level(level);
++ _set_lock_level(level);
+ }
+
+
+@@ -118,12 +121,12 @@ static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level)
+ {
+ if ( !mm_write_locked_by_me(l) )
+ {
+- __check_lock_level(level);
++ _check_lock_level(level);
+ percpu_write_lock(p2m_percpu_rwlock, &l->lock);
+ l->locker = get_processor_id();
+ l->locker_function = func;
+- l->unlock_level = __get_lock_level();
+- __set_lock_level(level);
++ l->unlock_level = _get_lock_level();
++ _set_lock_level(level);
+ }
+ l->recurse_count++;
+ }
+@@ -134,13 +137,13 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
+ return;
+ l->locker = -1;
+ l->locker_function = "nobody";
+- __set_lock_level(l->unlock_level);
++ _set_lock_level(l->unlock_level);
+ percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
+ }
+
+ static inline void _mm_read_lock(mm_rwlock_t *l, int level)
+ {
+- __check_lock_level(level);
++ _check_lock_level(level);
+ percpu_read_lock(p2m_percpu_rwlock, &l->lock);
+ /* There's nowhere to store the per-CPU unlock level so we can't
+ * set the lock level. */
+@@ -181,7 +184,7 @@ static inline void mm_unlock(mm_lock_t *l)
+ if ( l->lock.recurse_cnt == 1 )
+ {
+ l->locker_function = "nobody";
+- __set_lock_level(l->unlock_level);
++ _set_lock_level(l->unlock_level);
+ }
+ spin_unlock_recursive(&l->lock);
+ }
+@@ -194,10 +197,10 @@ static inline void mm_enforce_order_unlock(int unlock_level,
+ BUG_ON(*recurse_count == 0);
+ if ( (*recurse_count)-- == 1 )
+ {
+- __set_lock_level(unlock_level);
++ _set_lock_level(unlock_level);
+ }
+ } else {
+- __set_lock_level(unlock_level);
++ _set_lock_level(unlock_level);
+ }
+ }
+
+@@ -287,21 +290,24 @@ declare_mm_lock(altp2mlist)
+
+ #define MM_LOCK_ORDER_altp2m 40
+ declare_mm_rwlock(altp2m);
+-#define p2m_lock(p) \
+- do { \
+- if ( p2m_is_altp2m(p) ) \
+- mm_write_lock(altp2m, &(p)->lock); \
+- else \
+- mm_write_lock(p2m, &(p)->lock); \
+- (p)->defer_flush++; \
+- } while (0)
+-#define p2m_unlock(p) \
+- do { \
+- if ( --(p)->defer_flush == 0 ) \
+- p2m_unlock_and_tlb_flush(p); \
+- else \
+- mm_write_unlock(&(p)->lock); \
+- } while (0)
++
++static inline void p2m_lock(struct p2m_domain *p)
++{
++ if ( p2m_is_altp2m(p) )
++ mm_write_lock(altp2m, &p->lock);
++ else
++ mm_write_lock(p2m, &p->lock);
++ p->defer_flush++;
++}
++
++static inline void p2m_unlock(struct p2m_domain *p)
++{
++ if ( --p->defer_flush == 0 )
++ p2m_unlock_and_tlb_flush(p);
++ else
++ mm_write_unlock(&p->lock);
++}
++
+ #define gfn_lock(p,g,o) p2m_lock(p)
+ #define gfn_unlock(p,g,o) p2m_unlock(p)
+ #define p2m_read_lock(p) mm_read_lock(p2m, &(p)->lock)
+--
+2.17.2 (Apple Git-113)
+
diff --git a/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch b/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch
new file mode 100644
index 000000000000..4927a4fd7cee
--- /dev/null
+++ b/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch
@@ -0,0 +1,319 @@
+From efce89c1df5969486bef82eec05223a4a6522d2d Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau@citrix.com>
+Date: Tue, 12 Mar 2019 12:25:21 +0100
+Subject: [PATCH 3/3] x86/mm-locks: apply a bias to lock levels for control
+ domain
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+paging_log_dirty_op function takes mm locks from a subject domain and
+then attempts to perform copy to operations against the caller domain
+in order to copy the result of the hypercall into the caller provided
+buffer.
+
+This works fine when the caller is a non-paging domain, but triggers a
+lock order panic when the caller is a paging domain due to the fact
+that at the point where the copy to operation is performed the subject
+domain paging lock is locked, and the copy operation requires
+locking the caller p2m lock which has a lower level.
+
+Fix this limitation by adding a bias to the level of control domain mm
+locks, so that the lower control domain mm lock always has a level
+greater than the higher unprivileged domain lock level. This allows
+locking the subject domain mm locks and then locking the control
+domain mm locks, while keeping the same lock ordering and the changes
+mostly confined to mm-locks.h.
+
+Note that so far only this flow (locking a subject domain locks and
+then the control domain ones) has been identified, but not all
+possible code paths have been inspected. Hence this solution attempts
+to be a non-intrusive fix for the problem at hand, without discarding
+further changes in the future if other valid code paths are found that
+require more complex lock level ordering.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+---
+ xen/arch/x86/mm/mm-locks.h | 119 +++++++++++++++++++++++--------------
+ xen/arch/x86/mm/p2m-pod.c | 5 +-
+ 2 files changed, 78 insertions(+), 46 deletions(-)
+
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
+index d3497713e9..d6c073dc5c 100644
+--- a/xen/arch/x86/mm/mm-locks.h
++++ b/xen/arch/x86/mm/mm-locks.h
+@@ -50,15 +50,35 @@ static inline int _get_lock_level(void)
+ return this_cpu(mm_lock_level);
+ }
+
++#define MM_LOCK_ORDER_MAX 64
++/*
++ * Return the lock level taking the domain bias into account. If the domain is
++ * privileged a bias of MM_LOCK_ORDER_MAX is applied to the lock level, so that
++ * mm locks that belong to a control domain can be acquired after having
++ * acquired mm locks of an unprivileged domain.
++ *
++ * This is required in order to use some hypercalls from a paging domain that
++ * take locks of a subject domain and then attempt to copy data to/from the
++ * caller domain.
++ */
++static inline int _lock_level(const struct domain *d, int l)
++{
++ ASSERT(l <= MM_LOCK_ORDER_MAX);
++
++ return l + (d && is_control_domain(d) ? MM_LOCK_ORDER_MAX : 0);
++}
++
+ /*
+ * If you see this crash, the numbers printed are order levels defined
+ * in this file.
+ */
+-static inline void _check_lock_level(int l)
++static inline void _check_lock_level(const struct domain *d, int l)
+ {
+- if ( unlikely(_get_lock_level() > l) )
++ int lvl = _lock_level(d, l);
++
++ if ( unlikely(_get_lock_level() > lvl) )
+ {
+- printk("mm locking order violation: %i > %i\n", _get_lock_level(), l);
++ printk("mm locking order violation: %i > %i\n", _get_lock_level(), lvl);
+ BUG();
+ }
+ }
+@@ -68,10 +88,11 @@ static inline void _set_lock_level(int l)
+ this_cpu(mm_lock_level) = l;
+ }
+
+-static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec)
++static inline void _mm_lock(const struct domain *d, mm_lock_t *l,
++ const char *func, int level, int rec)
+ {
+ if ( !((mm_locked_by_me(l)) && rec) )
+- _check_lock_level(level);
++ _check_lock_level(d, level);
+ spin_lock_recursive(&l->lock);
+ if ( l->lock.recurse_cnt == 1 )
+ {
+@@ -80,16 +101,17 @@ static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec)
+ }
+ else if ( (unlikely(!rec)) )
+ panic("mm lock already held by %s\n", l->locker_function);
+- _set_lock_level(level);
++ _set_lock_level(_lock_level(d, level));
+ }
+
+-static inline void _mm_enforce_order_lock_pre(int level)
++static inline void _mm_enforce_order_lock_pre(const struct domain *d, int level)
+ {
+- _check_lock_level(level);
++ _check_lock_level(d, level);
+ }
+
+-static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
+- unsigned short *recurse_count)
++static inline void _mm_enforce_order_lock_post(const struct domain *d, int level,
++ int *unlock_level,
++ unsigned short *recurse_count)
+ {
+ if ( recurse_count )
+ {
+@@ -100,7 +122,7 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
+ } else {
+ *unlock_level = _get_lock_level();
+ }
+- _set_lock_level(level);
++ _set_lock_level(_lock_level(d, level));
+ }
+
+
+@@ -117,16 +139,17 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l)
+ return (l->locker == get_processor_id());
+ }
+
+-static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level)
++static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
++ const char *func, int level)
+ {
+ if ( !mm_write_locked_by_me(l) )
+ {
+- _check_lock_level(level);
++ _check_lock_level(d, level);
+ percpu_write_lock(p2m_percpu_rwlock, &l->lock);
+ l->locker = get_processor_id();
+ l->locker_function = func;
+ l->unlock_level = _get_lock_level();
+- _set_lock_level(level);
++ _set_lock_level(_lock_level(d, level));
+ }
+ l->recurse_count++;
+ }
+@@ -141,9 +164,10 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
+ percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
+ }
+
+-static inline void _mm_read_lock(mm_rwlock_t *l, int level)
++static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
++ int level)
+ {
+- _check_lock_level(level);
++ _check_lock_level(d, level);
+ percpu_read_lock(p2m_percpu_rwlock, &l->lock);
+ /* There's nowhere to store the per-CPU unlock level so we can't
+ * set the lock level. */
+@@ -156,28 +180,32 @@ static inline void mm_read_unlock(mm_rwlock_t *l)
+
+ /* This wrapper uses the line number to express the locking order below */
+ #define declare_mm_lock(name) \
+- static inline void mm_lock_##name(mm_lock_t *l, const char *func, int rec)\
+- { _mm_lock(l, func, MM_LOCK_ORDER_##name, rec); }
++ static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \
++ const char *func, int rec) \
++ { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); }
+ #define declare_mm_rwlock(name) \
+- static inline void mm_write_lock_##name(mm_rwlock_t *l, const char *func) \
+- { _mm_write_lock(l, func, MM_LOCK_ORDER_##name); } \
+- static inline void mm_read_lock_##name(mm_rwlock_t *l) \
+- { _mm_read_lock(l, MM_LOCK_ORDER_##name); }
++ static inline void mm_write_lock_##name(const struct domain *d, \
++ mm_rwlock_t *l, const char *func) \
++ { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \
++ static inline void mm_read_lock_##name(const struct domain *d, \
++ mm_rwlock_t *l) \
++ { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); }
+ /* These capture the name of the calling function */
+-#define mm_lock(name, l) mm_lock_##name(l, __func__, 0)
+-#define mm_lock_recursive(name, l) mm_lock_##name(l, __func__, 1)
+-#define mm_write_lock(name, l) mm_write_lock_##name(l, __func__)
+-#define mm_read_lock(name, l) mm_read_lock_##name(l)
++#define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0)
++#define mm_lock_recursive(name, d, l) mm_lock_##name(d, l, __func__, 1)
++#define mm_write_lock(name, d, l) mm_write_lock_##name(d, l, __func__)
++#define mm_read_lock(name, d, l) mm_read_lock_##name(d, l)
+
+ /* This wrapper is intended for "external" locks which do not use
+ * the mm_lock_t types. Such locks inside the mm code are also subject
+ * to ordering constraints. */
+-#define declare_mm_order_constraint(name) \
+- static inline void mm_enforce_order_lock_pre_##name(void) \
+- { _mm_enforce_order_lock_pre(MM_LOCK_ORDER_##name); } \
+- static inline void mm_enforce_order_lock_post_##name( \
+- int *unlock_level, unsigned short *recurse_count) \
+- { _mm_enforce_order_lock_post(MM_LOCK_ORDER_##name, unlock_level, recurse_count); } \
++#define declare_mm_order_constraint(name) \
++ static inline void mm_enforce_order_lock_pre_##name(const struct domain *d) \
++ { _mm_enforce_order_lock_pre(d, MM_LOCK_ORDER_##name); } \
++ static inline void mm_enforce_order_lock_post_##name(const struct domain *d,\
++ int *unlock_level, unsigned short *recurse_count) \
++ { _mm_enforce_order_lock_post(d, MM_LOCK_ORDER_##name, unlock_level, \
++ recurse_count); }
+
+ static inline void mm_unlock(mm_lock_t *l)
+ {
+@@ -221,7 +249,7 @@ static inline void mm_enforce_order_unlock(int unlock_level,
+
+ #define MM_LOCK_ORDER_nestedp2m 8
+ declare_mm_lock(nestedp2m)
+-#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock)
++#define nestedp2m_lock(d) mm_lock(nestedp2m, d, &(d)->arch.nested_p2m_lock)
+ #define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock)
+
+ /* P2M lock (per-non-alt-p2m-table)
+@@ -260,9 +288,10 @@ declare_mm_rwlock(p2m);
+
+ #define MM_LOCK_ORDER_per_page_sharing 24
+ declare_mm_order_constraint(per_page_sharing)
+-#define page_sharing_mm_pre_lock() mm_enforce_order_lock_pre_per_page_sharing()
++#define page_sharing_mm_pre_lock() \
++ mm_enforce_order_lock_pre_per_page_sharing(NULL)
+ #define page_sharing_mm_post_lock(l, r) \
+- mm_enforce_order_lock_post_per_page_sharing((l), (r))
++ mm_enforce_order_lock_post_per_page_sharing(NULL, (l), (r))
+ #define page_sharing_mm_unlock(l, r) mm_enforce_order_unlock((l), (r))
+
+ /* Alternate P2M list lock (per-domain)
+@@ -275,7 +304,8 @@ declare_mm_order_constraint(per_page_sharing)
+
+ #define MM_LOCK_ORDER_altp2mlist 32
+ declare_mm_lock(altp2mlist)
+-#define altp2m_list_lock(d) mm_lock(altp2mlist, &(d)->arch.altp2m_list_lock)
++#define altp2m_list_lock(d) mm_lock(altp2mlist, d, \
++ &(d)->arch.altp2m_list_lock)
+ #define altp2m_list_unlock(d) mm_unlock(&(d)->arch.altp2m_list_lock)
+
+ /* P2M lock (per-altp2m-table)
+@@ -294,9 +324,9 @@ declare_mm_rwlock(altp2m);
+ static inline void p2m_lock(struct p2m_domain *p)
+ {
+ if ( p2m_is_altp2m(p) )
+- mm_write_lock(altp2m, &p->lock);
++ mm_write_lock(altp2m, p->domain, &p->lock);
+ else
+- mm_write_lock(p2m, &p->lock);
++ mm_write_lock(p2m, p->domain, &p->lock);
+ p->defer_flush++;
+ }
+
+@@ -310,7 +340,7 @@ static inline void p2m_unlock(struct p2m_domain *p)
+
+ #define gfn_lock(p,g,o) p2m_lock(p)
+ #define gfn_unlock(p,g,o) p2m_unlock(p)
+-#define p2m_read_lock(p) mm_read_lock(p2m, &(p)->lock)
++#define p2m_read_lock(p) mm_read_lock(p2m, (p)->domain, &(p)->lock)
+ #define p2m_read_unlock(p) mm_read_unlock(&(p)->lock)
+ #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock)
+ #define gfn_locked_by_me(p,g) p2m_locked_by_me(p)
+@@ -322,7 +352,7 @@ static inline void p2m_unlock(struct p2m_domain *p)
+
+ #define MM_LOCK_ORDER_pod 48
+ declare_mm_lock(pod)
+-#define pod_lock(p) mm_lock(pod, &(p)->pod.lock)
++#define pod_lock(p) mm_lock(pod, (p)->domain, &(p)->pod.lock)
+ #define pod_unlock(p) mm_unlock(&(p)->pod.lock)
+ #define pod_locked_by_me(p) mm_locked_by_me(&(p)->pod.lock)
+
+@@ -335,8 +365,9 @@ declare_mm_lock(pod)
+
+ #define MM_LOCK_ORDER_page_alloc 56
+ declare_mm_order_constraint(page_alloc)
+-#define page_alloc_mm_pre_lock() mm_enforce_order_lock_pre_page_alloc()
+-#define page_alloc_mm_post_lock(l) mm_enforce_order_lock_post_page_alloc(&(l), NULL)
++#define page_alloc_mm_pre_lock(d) mm_enforce_order_lock_pre_page_alloc(d)
++#define page_alloc_mm_post_lock(d, l) \
++ mm_enforce_order_lock_post_page_alloc(d, &(l), NULL)
+ #define page_alloc_mm_unlock(l) mm_enforce_order_unlock((l), NULL)
+
+ /* Paging lock (per-domain)
+@@ -356,9 +387,9 @@ declare_mm_order_constraint(page_alloc)
+
+ #define MM_LOCK_ORDER_paging 64
+ declare_mm_lock(paging)
+-#define paging_lock(d) mm_lock(paging, &(d)->arch.paging.lock)
++#define paging_lock(d) mm_lock(paging, d, &(d)->arch.paging.lock)
+ #define paging_lock_recursive(d) \
+- mm_lock_recursive(paging, &(d)->arch.paging.lock)
++ mm_lock_recursive(paging, d, &(d)->arch.paging.lock)
+ #define paging_unlock(d) mm_unlock(&(d)->arch.paging.lock)
+ #define paging_locked_by_me(d) mm_locked_by_me(&(d)->arch.paging.lock)
+
+diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
+index 631e9aec33..725a2921d9 100644
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -34,9 +34,10 @@
+ /* Enforce lock ordering when grabbing the "external" page_alloc lock */
+ static inline void lock_page_alloc(struct p2m_domain *p2m)
+ {
+- page_alloc_mm_pre_lock();
++ page_alloc_mm_pre_lock(p2m->domain);
+ spin_lock(&(p2m->domain->page_alloc_lock));
+- page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
++ page_alloc_mm_post_lock(p2m->domain,
++ p2m->domain->arch.page_alloc_unlock_level);
+ }
+
+ static inline void unlock_page_alloc(struct p2m_domain *p2m)
+--
+2.17.2 (Apple Git-113)
+
diff --git a/emulators/xen-kernel/files/xsa284.patch b/emulators/xen-kernel/files/xsa284.patch
new file mode 100644
index 000000000000..0b5dcd0f029c
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa284.patch
@@ -0,0 +1,31 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: gnttab: set page refcount for copy-on-grant-transfer
+
+Commit 5cc77f9098 ("32-on-64: Fix domain address-size clamping,
+implement"), which introduced this functionality, took care of clearing
+the old page's PGC_allocated, but failed to set the bit (and install the
+associated reference) on the newly allocated one. Furthermore the "mfn"
+local variable was never updated, and hence the wrong MFN was passed to
+guest_physmap_add_page() (and back to the destination domain) in this
+case, leading to an IOMMU mapping into an unowned page.
+
+Ideally the code would use assign_pages(), but the call to
+gnttab_prepare_for_transfer() sits in the middle of the actions
+mirroring that function.
+
+This is XSA-284.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: George Dunlap <george.dunlap@citrix.com>
+
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2183,6 +2183,8 @@ gnttab_transfer(
+ page->count_info &= ~(PGC_count_mask|PGC_allocated);
+ free_domheap_page(page);
+ page = new_page;
++ page->count_info = PGC_allocated | 1;
++ mfn = page_to_mfn(page);
+ }
+
+ spin_lock(&e->page_alloc_lock);
diff --git a/emulators/xen-kernel/files/xsa287-4.11.patch b/emulators/xen-kernel/files/xsa287-4.11.patch
new file mode 100644
index 000000000000..8563560e151f
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa287-4.11.patch
@@ -0,0 +1,328 @@
+From 67620c1ccb13f7b58645f48248ba1f408b021fdc Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Fri, 18 Jan 2019 15:00:34 +0000
+Subject: [PATCH] steal_page: Get rid of bogus struct page states
+
+The original rules for `struct page` required the following invariants
+at all times:
+
+- refcount > 0 implies owner != NULL
+- PGC_allocated implies refcount > 0
+
+steal_page, in a misguided attempt to protect against unknown races,
+violates both of these rules, thus introducing other races:
+
+- Temporarily, the count_info has the refcount go to 0 while
+ PGC_allocated is set
+
+- It explicitly returns the page PGC_allocated set, but owner == NULL
+ and page not on the page_list.
+
+The second one meant that page_get_owner_and_reference() could return
+NULL even after having successfully grabbed a reference on the page,
+leading the caller to leak the reference (since "couldn't get ref" and
+"got ref but no owner" look the same).
+
+Furthermore, rather than grabbing a page reference to ensure that the
+owner doesn't change under its feet, it appears to rely on holding
+d->page_alloc lock to prevent this.
+
+Unfortunately, this is ineffective: page->owner remains non-NULL for
+some time after the count has been set to 0; meaning that it would be
+entirely possible for the page to be freed and re-allocated to a
+different domain between the page_get_owner() check and the count_info
+check.
+
+Modify steal_page to instead follow the appropriate access discipline,
+taking the page through series of states similar to being freed and
+then re-allocated with MEMF_no_owner:
+
+- Grab an extra reference to make sure we don't race with anyone else
+ freeing the page
+
+- Drop both references and PGC_allocated atomically, so that (if
+successful), anyone else trying to grab a reference will fail
+
+- Attempt to reset Xen's mappings
+
+- Reset the rest of the state.
+
+Then, modify the two callers appropriately:
+
+- Leave count_info alone (it's already been cleared)
+- Call free_domheap_page() directly if appropriate
+- Call assign_pages() rather than open-coding a partial assign
+
+With all callers to assign_pages() now passing in pages with the
+type_info field clear, tighten the respective assertion there.
+
+This is XSA-287.
+
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 84 ++++++++++++++++++++++++++++------------
+ xen/common/grant_table.c | 20 +++++-----
+ xen/common/memory.c | 19 +++++----
+ xen/common/page_alloc.c | 2 +-
+ 4 files changed, 83 insertions(+), 42 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 6509035a5c..d8ff58c901 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -3966,70 +3966,106 @@ int donate_page(
+ return -EINVAL;
+ }
+
++/*
++ * Steal page will attempt to remove `page` from domain `d`. Upon
++ * return, `page` will be in a state similar to the state of a page
++ * returned from alloc_domheap_page() with MEMF_no_owner set:
++ * - refcount 0
++ * - type count cleared
++ * - owner NULL
++ * - page caching attributes cleaned up
++ * - removed from the domain's page_list
++ *
++ * If MEMF_no_refcount is not set, the domain's tot_pages will be
++ * adjusted. If this results in the page count falling to 0,
++ * put_domain() will be called.
++ *
++ * The caller should either call free_domheap_page() to free the
++ * page, or assign_pages() to put it back on some domain's page list.
++ */
+ int steal_page(
+ struct domain *d, struct page_info *page, unsigned int memflags)
+ {
+ unsigned long x, y;
+ bool drop_dom_ref = false;
+- const struct domain *owner = dom_xen;
++ const struct domain *owner;
++ int rc;
+
+ if ( paging_mode_external(d) )
+ return -EOPNOTSUPP;
+
+- spin_lock(&d->page_alloc_lock);
+-
+- if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
++ /* Grab a reference to make sure the page doesn't change under our feet */
++ rc = -EINVAL;
++ if ( !(owner = page_get_owner_and_reference(page)) )
+ goto fail;
+
++ if ( owner != d || is_xen_heap_page(page) )
++ goto fail_put;
++
+ /*
+- * We require there is just one reference (PGC_allocated). We temporarily
+- * drop this reference now so that we can safely swizzle the owner.
++ * We require there are exactly two references -- the one we just
++ * took, and PGC_allocated. We temporarily drop both these
++ * references so that the page becomes effectively non-"live" for
++ * the domain.
+ */
+ y = page->count_info;
+ do {
+ x = y;
+- if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
+- goto fail;
+- y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
++ if ( (x & (PGC_count_mask|PGC_allocated)) != (2 | PGC_allocated) )
++ goto fail_put;
++ y = cmpxchg(&page->count_info, x, x & ~(PGC_count_mask|PGC_allocated));
+ } while ( y != x );
+
+ /*
+- * With the sole reference dropped temporarily, no-one can update type
+- * information. Type count also needs to be zero in this case, but e.g.
+- * PGT_seg_desc_page may still have PGT_validated set, which we need to
+- * clear before transferring ownership (as validation criteria vary
+- * depending on domain type).
++ * NB this is safe even if the page ends up being given back to
++ * the domain, because the count is zero: subsequent mappings will
++ * cause the cache attributes to be re-instated inside
++ * get_page_from_l1e().
++ */
++ if ( (rc = cleanup_page_cacheattr(page)) )
++ {
++ /*
++ * Couldn't fixup Xen's mappings; put things the way we found
++ * it and return an error
++ */
++ page->count_info |= PGC_allocated | 1;
++ goto fail;
++ }
++
++ /*
++ * With the reference count now zero, nobody can grab references
++ * to do anything else with the page. Return the page to a state
++ * that it might be upon return from alloc_domheap_pages with
++ * MEMF_no_owner set.
+ */
++ spin_lock(&d->page_alloc_lock);
++
+ BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
+ PGT_pinned));
+ page->u.inuse.type_info = 0;
+-
+- /* Swizzle the owner then reinstate the PGC_allocated reference. */
+ page_set_owner(page, NULL);
+- y = page->count_info;
+- do {
+- x = y;
+- BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
+- } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
++ page_list_del(page, &d->page_list);
+
+ /* Unlink from original owner. */
+ if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
+ drop_dom_ref = true;
+- page_list_del(page, &d->page_list);
+
+ spin_unlock(&d->page_alloc_lock);
++
+ if ( unlikely(drop_dom_ref) )
+ put_domain(d);
++
+ return 0;
+
++ fail_put:
++ put_page(page);
+ fail:
+- spin_unlock(&d->page_alloc_lock);
+ gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn
+ " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
+ mfn_x(page_to_mfn(page)), d->domain_id,
+ owner ? owner->domain_id : DOMID_INVALID,
+ page->count_info, page->u.inuse.type_info);
+- return -EINVAL;
++ return rc;
+ }
+
+ static int __do_update_va_mapping(
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index c0585d33f4..656fad1b42 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2179,7 +2179,7 @@ gnttab_transfer(
+ rcu_unlock_domain(e);
+ put_gfn_and_copyback:
+ put_gfn(d, gop.mfn);
+- page->count_info &= ~(PGC_count_mask|PGC_allocated);
++ /* The count_info has already been cleaned */
+ free_domheap_page(page);
+ goto copyback;
+ }
+@@ -2202,10 +2202,9 @@ gnttab_transfer(
+
+ copy_domain_page(page_to_mfn(new_page), mfn);
+
+- page->count_info &= ~(PGC_count_mask|PGC_allocated);
++ /* The count_info has already been cleared */
+ free_domheap_page(page);
+ page = new_page;
+- page->count_info = PGC_allocated | 1;
+ mfn = page_to_mfn(page);
+ }
+
+@@ -2245,12 +2244,17 @@ gnttab_transfer(
+ */
+ spin_unlock(&e->page_alloc_lock);
+ okay = gnttab_prepare_for_transfer(e, d, gop.ref);
+- spin_lock(&e->page_alloc_lock);
+
+- if ( unlikely(!okay) || unlikely(e->is_dying) )
++ if ( unlikely(!okay || assign_pages(e, page, 0, MEMF_no_refcount)) )
+ {
+- bool_t drop_dom_ref = !domain_adjust_tot_pages(e, -1);
++ bool drop_dom_ref;
+
++ /*
++ * Need to grab this again to safely free our "reserved"
++ * page in the page total
++ */
++ spin_lock(&e->page_alloc_lock);
++ drop_dom_ref = !domain_adjust_tot_pages(e, -1);
+ spin_unlock(&e->page_alloc_lock);
+
+ if ( okay /* i.e. e->is_dying due to the surrounding if() */ )
+@@ -2263,10 +2267,6 @@ gnttab_transfer(
+ goto unlock_and_copyback;
+ }
+
+- page_list_add_tail(page, &e->page_list);
+- page_set_owner(page, e);
+-
+- spin_unlock(&e->page_alloc_lock);
+ put_gfn(d, gop.mfn);
+
+ TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
+diff --git a/xen/common/memory.c b/xen/common/memory.c
+index 4fb7962c79..f71163221f 100644
+--- a/xen/common/memory.c
++++ b/xen/common/memory.c
+@@ -675,20 +675,22 @@ static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg)
+ * Success! Beyond this point we cannot fail for this chunk.
+ */
+
+- /* Destroy final reference to each input page. */
++ /*
++ * These pages have already had owner and reference cleared.
++ * Do the final two steps: Remove from the physmap, and free
++ * them.
++ */
+ while ( (page = page_list_remove_head(&in_chunk_list)) )
+ {
+ unsigned long gfn;
+
+- if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
+- BUG();
+ mfn = page_to_mfn(page);
+ gfn = mfn_to_gmfn(d, mfn_x(mfn));
+ /* Pages were unshared above */
+ BUG_ON(SHARED_M2P(gfn));
+ if ( guest_physmap_remove_page(d, _gfn(gfn), mfn, 0) )
+ domain_crash(d);
+- put_page(page);
++ free_domheap_page(page);
+ }
+
+ /* Assign each output page to the domain. */
+@@ -761,13 +763,16 @@ static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg)
+ * chunks succeeded.
+ */
+ fail:
+- /* Reassign any input pages we managed to steal. */
++ /*
++ * Reassign any input pages we managed to steal. NB that if the assign
++ * fails again, we're on the hook for freeing the page, since we've already
++ * cleared PGC_allocated.
++ */
+ while ( (page = page_list_remove_head(&in_chunk_list)) )
+ if ( assign_pages(d, page, 0, MEMF_no_refcount) )
+ {
+ BUG_ON(!d->is_dying);
+- if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+- put_page(page);
++ free_domheap_page(page);
+ }
+
+ dying:
+diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
+index 482f0988f7..52da7762e3 100644
+--- a/xen/common/page_alloc.c
++++ b/xen/common/page_alloc.c
+@@ -2221,7 +2221,7 @@ int assign_pages(
+ for ( i = 0; i < (1 << order); i++ )
+ {
+ ASSERT(page_get_owner(&pg[i]) == NULL);
+- ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
++ ASSERT(!pg[i].count_info);
+ page_set_owner(&pg[i], d);
+ smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
+ pg[i].count_info = PGC_allocated | 1;
+--
+2.20.1
+
diff --git a/emulators/xen-kernel/files/xsa290-4.11-1.patch b/emulators/xen-kernel/files/xsa290-4.11-1.patch
new file mode 100644
index 000000000000..38ddb56cf29c
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa290-4.11-1.patch
@@ -0,0 +1,237 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: also allow L2 (un)validation to be preemptible
+
+Commit c612481d1c ("x86/mm: Plumbing to allow any PTE update to fail
+with -ERESTART") added assertions next to the {alloc,free}_l2_table()
+invocations to document (and validate in debug builds) that L2
+(un)validations are always preemptible.
+
+The assertion in free_page_type() was now observed to trigger when
+recursive L2 page tables get cleaned up.
+
+In particular put_page_from_l2e()'s assumption that _put_page_type()
+would always succeed is now wrong, resulting in a partially un-validated
+page left in a domain, which has no other means of getting cleaned up
+later on. If not causing any problems earlier, this would ultimately
+trigger the check for ->u.inuse.type_info having a zero count when
+freeing the page during cleanup after the domain has died.
+
+As a result it should be considered a mistake to not have extended
+preemption fully to L2 when it was added to L3/L4 table handling, which
+this change aims to correct.
+
+The validation side additions are done just for symmetry.
+
+This is part of XSA-290.
+
+Reported-by: Manuel Bouyer <bouyer@antioche.eu.org>
+Tested-by: Manuel Bouyer <bouyer@antioche.eu.org>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1126,7 +1126,7 @@ get_page_from_l1e(
+ define_get_linear_pagetable(l2);
+ static int
+ get_page_from_l2e(
+- l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
++ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
+ {
+ unsigned long mfn = l2e_get_pfn(l2e);
+ int rc;
+@@ -1141,7 +1141,8 @@ get_page_from_l2e(
+ return -EINVAL;
+ }
+
+- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
++ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
++ partial, false);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;
+
+@@ -1295,8 +1296,11 @@ void put_page_from_l1e(l1_pgentry_t l1e,
+ * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+-static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
++static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
++ int partial, bool defer)
+ {
++ int rc = 0;
++
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+ return 1;
+
+@@ -1311,13 +1315,27 @@ static int put_page_from_l2e(l2_pgentry_
+ else
+ {
+ struct page_info *pg = l2e_get_page(l2e);
+- int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn)));
++ struct page_info *ptpg = mfn_to_page(_mfn(pfn));
+
+- ASSERT(!rc);
+- put_page(pg);
++ if ( unlikely(partial > 0) )
++ {
++ ASSERT(!defer);
++ rc = _put_page_type(pg, true, ptpg);
++ }
++ else if ( defer )
++ {
++ current->arch.old_guest_ptpg = ptpg;
++ current->arch.old_guest_table = pg;
++ }
++ else
++ {
++ rc = _put_page_type(pg, true, ptpg);
++ if ( likely(!rc) )
++ put_page(pg);
++ }
+ }
+
+- return 0;
++ return rc;
+ }
+
+ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+@@ -1487,11 +1505,12 @@ static int alloc_l2_table(struct page_in
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+ unsigned int i;
+- int rc = 0;
++ int rc = 0, partial = page->partial_pte;
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
++ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
++ i++, partial = 0 )
+ {
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+ {
+@@ -1501,23 +1520,33 @@ static int alloc_l2_table(struct page_in
+ }
+
+ if ( !is_guest_l2_slot(d, type, i) ||
+- (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
++ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
+ continue;
+
+- if ( unlikely(rc == -ERESTART) )
++ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- break;
++ page->partial_pte = partial ?: 1;
+ }
+-
+- if ( rc < 0 )
++ else if ( rc == -EINTR && i )
++ {
++ page->nr_validated_ptes = i;
++ page->partial_pte = 0;
++ rc = -ERESTART;
++ }
++ else if ( rc < 0 && rc != -EINTR )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+- while ( i-- > 0 )
+- if ( is_guest_l2_slot(d, type, i) )
+- put_page_from_l2e(pl2e[i], pfn);
+- break;
++ if ( i )
++ {
++ page->nr_validated_ptes = i;
++ page->partial_pte = 0;
++ current->arch.old_guest_ptpg = NULL;
++ current->arch.old_guest_table = page;
++ }
+ }
++ if ( rc < 0 )
++ break;
+
+ pl2e[i] = adjust_guest_l2e(pl2e[i], d);
+ }
+@@ -1797,28 +1826,50 @@ static int free_l2_table(struct page_inf
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+- unsigned int i = page->nr_validated_ptes - 1;
+- int err = 0;
++ int rc = 0, partial = page->partial_pte;
++ unsigned int i = page->nr_validated_ptes - !partial;
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+- ASSERT(page->nr_validated_ptes);
+- do {
+- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+- put_page_from_l2e(pl2e[i], pfn) == 0 &&
+- i && hypercall_preempt_check() )
++ for ( ; ; )
++ {
++ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
++ rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
++ if ( rc < 0 )
++ break;
++
++ partial = 0;
++
++ if ( !i-- )
++ break;
++
++ if ( hypercall_preempt_check() )
+ {
+- page->nr_validated_ptes = i;
+- err = -ERESTART;
++ rc = -EINTR;
++ break;
+ }
+- } while ( !err && i-- );
++ }
+
+ unmap_domain_page(pl2e);
+
+- if ( !err )
++ if ( rc >= 0 )
++ {
+ page->u.inuse.type_info &= ~PGT_pae_xen_l2;
++ rc = 0;
++ }
++ else if ( rc == -ERESTART )
++ {
++ page->nr_validated_ptes = i;
++ page->partial_pte = partial ?: -1;
++ }
++ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
++ {
++ page->nr_validated_ptes = i + 1;
++ page->partial_pte = 0;
++ rc = -ERESTART;
++ }
+
+- return err;
++ return rc;
+ }
+
+ static int free_l3_table(struct page_info *page)
+@@ -2138,7 +2189,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
+ return -EBUSY;
+ }
+
+- if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
++ if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d, 0)) < 0) )
+ return rc;
+
+ nl2e = adjust_guest_l2e(nl2e, d);
+@@ -2157,7 +2208,8 @@ static int mod_l2_entry(l2_pgentry_t *pl
+ return -EBUSY;
+ }
+
+- put_page_from_l2e(ol2e, pfn);
++ put_page_from_l2e(ol2e, pfn, 0, true);
++
+ return rc;
+ }
+
diff --git a/emulators/xen-kernel/files/xsa290-4.11-2.patch b/emulators/xen-kernel/files/xsa290-4.11-2.patch
new file mode 100644
index 000000000000..9355e02f82ff
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa290-4.11-2.patch
@@ -0,0 +1,71 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: add explicit preemption checks to L3 (un)validation
+
+When recursive page tables are used at the L3 level, unvalidation of a
+single L4 table may incur unvalidation of two levels of L3 tables, i.e.
+a maximum iteration count of 512^3 for unvalidating an L4 table. The
+preemption check in free_l2_table() as well as the one in
+_put_page_type() may never be reached, so explicit checking is needed in
+free_l3_table().
+
+When recursive page tables are used at the L4 level, the iteration count
+at L4 alone is capped at 512^2. As soon as a present L3 entry is hit
+which itself needs unvalidation (and hence requiring another nested loop
+with 512 iterations), the preemption checks added here kick in, so no
+further preemption checking is needed at L4 (until we decide to permit
+5-level paging for PV guests).
+
+The validation side additions are done just for symmetry.
+
+This is part of XSA-290.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1581,6 +1581,13 @@ static int alloc_l3_table(struct page_in
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+ i++, partial = 0 )
+ {
++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
++ {
++ page->nr_validated_ptes = i;
++ rc = -ERESTART;
++ break;
++ }
++
+ if ( is_pv_32bit_domain(d) && (i == 3) )
+ {
+ if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
+@@ -1882,15 +1889,25 @@ static int free_l3_table(struct page_inf
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+- do {
++ for ( ; ; )
++ {
+ rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+ if ( rc < 0 )
+ break;
++
+ partial = 0;
+- if ( rc > 0 )
+- continue;
+- pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+- } while ( i-- );
++ if ( rc == 0 )
++ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
++
++ if ( !i-- )
++ break;
++
++ if ( hypercall_preempt_check() )
++ {
++ rc = -EINTR;
++ break;
++ }
++ }
+
+ unmap_domain_page(pl3e);
+
diff --git a/emulators/xen-kernel/files/xsa292.patch b/emulators/xen-kernel/files/xsa292.patch
new file mode 100644
index 000000000000..198fd4fdd4d8
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa292.patch
@@ -0,0 +1,95 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: properly flush TLB in switch_cr3_cr4()
+
+The CR3 values used for contexts run with PCID enabled uniformly have
+CR3.NOFLUSH set, resulting in the CR3 write itself to not cause any
+flushing at all. When the second CR4 write is skipped or doesn't do any
+flushing, there's nothing so far which would purge TLB entries which may
+have accumulated again if the PCID doesn't change; the "just in case"
+flush only affects the case where the PCID actually changes. (There may
+be particularly many TLB entries re-accumulated in case of a watchdog
+NMI kicking in during the critical time window.)
+
+Suppress the no-flush behavior of the CR3 write in this particular case.
+
+Similarly the second CR4 write may not cause any flushing of TLB entries
+established again while the original PCID was still in use - it may get
+performed because of unrelated bits changing. The flush of the old PCID
+needs to happen nevertheless.
+
+At the same time also eliminate a possible race with lazy context
+switch: Just like for CR4, CR3 may change at any time while interrupts
+are enabled, due to the __sync_local_execstate() invocation from the
+flush IPI handler. It is for that reason that the CR3 read, just like
+the CR4 one, must happen only after interrupts have been turned off.
+
+This is XSA-292.
+
+Reported-by: Sergey Dyasli <sergey.dyasli@citrix.com>
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Tested-by: Sergey Dyasli <sergey.dyasli@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+---
+v3: Adjust comments. Drop old_cr4 from the PGE check in the expression
+ controlling the invocation of invpcid_flush_single_context(), as PGE
+ is always clear there.
+v2: Decouple invpcid_flush_single_context() from 2nd CR4 write.
+
+--- a/xen/arch/x86/flushtlb.c
++++ b/xen/arch/x86/flushtlb.c
+@@ -103,9 +103,8 @@ static void do_tlb_flush(void)
+
+ void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
+ {
+- unsigned long flags, old_cr4;
++ unsigned long flags, old_cr4, old_pcid;
+ u32 t;
+- unsigned long old_pcid = cr3_pcid(read_cr3());
+
+ /* This non-reentrant function is sometimes called in interrupt context. */
+ local_irq_save(flags);
+@@ -133,15 +132,38 @@ void switch_cr3_cr4(unsigned long cr3, u
+ */
+ invpcid_flush_all_nonglobals();
+
++ /*
++ * If we don't change PCIDs, the CR3 write below needs to flush this very
++ * PCID, even when a full flush was performed above, as we are currently
++ * accumulating TLB entries again from the old address space.
++ * NB: Clearing the bit when we don't use PCID is benign (as it is clear
++ * already in that case), but allows the if() to be more simple.
++ */
++ old_pcid = cr3_pcid(read_cr3());
++ if ( old_pcid == cr3_pcid(cr3) )
++ cr3 &= ~X86_CR3_NOFLUSH;
++
+ write_cr3(cr3);
+
+ if ( old_cr4 != cr4 )
+ write_cr4(cr4);
+- else if ( old_pcid != cr3_pcid(cr3) )
+- /*
+- * Make sure no TLB entries related to the old PCID created between
+- * flushing the TLB and writing the new %cr3 value remain in the TLB.
+- */
++
++ /*
++ * Make sure no TLB entries related to the old PCID created between
++ * flushing the TLB and writing the new %cr3 value remain in the TLB.
++ *
++ * The write to CR4 just above has performed a wider flush in certain
++ * cases, which therefore get excluded here. Since that write is
++ * conditional, note in particular that it won't be skipped if PCIDE
++ * transitions from 1 to 0. This is because the CR4 write further up will
++ * have been skipped in this case, as PCIDE and PGE won't both be set at
++ * the same time.
++ *
++ * Note also that PGE is always clear in old_cr4.
++ */
++ if ( old_pcid != cr3_pcid(cr3) &&
++ !(cr4 & X86_CR4_PGE) &&
++ (old_cr4 & X86_CR4_PCIDE) <= (cr4 & X86_CR4_PCIDE) )
+ invpcid_flush_single_context(old_pcid);
+
+ post_flush(t);
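The core of the fix is that a CR3 write with the no-flush hint (bit 63) set leaves TLB entries tagged with the current PCID in place, so a switch that keeps the same PCID must drop the hint to get a flush. A minimal standalone sketch of that decision follows; X86_CR3_NOFLUSH, CR3_PCID_MASK and cr3_pcid() are redefined locally for illustration and are not taken from Xen's headers.

/* Toy model of the "suppress NOFLUSH when the PCID is unchanged" rule. */
#include <stdint.h>
#include <stdio.h>

#define X86_CR3_NOFLUSH (1ULL << 63)   /* hint: don't flush this PCID */
#define CR3_PCID_MASK   0xfffULL       /* PCID lives in CR3 bits 0-11 */

static uint64_t cr3_pcid(uint64_t cr3)
{
    return cr3 & CR3_PCID_MASK;
}

/*
 * Given the CR3 currently loaded and the CR3 about to be loaded, return
 * the value that should actually be written: if the PCID stays the same,
 * stale entries from the old address space would survive a no-flush
 * write, so the hint is dropped to force a flush.
 */
static uint64_t adjust_cr3_for_flush(uint64_t old_cr3, uint64_t new_cr3)
{
    if ( cr3_pcid(old_cr3) == cr3_pcid(new_cr3) )
        new_cr3 &= ~X86_CR3_NOFLUSH;

    return new_cr3;
}

int main(void)
{
    uint64_t old_cr3 = 0x1000ULL | 0x001;                     /* PCID 1 */
    uint64_t same    = X86_CR3_NOFLUSH | 0x2000ULL | 0x001;   /* PCID 1 */
    uint64_t changed = X86_CR3_NOFLUSH | 0x3000ULL | 0x002;   /* PCID 2 */

    printf("same PCID -> %#llx\n",
           (unsigned long long)adjust_cr3_for_flush(old_cr3, same));
    printf("new PCID  -> %#llx\n",
           (unsigned long long)adjust_cr3_for_flush(old_cr3, changed));
    return 0;
}

The same condition appears in switch_cr3_cr4() above, just phrased in terms of the value read back from %cr3 after interrupts have been disabled.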
diff --git a/emulators/xen-kernel/files/xsa293-4.11-1.patch b/emulators/xen-kernel/files/xsa293-4.11-1.patch
new file mode 100644
index 000000000000..0b8499fbbb15
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa293-4.11-1.patch
@@ -0,0 +1,317 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Rewrite guest %cr4 handling from scratch
+
+The PV cr4 logic is almost impossible to follow, and leaks bits into guest
+context which definitely shouldn't be visible (in particular, VMXE).
+
+The biggest problem, however, and the source of the complexity, is that it
+derives new real and guest cr4 values from the current value in hardware -
+this is context dependent and an inappropriate source of information.
+
+Rewrite the cr4 logic to be invariant of the current value in hardware.
+
+First of all, modify write_ptbase() to always use mmu_cr4_features for IDLE
+and HVM contexts. mmu_cr4_features *is* the correct value to use, and makes
+the ASSERT() obviously redundant.
+
+For PV guests, curr->arch.pv.ctrlreg[4] remains the guest's view of cr4, but
+all logic gets reworked in terms of this and mmu_cr4_features only.
+
+Two masks are introduced; bits which the guest has control over, and bits
+which are forwarded from Xen's settings. One guest-visible change here is
+that Xen's VMXE setting is no longer visible at all.
+
+pv_make_cr4() follows fairly closely from pv_guest_cr4_to_real_cr4(), but
+deliberately starts with mmu_cr4_features, and only alters the minimal subset
+of bits.
+
+The boot-time {compat_,}pv_cr4_mask variables are removed, as they are a
+remnant of the pre-CPUID policy days. pv_fixup_guest_cr4() gains a related
+derivation from the policy.
+
+Another guest-visible change here is that a 32bit PV guest can now flip
+FSGSBASE in its view of CR4. While the {RD,WR}{FS,GS}BASE instructions are
+unusable outside of a 64bit code segment, the ability to modify FSGSBASE
+matches real hardware behaviour, and avoids the need for any 32bit/64bit
+differences in the logic.
+
+Overall, this patch shouldn't have a practical change in guest behaviour.
+VMXE will disappear from view, and an inquisitive 32bit kernel can now see
+FSGSBASE changing, but this new logic is otherwise bug-compatible with before.
+
+This is part of XSA-293
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index b1e50d1..675152a 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -733,49 +733,6 @@ int arch_domain_soft_reset(struct domain *d)
+ return ret;
+ }
+
+-/*
+- * These are the masks of CR4 bits (subject to hardware availability) which a
+- * PV guest may not legitimiately attempt to modify.
+- */
+-static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask;
+-
+-static int __init init_pv_cr4_masks(void)
+-{
+- unsigned long common_mask = ~X86_CR4_TSD;
+-
+- /*
+- * All PV guests may attempt to modify TSD, DE and OSXSAVE.
+- */
+- if ( cpu_has_de )
+- common_mask &= ~X86_CR4_DE;
+- if ( cpu_has_xsave )
+- common_mask &= ~X86_CR4_OSXSAVE;
+-
+- pv_cr4_mask = compat_pv_cr4_mask = common_mask;
+-
+- /*
+- * 64bit PV guests may attempt to modify FSGSBASE.
+- */
+- if ( cpu_has_fsgsbase )
+- pv_cr4_mask &= ~X86_CR4_FSGSBASE;
+-
+- return 0;
+-}
+-__initcall(init_pv_cr4_masks);
+-
+-unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
+-{
+- unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
+- unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask;
+-
+- if ( (guest_cr4 & mask) != (hv_cr4 & mask) )
+- printk(XENLOG_G_WARNING
+- "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n",
+- current->domain->domain_id, v, hv_cr4, guest_cr4);
+-
+- return (hv_cr4 & mask) | (guest_cr4 & ~mask);
+-}
+-
+ #define xen_vcpu_guest_context vcpu_guest_context
+ #define fpu_ctxt fpu_ctxt.x
+ CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
+@@ -789,7 +746,7 @@ int arch_set_info_guest(
+ struct domain *d = v->domain;
+ unsigned long cr3_gfn;
+ struct page_info *cr3_page;
+- unsigned long flags, cr4;
++ unsigned long flags;
+ unsigned int i;
+ int rc = 0, compat;
+
+@@ -978,9 +935,8 @@ int arch_set_info_guest(
+ v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS;
+ v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
+
+- cr4 = v->arch.pv_vcpu.ctrlreg[4];
+- v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) :
+- real_cr4_to_pv_guest_cr4(mmu_cr4_features);
++ v->arch.pv_vcpu.ctrlreg[4] =
++ pv_fixup_guest_cr4(v, v->arch.pv_vcpu.ctrlreg[4]);
+
+ memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg));
+ for ( i = 0; i < 8; i++ )
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 6509035..08634b7 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -505,33 +505,13 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
+ v->arch.cr3 |= get_pcid_bits(v, false);
+ }
+
+-unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v)
+-{
+- const struct domain *d = v->domain;
+- unsigned long cr4;
+-
+- cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE;
+- cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP |
+- X86_CR4_OSXSAVE | X86_CR4_FSGSBASE);
+-
+- if ( d->arch.pv_domain.pcid )
+- cr4 |= X86_CR4_PCIDE;
+- else if ( !d->arch.pv_domain.xpti )
+- cr4 |= X86_CR4_PGE;
+-
+- cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0;
+-
+- return cr4;
+-}
+-
+ void write_ptbase(struct vcpu *v)
+ {
+ struct cpu_info *cpu_info = get_cpu_info();
+ unsigned long new_cr4;
+
+ new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v))
+- ? pv_guest_cr4_to_real_cr4(v)
+- : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE);
++ ? pv_make_cr4(v) : mmu_cr4_features;
+
+ if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti )
+ {
+@@ -550,8 +530,6 @@ void write_ptbase(struct vcpu *v)
+ switch_cr3_cr4(v->arch.cr3, new_cr4);
+ cpu_info->pv_cr3 = 0;
+ }
+-
+- ASSERT(is_pv_vcpu(v) || read_cr4() == mmu_cr4_features);
+ }
+
+ /*
+diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
+index b75ff6b..3965959 100644
+--- a/xen/arch/x86/pv/domain.c
++++ b/xen/arch/x86/pv/domain.c
+@@ -97,6 +97,52 @@ static void release_compat_l4(struct vcpu *v)
+ v->arch.guest_table_user = pagetable_null();
+ }
+
++unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4)
++{
++ const struct cpuid_policy *p = v->domain->arch.cpuid;
++
++ /* Discard attempts to set guest controllable bits outside of the policy. */
++ cr4 &= ~((p->basic.tsc ? 0 : X86_CR4_TSD) |
++ (p->basic.de ? 0 : X86_CR4_DE) |
++ (p->feat.fsgsbase ? 0 : X86_CR4_FSGSBASE) |
++ (p->basic.xsave ? 0 : X86_CR4_OSXSAVE));
++
++ /* Masks expected to be disjoint sets. */
++ BUILD_BUG_ON(PV_CR4_GUEST_MASK & PV_CR4_GUEST_VISIBLE_MASK);
++
++ /*
++ * A guest sees the policy subset of its own choice of guest controllable
++ * bits, and a subset of Xen's choice of certain hardware settings.
++ */
++ return ((cr4 & PV_CR4_GUEST_MASK) |
++ (mmu_cr4_features & PV_CR4_GUEST_VISIBLE_MASK));
++}
++
++unsigned long pv_make_cr4(const struct vcpu *v)
++{
++ const struct domain *d = v->domain;
++ unsigned long cr4 = mmu_cr4_features &
++ ~(X86_CR4_PCIDE | X86_CR4_PGE | X86_CR4_TSD);
++
++ /*
++ * PCIDE or PGE depends on the PCID/XPTI settings, but must not both be
++ * set, as it impacts the safety of TLB flushing.
++ */
++ if ( d->arch.pv_domain.pcid )
++ cr4 |= X86_CR4_PCIDE;
++ else if ( !d->arch.pv_domain.xpti )
++ cr4 |= X86_CR4_PGE;
++
++ /*
++ * TSD is needed if either the guest has elected to use it, or Xen is
++ * virtualising the TSC value the guest sees.
++ */
++ if ( d->arch.vtsc || (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) )
++ cr4 |= X86_CR4_TSD;
++
++ return cr4;
++}
++
+ int switch_compat(struct domain *d)
+ {
+ struct vcpu *v;
+@@ -191,7 +237,7 @@ int pv_vcpu_initialise(struct vcpu *v)
+ /* PV guests by default have a 100Hz ticker. */
+ v->periodic_period = MILLISECS(10);
+
+- v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
++ v->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(v, 0);
+
+ if ( is_pv_32bit_domain(d) )
+ {
+diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
+index ce2ec76..4abbc14 100644
+--- a/xen/arch/x86/pv/emul-priv-op.c
++++ b/xen/arch/x86/pv/emul-priv-op.c
+@@ -32,6 +32,7 @@
+ #include <asm/hypercall.h>
+ #include <asm/mc146818rtc.h>
+ #include <asm/p2m.h>
++#include <asm/pv/domain.h>
+ #include <asm/pv/traps.h>
+ #include <asm/shared.h>
+ #include <asm/traps.h>
+@@ -785,8 +786,8 @@ static int write_cr(unsigned int reg, unsigned long val,
+ }
+
+ case 4: /* Write CR4 */
+- curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+- write_cr4(pv_guest_cr4_to_real_cr4(curr));
++ curr->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
++ write_cr4(pv_make_cr4(curr));
+ ctxt_switch_levelling(curr);
+ return X86EMUL_OKAY;
+ }
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index ec81d78..c8aa8a5 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -610,17 +610,6 @@ bool update_secondary_system_time(struct vcpu *,
+ void vcpu_show_execution_state(struct vcpu *);
+ void vcpu_show_registers(const struct vcpu *);
+
+-/* Clean up CR4 bits that are not under guest control. */
+-unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4);
+-
+-/* Convert between guest-visible and real CR4 values. */
+-unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v);
+-
+-#define real_cr4_to_pv_guest_cr4(c) \
+- ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \
+- X86_CR4_OSXSAVE | X86_CR4_SMEP | \
+- X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE))
+-
+ #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS)
+
+ static inline struct vcpu_guest_context *alloc_vcpu_guest_context(void)
+diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h
+index 4fea764..4e4710c 100644
+--- a/xen/include/asm-x86/pv/domain.h
++++ b/xen/include/asm-x86/pv/domain.h
+@@ -59,6 +59,23 @@ int pv_vcpu_initialise(struct vcpu *v);
+ void pv_domain_destroy(struct domain *d);
+ int pv_domain_initialise(struct domain *d);
+
++/*
++ * Bits which a PV guest can toggle in its view of cr4. Some are loaded into
++ * hardware, while some are fully emulated.
++ */
++#define PV_CR4_GUEST_MASK \
++ (X86_CR4_TSD | X86_CR4_DE | X86_CR4_FSGSBASE | X86_CR4_OSXSAVE)
++
++/* Bits which a PV guest may observe from the real hardware settings. */
++#define PV_CR4_GUEST_VISIBLE_MASK \
++ (X86_CR4_PAE | X86_CR4_MCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT)
++
++/* Given a new cr4 value, construct the resulting guest-visible cr4 value. */
++unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4);
++
++/* Create a cr4 value to load into hardware, based on vcpu settings. */
++unsigned long pv_make_cr4(const struct vcpu *v);
++
+ #else /* !CONFIG_PV */
+
+ #include <xen/errno.h>
+@@ -68,6 +85,8 @@ static inline int pv_vcpu_initialise(struct vcpu *v) { return -EOPNOTSUPP; }
+ static inline void pv_domain_destroy(struct domain *d) {}
+ static inline int pv_domain_initialise(struct domain *d) { return -EOPNOTSUPP; }
+
++static inline unsigned long pv_make_cr4(const struct vcpu *v) { return ~0ul; }
++
+ #endif /* CONFIG_PV */
+
+ void paravirt_ctxt_switch_from(struct vcpu *v);
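The rewrite above reduces PV cr4 handling to two disjoint masks: bits the guest may toggle (filtered against its CPUID policy) and bits it merely observes from Xen's own setting. The sketch below models that composition outside of Xen; the CR4_* constants, struct policy and fixup_guest_cr4() are local stand-ins for illustration only, and the assert() plays the role of the BUILD_BUG_ON() disjointness check.

/* Toy model of "guest-controllable bits" vs "bits mirrored from Xen". */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define CR4_TSD       (1u << 2)
#define CR4_DE        (1u << 3)
#define CR4_PAE       (1u << 5)
#define CR4_MCE       (1u << 6)
#define CR4_OSXSAVE   (1u << 18)

/* Bits the guest may toggle in its own view of cr4. */
#define GUEST_MASK    (CR4_TSD | CR4_DE | CR4_OSXSAVE)
/* Bits the guest simply sees mirrored from the host setting. */
#define VISIBLE_MASK  (CR4_PAE | CR4_MCE)

struct policy { int has_de, has_xsave; };   /* stand-in CPUID policy */

static uint32_t fixup_guest_cr4(const struct policy *p,
                                uint32_t host_cr4, uint32_t guest_cr4)
{
    /* Drop attempts to set controllable bits the policy doesn't offer. */
    guest_cr4 &= ~((p->has_de ? 0 : CR4_DE) |
                   (p->has_xsave ? 0 : CR4_OSXSAVE));

    /* The two masks must never overlap, or a bit would have two owners. */
    assert((GUEST_MASK & VISIBLE_MASK) == 0);

    return (guest_cr4 & GUEST_MASK) | (host_cr4 & VISIBLE_MASK);
}

int main(void)
{
    struct policy p = { .has_de = 1, .has_xsave = 0 };
    uint32_t host = CR4_PAE | CR4_MCE | CR4_OSXSAVE;
    uint32_t attempt = CR4_TSD | CR4_OSXSAVE;   /* OSXSAVE not in policy */

    printf("guest view = %#x\n", fixup_guest_cr4(&p, host, attempt));
    return 0;
}

The real pv_fixup_guest_cr4() above filters four policy-controlled bits rather than two, but the shape of the computation is the same.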
diff --git a/emulators/xen-kernel/files/xsa293-4.11-2.patch b/emulators/xen-kernel/files/xsa293-4.11-2.patch
new file mode 100644
index 000000000000..a005021b1483
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa293-4.11-2.patch
@@ -0,0 +1,260 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Don't have %cr4.fsgsbase active behind a guest kernel's back
+
+Currently, a 64bit PV guest can appear to set and clear FSGSBASE in %cr4, but
+the bit remains set in hardware. Therefore, the {RD,WR}{FS,GS}BASE instructions
+are usable even when the guest kernel believes that they are disabled.
+
+The FSGSBASE feature isn't currently supported in Linux, and its context
+switch path has some optimisations which rely on userspace being unable to use
+the WR{FS,GS}BASE instructions. Xen's current behaviour undermines this
+expectation.
+
+In 64bit PV guest context, always load the guest kernel's setting of FSGSBASE
+into %cr4. This requires adjusting how Xen uses the {RD,WR}{FS,GS}BASE
+instructions.
+
+ * Delete the cpu_has_fsgsbase helper. It is no longer safe, as users need to
+ check %cr4 directly.
+ * The raw __rd{fs,gs}base() helpers are only safe to use when %cr4.fsgsbase
+ is set. Comment this property.
+ * The {rd,wr}{fs,gs}{base,shadow}() and read_msr() helpers are updated to use
+ the current %cr4 value to determine which mechanism to use.
+ * toggle_guest_mode() and save_segments() are updated to avoid reading
+ fs/gsbase if the values in hardware cannot be stale WRT struct vcpu. A
+ consequence of this is that the write_cr() path needs to cache the current
+ bases, as subsequent context switches will skip saving the values.
+ * write_cr4() is updated to ensure that the shadow %cr4.fsgsbase value is
+ observed in a safe way WRT the hardware setting, if an interrupt happens to
+ hit in the middle.
+ * pv_make_cr4() is updated for 64bit PV guests to use the guest kernel's
+ choice of FSGSBASE.
+
+This is part of XSA-293
+
+Reported-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 675152a..29f892c 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -1433,7 +1433,8 @@ static void save_segments(struct vcpu *v)
+ regs->fs = read_sreg(fs);
+ regs->gs = read_sreg(gs);
+
+- if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
++ /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
++ if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) )
+ {
+ v->arch.pv_vcpu.fs_base = __rdfsbase();
+ if ( v->arch.flags & TF_kernel_mode )
+diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
+index 3965959..228a174 100644
+--- a/xen/arch/x86/pv/domain.c
++++ b/xen/arch/x86/pv/domain.c
+@@ -140,6 +140,16 @@ unsigned long pv_make_cr4(const struct vcpu *v)
+ if ( d->arch.vtsc || (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) )
+ cr4 |= X86_CR4_TSD;
+
++ /*
++ * The {RD,WR}{FS,GS}BASE are only useable in 64bit code segments. While
++ * we must not have CR4.FSGSBASE set behind the back of a 64bit PV kernel,
++ * we do leave it set in 32bit PV context to speed up Xen's context switch
++ * path.
++ */
++ if ( !is_pv_32bit_domain(d) &&
++ !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_FSGSBASE) )
++ cr4 &= ~X86_CR4_FSGSBASE;
++
+ return cr4;
+ }
+
+@@ -375,7 +385,8 @@ void toggle_guest_mode(struct vcpu *v)
+ {
+ ASSERT(!is_pv_32bit_vcpu(v));
+
+- if ( cpu_has_fsgsbase )
++ /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ {
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
+index 4abbc14..312c1ee 100644
+--- a/xen/arch/x86/pv/emul-priv-op.c
++++ b/xen/arch/x86/pv/emul-priv-op.c
+@@ -786,6 +786,17 @@ static int write_cr(unsigned int reg, unsigned long val,
+ }
+
+ case 4: /* Write CR4 */
++ /*
++ * If this write will disable FSGSBASE, refresh Xen's idea of the
++ * guest bases now that they can no longer change.
++ */
++ if ( (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_FSGSBASE) &&
++ !(val & X86_CR4_FSGSBASE) )
++ {
++ curr->arch.pv_vcpu.fs_base = __rdfsbase();
++ curr->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
++ }
++
+ curr->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
+ write_cr4(pv_make_cr4(curr));
+ ctxt_switch_levelling(curr);
+@@ -835,14 +846,15 @@ static int read_msr(unsigned int reg, uint64_t *val,
+ case MSR_FS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+- *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
++ *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
++ : curr->arch.pv_vcpu.fs_base;
+ return X86EMUL_OKAY;
+
+ case MSR_GS_BASE:
+ if ( is_pv_32bit_domain(currd) )
+ break;
+- *val = cpu_has_fsgsbase ? __rdgsbase()
+- : curr->arch.pv_vcpu.gs_base_kernel;
++ *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
++ : curr->arch.pv_vcpu.gs_base_kernel;
+ return X86EMUL_OKAY;
+
+ case MSR_SHADOW_GS_BASE:
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index ecb0149..a353d76 100644
+--- a/xen/arch/x86/setup.c
++++ b/xen/arch/x86/setup.c
+@@ -1567,7 +1567,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+
+ cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+
+- if ( cpu_has_fsgsbase )
++ if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
+ set_in_cr4(X86_CR4_FSGSBASE);
+
+ if ( opt_invpcid && cpu_has_invpcid )
+diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
+index b237da1..861cb0a 100644
+--- a/xen/include/asm-x86/cpufeature.h
++++ b/xen/include/asm-x86/cpufeature.h
+@@ -90,7 +90,6 @@
+ #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+
+ /* CPUID level 0x00000007:0.ebx */
+-#define cpu_has_fsgsbase boot_cpu_has(X86_FEATURE_FSGSBASE)
+ #define cpu_has_bmi1 boot_cpu_has(X86_FEATURE_BMI1)
+ #define cpu_has_hle boot_cpu_has(X86_FEATURE_HLE)
+ #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
+diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
+index afbeb7f..1ba6ee3 100644
+--- a/xen/include/asm-x86/msr.h
++++ b/xen/include/asm-x86/msr.h
+@@ -120,6 +120,14 @@ static inline uint64_t rdtsc_ordered(void)
+ : "=a" (low), "=d" (high) \
+ : "c" (counter))
+
++/*
++ * On hardware supporting FSGSBASE, the value loaded into hardware is the
++ * guest kernel's choice for 64bit PV guests (Xen's choice for Idle, HVM and
++ * 32bit PV).
++ *
++ * Therefore, the {RD,WR}{FS,GS}BASE instructions are only safe to use if
++ * %cr4.fsgsbase is set.
++ */
+ static inline unsigned long __rdfsbase(void)
+ {
+ unsigned long base;
+@@ -150,7 +158,7 @@ static inline unsigned long rdfsbase(void)
+ {
+ unsigned long base;
+
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ return __rdfsbase();
+
+ rdmsrl(MSR_FS_BASE, base);
+@@ -162,7 +170,7 @@ static inline unsigned long rdgsbase(void)
+ {
+ unsigned long base;
+
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ return __rdgsbase();
+
+ rdmsrl(MSR_GS_BASE, base);
+@@ -174,7 +182,7 @@ static inline unsigned long rdgsshadow(void)
+ {
+ unsigned long base;
+
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ {
+ asm volatile ( "swapgs" );
+ base = __rdgsbase();
+@@ -188,7 +196,7 @@ static inline unsigned long rdgsshadow(void)
+
+ static inline void wrfsbase(unsigned long base)
+ {
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ #ifdef HAVE_AS_FSGSBASE
+ asm volatile ( "wrfsbase %0" :: "r" (base) );
+ #else
+@@ -200,7 +208,7 @@ static inline void wrfsbase(unsigned long base)
+
+ static inline void wrgsbase(unsigned long base)
+ {
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ #ifdef HAVE_AS_FSGSBASE
+ asm volatile ( "wrgsbase %0" :: "r" (base) );
+ #else
+@@ -212,7 +220,7 @@ static inline void wrgsbase(unsigned long base)
+
+ static inline void wrgsshadow(unsigned long base)
+ {
+- if ( cpu_has_fsgsbase )
++ if ( read_cr4() & X86_CR4_FSGSBASE )
+ {
+ asm volatile ( "swapgs\n\t"
+ #ifdef HAVE_AS_FSGSBASE
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 2bd9e69..8e253dc 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -305,11 +305,31 @@ static inline unsigned long read_cr4(void)
+
+ static inline void write_cr4(unsigned long val)
+ {
++ struct cpu_info *info = get_cpu_info();
++
+ /* No global pages in case of PCIDs enabled! */
+ ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
+
+- get_cpu_info()->cr4 = val;
+- asm volatile ( "mov %0,%%cr4" : : "r" (val) );
++ /*
++ * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's
++ * choice for 64bit PV guests, which impacts whether Xen can use the
++ * instructions.
++ *
++ * The {rd,wr}{fs,gs}base() helpers use info->cr4 to work out whether it
++ * is safe to execute the {RD,WR}{FS,GS}BASE instruction, falling back to
++ * the MSR path if not. Some users require interrupt safety.
++ *
++ * If FSGSBASE is currently or about to become clear, reflect this in
++ * info->cr4 before updating %cr4, so an interrupt which hits in the
++ * middle won't observe FSGSBASE set in info->cr4 but clear in %cr4.
++ */
++ info->cr4 = val & (info->cr4 | ~X86_CR4_FSGSBASE);
++
++ asm volatile ( "mov %[val], %%cr4"
++ : "+m" (info->cr4) /* Force ordering without a barrier. */
++ : [val] "r" (val) );
++
++ info->cr4 = val;
+ }
+
+ /* Clear and set 'TS' bit respectively */
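The subtle part of the write_cr4() change above is the ordering of the cached %cr4 value against the hardware write: the cache must never show FSGSBASE set while the hardware bit is clear, or an interrupt landing in that window could attempt {RD,WR}{FS,GS}BASE and fault. The model below demonstrates that ordering with an ordinary variable standing in for the real register; hw_cr4, cached_cr4 and cpu_write_cr4() are inventions for illustration, not Xen code.

/* Model of updating a cached cr4 so it never over-states FSGSBASE. */
#include <stdint.h>
#include <stdio.h>

#define CR4_FSGSBASE (1u << 16)

static volatile uint32_t hw_cr4;      /* stands in for the real %cr4 */
static volatile uint32_t cached_cr4;  /* per-CPU shadow used by fast paths */

static void cpu_write_cr4(uint32_t val)
{
    /*
     * Step 1: the cache only keeps FSGSBASE set if both the old cached
     * value and the new value have it set.  An interrupt landing here
     * therefore sees the bit clear and takes the safe MSR path, even if
     * hardware still has it set (when disabling) or doesn't yet (when
     * enabling).
     */
    cached_cr4 = val & (cached_cr4 | ~CR4_FSGSBASE);

    /* Step 2: the actual hardware update. */
    hw_cr4 = val;

    /* Step 3: only now may the cache reflect the full new value. */
    cached_cr4 = val;
}

int main(void)
{
    cached_cr4 = hw_cr4 = CR4_FSGSBASE;   /* start with the bit enabled */

    cpu_write_cr4(0);                     /* guest kernel disables it */
    printf("hw=%#x cache=%#x\n", (unsigned)hw_cr4, (unsigned)cached_cr4);

    cpu_write_cr4(CR4_FSGSBASE);          /* and re-enables it */
    printf("hw=%#x cache=%#x\n", (unsigned)hw_cr4, (unsigned)cached_cr4);
    return 0;
}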
diff --git a/emulators/xen-kernel/files/xsa294-4.11.patch b/emulators/xen-kernel/files/xsa294-4.11.patch
new file mode 100644
index 000000000000..a0784d928f35
--- /dev/null
+++ b/emulators/xen-kernel/files/xsa294-4.11.patch
@@ -0,0 +1,71 @@
+From: Jan Beulich <JBeulich@suse.com>
+Subject: x86/pv: _toggle_guest_pt() may not skip TLB flush for shadow mode guests
+
+For shadow mode guests (e.g. PV ones forced into that mode as L1TF
+mitigation, or during migration) update_cr3() -> sh_update_cr3() may
+result in a change to the (shadow) root page table (compared to the
+previous one when running the same vCPU with the same PCID). This can,
+first and foremost, be a result of memory pressure on the shadow memory
+pool of the domain. Shadow code legitimately relies on the original
+(prior to commit 5c81d260c2 ["xen/x86: use PCID feature"]) behavior of
+the subsequent CR3 write to flush the TLB of entries still left from
+walks with an earlier, different (shadow) root page table.
+
+Restore the flushing behavior, also for the second CR3 write on the exit
+path to guest context when XPTI is active. For the moment accept that
+this will introduce more flushes than are strictly necessary - no flush
+would be needed when the (shadow) root page table doesn't actually
+change, but this information isn't readily (i.e. without introducing a
+layering violation) available here.
+
+This is XSA-294.
+
+Reported-by: XXX PERSON <XXX EMAIL>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Tested-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
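In short, the no-flush CR3 optimisation is only safe while the TLB can only hold entries created from the same root page table; once shadow mode may swap the root behind the vCPU's back, both the direct CR3 write and the XPTI exit-path one have to flush. A small standalone sketch of that rule precedes the diff below; struct vcpu_model, prepare_cr3_writes() and the constants are hypothetical stand-ins, not Xen's actual types.

/* Toy model: under shadow paging, never use the no-flush CR3 hint. */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define CR3_NOFLUSH (1ULL << 63)

struct vcpu_model {
    uint64_t cr3;          /* value prepared by update_cr3(), hint included */
    uint64_t xpti_cr3;     /* value restored on exit to guest when XPTI is on */
    bool shadow_mode;      /* root page table may change behind our back */
};

/* Pick the values that will actually be loaded into hardware. */
static void prepare_cr3_writes(const struct vcpu_model *v,
                               uint64_t *load_now, uint64_t *load_on_exit)
{
    uint64_t cr3 = v->cr3, xpti = v->xpti_cr3;

    if ( v->shadow_mode )
    {
        /* The shadow root may differ from last time: force a flush. */
        cr3  &= ~CR3_NOFLUSH;
        xpti &= ~CR3_NOFLUSH;
    }

    *load_now = cr3;
    *load_on_exit = xpti;
}

int main(void)
{
    struct vcpu_model v = {
        .cr3 = CR3_NOFLUSH | 0x5000, .xpti_cr3 = CR3_NOFLUSH | 0x6000,
        .shadow_mode = true,
    };
    uint64_t now, on_exit;

    prepare_cr3_writes(&v, &now, &on_exit);
    printf("now=%#llx exit=%#llx\n",
           (unsigned long long)now, (unsigned long long)on_exit);
    return 0;
}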
+diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
+index b75ff6b..528413a 100644
+--- a/xen/arch/x86/pv/domain.c
++++ b/xen/arch/x86/pv/domain.c
+@@ -296,21 +296,35 @@ int pv_domain_initialise(struct domain *d)
+ static void _toggle_guest_pt(struct vcpu *v)
+ {
+ const struct domain *d = v->domain;
++ struct cpu_info *cpu_info = get_cpu_info();
++ unsigned long cr3;
+
+ v->arch.flags ^= TF_kernel_mode;
+ update_cr3(v);
+ if ( d->arch.pv_domain.xpti )
+ {
+- struct cpu_info *cpu_info = get_cpu_info();
+-
+ cpu_info->root_pgt_changed = true;
+ cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
+ (d->arch.pv_domain.pcid
+ ? get_pcid_bits(v, true) : 0);
+ }
+
+- /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
+- write_cr3(v->arch.cr3);
++ /*
++ * Don't flush user global mappings from the TLB. Don't tick TLB clock.
++ *
++ * In shadow mode, though, update_cr3() may need to be accompanied by a
++ * TLB flush (for just the incoming PCID), as the top level page table may
++ * have changed behind our backs. To be on the safe side, suppress the
++ * no-flush unconditionally in this case. The XPTI CR3 write, if enabled,
++ * will then need to be a flushing one too.
++ */
++ cr3 = v->arch.cr3;
++ if ( shadow_mode_enabled(d) )
++ {
++ cr3 &= ~X86_CR3_NOFLUSH;
++ cpu_info->pv_cr3 &= ~X86_CR3_NOFLUSH;
++ }
++ write_cr3(cr3);
+
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ return;