From 2b1f93a9e819f813f4da56a6b9b4d483712a7416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Fri, 10 Oct 2025 03:35:19 +0200 Subject: [PATCH] Apply fix for quick network reconnect after backend destroy This especially is relevant for paused VMs (like preloaded disposables). If backend was restarted while frontend was paused, qubesd will attempt to reconnect it on unpause, which will trigger the race condition. --- ...n-xenbus-better-handle-backend-crash.patch | 126 ++++++++++++++++++ kernel.spec.in | 1 + 2 files changed, 127 insertions(+) create mode 100644 0001-xen-xenbus-better-handle-backend-crash.patch diff --git a/0001-xen-xenbus-better-handle-backend-crash.patch b/0001-xen-xenbus-better-handle-backend-crash.patch new file mode 100644 index 00000000..ebf473a1 --- /dev/null +++ b/0001-xen-xenbus-better-handle-backend-crash.patch @@ -0,0 +1,126 @@ +From c2b6a8e0c215ad574f49001994ab1ef661df57e1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Thu, 9 Oct 2025 14:51:04 +0200 +Subject: [PATCH] xen/xenbus: better handle backend crash +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When the backend domain crashes, coordinated device cleanup is not +possible (as it involves waiting for the backend state change). In that +case, toolstack forcefully removes frontend xenstore entries. +xenbus_dev_changed() handles this case, and triggers device cleanup. +It's possible that toolstack manages to connect new device in that +place, before xenbus_dev_changed() notices the old one is missing. If +that happens, new one won't be probed and will forever remain in +XenbusStateInitialising. + +Fix this by checking backend-id and if it changes, consider it +unplug+plug operation. It's important that cleanup on such unplug +doesn't modify xenstore entries (especially the "state" key) as it +belong to the new device to be probed - changing it would derail +establishing connection to the new backend (most likely, closing the +device before it was even connected). Handle this case by setting new +xenbus_device->vanished flag to true, and check it before changing state +entry. + +And even if xenbus_dev_changed() correctly detects the device was +forcefully removed, the cleanup handling is still racy. Since this whole +handling doesn't happend in a single xenstore transaction, it's possible +that toolstack might put a new device there already. Avoid re-creating +the state key (which in the case of loosing the race would actually +close newly attached device). + +The problem does not apply to frontend domain crash, as this case +involves coordinated cleanup. + +Problem originally reported at +https://lore.kernel.org/xen-devel/aOZvivyZ9YhVWDLN@mail-itl/T/#t, +including reproduction steps. + +Signed-off-by: Marek Marczykowski-Górecki +--- +I considered re-using one of existing fields instead of a new +xenbus_device->vanished, but I wasn't sure if that would work better. +Setting xenbus_device->nodename to NULL would prevent few other places +using it (including some log messages). Setting xenbus_device->otherend +might have less unintentional impact, but logically it doesn't feel +correct. +--- + drivers/xen/xenbus/xenbus_client.c | 2 ++ + drivers/xen/xenbus/xenbus_probe.c | 25 +++++++++++++++++++++++++ + include/xen/xenbus.h | 1 + + 3 files changed, 28 insertions(+) + +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index e73ec225d4a61..ce2f49d9aa4ad 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -275,6 +275,8 @@ __xenbus_switch_state(struct xenbus_device *dev, + */ + int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) + { ++ if (dev->vanished) ++ return 0; + return __xenbus_switch_state(dev, state, 0); + } + +diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c +index 86fe6e7790566..3c3e56b544976 100644 +--- a/drivers/xen/xenbus/xenbus_probe.c ++++ b/drivers/xen/xenbus/xenbus_probe.c +@@ -444,6 +444,9 @@ static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) + info.dev = NULL; + bus_for_each_dev(bus, NULL, &info, cleanup_dev); + if (info.dev) { ++ dev_warn(&info.dev->dev, ++ "device forcefully removed from xenstore\n"); ++ info.dev->vanished = true; + device_unregister(&info.dev->dev); + put_device(&info.dev->dev); + } +@@ -659,6 +662,28 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) + return; + + dev = xenbus_device_find(root, &bus->bus); ++ /* Backend domain crash results in not coordinated frontend removal, ++ * without going through XenbusStateClosing. Check if the device ++ * wasn't replaced to point at another backend in the meantime. ++ */ ++ if (dev && !strncmp(node, "device/", sizeof("device/")-1)) { ++ int backend_id; ++ int err = xenbus_gather(XBT_NIL, root, ++ "backend-id", "%i", &backend_id, ++ NULL); ++ if (!err && backend_id != dev->otherend_id) { ++ /* It isn't the same device, assume the old one ++ * vanished and new one needs to be probed. ++ */ ++ dev_warn(&dev->dev, ++ "backend-id mismatch (%d != %d), reconnecting\n", ++ backend_id, dev->otherend_id); ++ dev->vanished = true; ++ device_unregister(&dev->dev); ++ put_device(&dev->dev); ++ dev = NULL; ++ } ++ } + if (!dev) + xenbus_probe_node(bus, type, root); + else +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index 7dab04cf4a36c..43a5335f1d5a3 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -87,6 +87,7 @@ struct xenbus_device { + struct completion down; + struct work_struct work; + struct semaphore reclaim_sem; ++ bool vanished; + + /* Event channel based statistics and settings. */ + atomic_t event_channels; +-- +2.51.0 + diff --git a/kernel.spec.in b/kernel.spec.in index e7ada317..f2fa13d6 100644 --- a/kernel.spec.in +++ b/kernel.spec.in @@ -145,6 +145,7 @@ Patch27: 0001-amdgpu-timeout.patch Patch30: 0004-pvops-respect-removable-xenstore-flag-for-block-devi.patch Patch31: 0001-PCI-add-a-reset-quirk-for-Intel-I219LM-ethernet-adap.patch Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch +Patch33: 0001-xen-xenbus-better-handle-backend-crash.patch # S0ix support: Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch