diff --git a/0001-xen-xenbus-better-handle-backend-crash.patch b/0001-xen-xenbus-better-handle-backend-crash.patch new file mode 100644 index 00000000..ebf473a1 --- /dev/null +++ b/0001-xen-xenbus-better-handle-backend-crash.patch @@ -0,0 +1,126 @@ +From c2b6a8e0c215ad574f49001994ab1ef661df57e1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Thu, 9 Oct 2025 14:51:04 +0200 +Subject: [PATCH] xen/xenbus: better handle backend crash +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When the backend domain crashes, coordinated device cleanup is not +possible (as it involves waiting for the backend state change). In that +case, toolstack forcefully removes frontend xenstore entries. +xenbus_dev_changed() handles this case, and triggers device cleanup. +It's possible that toolstack manages to connect new device in that +place, before xenbus_dev_changed() notices the old one is missing. If +that happens, new one won't be probed and will forever remain in +XenbusStateInitialising. + +Fix this by checking backend-id and if it changes, consider it +unplug+plug operation. It's important that cleanup on such unplug +doesn't modify xenstore entries (especially the "state" key) as it +belong to the new device to be probed - changing it would derail +establishing connection to the new backend (most likely, closing the +device before it was even connected). Handle this case by setting new +xenbus_device->vanished flag to true, and check it before changing state +entry. + +And even if xenbus_dev_changed() correctly detects the device was +forcefully removed, the cleanup handling is still racy. Since this whole +handling doesn't happend in a single xenstore transaction, it's possible +that toolstack might put a new device there already. Avoid re-creating +the state key (which in the case of loosing the race would actually +close newly attached device). + +The problem does not apply to frontend domain crash, as this case +involves coordinated cleanup. + +Problem originally reported at +https://lore.kernel.org/xen-devel/aOZvivyZ9YhVWDLN@mail-itl/T/#t, +including reproduction steps. + +Signed-off-by: Marek Marczykowski-Górecki +--- +I considered re-using one of existing fields instead of a new +xenbus_device->vanished, but I wasn't sure if that would work better. +Setting xenbus_device->nodename to NULL would prevent few other places +using it (including some log messages). Setting xenbus_device->otherend +might have less unintentional impact, but logically it doesn't feel +correct. +--- + drivers/xen/xenbus/xenbus_client.c | 2 ++ + drivers/xen/xenbus/xenbus_probe.c | 25 +++++++++++++++++++++++++ + include/xen/xenbus.h | 1 + + 3 files changed, 28 insertions(+) + +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index e73ec225d4a61..ce2f49d9aa4ad 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -275,6 +275,8 @@ __xenbus_switch_state(struct xenbus_device *dev, + */ + int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) + { ++ if (dev->vanished) ++ return 0; + return __xenbus_switch_state(dev, state, 0); + } + +diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c +index 86fe6e7790566..3c3e56b544976 100644 +--- a/drivers/xen/xenbus/xenbus_probe.c ++++ b/drivers/xen/xenbus/xenbus_probe.c +@@ -444,6 +444,9 @@ static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) + info.dev = NULL; + bus_for_each_dev(bus, NULL, &info, cleanup_dev); + if (info.dev) { ++ dev_warn(&info.dev->dev, ++ "device forcefully removed from xenstore\n"); ++ info.dev->vanished = true; + device_unregister(&info.dev->dev); + put_device(&info.dev->dev); + } +@@ -659,6 +662,28 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) + return; + + dev = xenbus_device_find(root, &bus->bus); ++ /* Backend domain crash results in not coordinated frontend removal, ++ * without going through XenbusStateClosing. Check if the device ++ * wasn't replaced to point at another backend in the meantime. ++ */ ++ if (dev && !strncmp(node, "device/", sizeof("device/")-1)) { ++ int backend_id; ++ int err = xenbus_gather(XBT_NIL, root, ++ "backend-id", "%i", &backend_id, ++ NULL); ++ if (!err && backend_id != dev->otherend_id) { ++ /* It isn't the same device, assume the old one ++ * vanished and new one needs to be probed. ++ */ ++ dev_warn(&dev->dev, ++ "backend-id mismatch (%d != %d), reconnecting\n", ++ backend_id, dev->otherend_id); ++ dev->vanished = true; ++ device_unregister(&dev->dev); ++ put_device(&dev->dev); ++ dev = NULL; ++ } ++ } + if (!dev) + xenbus_probe_node(bus, type, root); + else +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index 7dab04cf4a36c..43a5335f1d5a3 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -87,6 +87,7 @@ struct xenbus_device { + struct completion down; + struct work_struct work; + struct semaphore reclaim_sem; ++ bool vanished; + + /* Event channel based statistics and settings. */ + atomic_t event_channels; +-- +2.51.0 + diff --git a/kernel.spec.in b/kernel.spec.in index e7ada317..f2fa13d6 100644 --- a/kernel.spec.in +++ b/kernel.spec.in @@ -145,6 +145,7 @@ Patch27: 0001-amdgpu-timeout.patch Patch30: 0004-pvops-respect-removable-xenstore-flag-for-block-devi.patch Patch31: 0001-PCI-add-a-reset-quirk-for-Intel-I219LM-ethernet-adap.patch Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch +Patch33: 0001-xen-xenbus-better-handle-backend-crash.patch # S0ix support: Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch